├── CGC ├── ORF1_list.txt ├── ORF2_list.txt ├── make_ORF1_and_intact_table.py ├── make_ORF1_and_intact_table_stranded.py ├── make_l1pa1to4table.py ├── make_l1pa1to4table_stranded.py ├── median_template_and_pairs.py ├── read_or_pair_overlap_bed_and_unmapped.py ├── report_l1_exp_counts.py ├── report_l1_exp_counts_unstranded.py ├── total_orf1_and_orf2.py └── total_orf1_and_orf2_stranded.py ├── Dockerfile ├── L1EM.yml ├── L1EM ├── G_of_R.py ├── G_of_R_single_unstranded.py ├── G_of_R_unstranded.py └── L1EM.py ├── LICENSE.txt ├── README.md ├── annotation ├── L1EM.400.bed └── mm39.L1EM.bed ├── generate_L1EM_fasta_and_index.sh ├── generate_mm39_L1EM_fasta_and_index.sh ├── manual.md ├── parameters.sh ├── run_L1EM.sh ├── run_L1EM_fortcga.sh ├── run_L1EM_mm39.sh ├── run_L1EM_mm39_unstranded.sh ├── run_L1EM_unstranded.sh ├── run_L1EM_unstranded_fromdocker.sh ├── run_L1EM_withlessmemory.sh └── utilities ├── L1EM_readpairs.py ├── filtered_and_normalized_active_l1md.py ├── filtered_and_normalized_active_l1md_unstranded.py ├── filtered_and_normalized_l1hs.py ├── filtered_and_normalized_l1hs_unstranded.py ├── median_template.py ├── read_or_pair_overlap_bed.py ├── report_l1_exp_counts.py ├── report_l1_exp_counts_clip.py ├── report_l1_exp_counts_unstranded.py ├── report_l1hs_transcription.py └── report_l1hs_transcription_unstranded.py /CGC/ORF1_list.txt: -------------------------------------------------------------------------------- 1 | L1HS.1.chrX:141421202-141427246 2 | L1HS.1.chr2:172315270-172321297 3 | L1HS.1.chr17:70458956-70464987 4 | L1HS.1.chr15:82882881-82888919 5 | L1HS.1.chr14:63116706-63122735 6 | L1HS.1.chr13:29641706-29647706 7 | L1HS.1.chr12:126299023-126305038 8 | L1PA3.1.chr12:13391606-13397632 9 | L1HS.1.chr11:95436216-95442246 10 | L1HS.1.chr10:98782941-98788971 11 | L1HS.1.chrX:11935296-11941314 12 | L1HS.1.chr7:111243515-111249546 13 | L1HS.1.chr7:96846650-96852680 14 | L1HS.1.chr7:66286853-66292884 15 | L1HS.1.chr7:49680245-49686300 16 | 
L1HS.1.chr6:24811657-24817706 17 | L1HS.1.chr5:109259387-109265418 18 | L1HS.1.chr5:104518587-104524616 19 | L1HS.1.chr4:136293494-136299546 20 | L1PA2.1.chr4:128213789-128219796 21 | L1HS.1.chr4:79966907-79972933 22 | L1HS.1.chr4:70328906-70334307 23 | L1HS.1.chr4:21159390-21165421 24 | L1HS.1.chr3:89460825-89466856 25 | L1PA2.1.chr3:81051389-81057413 26 | L1HS.1.chr1:237019467-237025494 27 | L1HS.1.chr1:180866811-180872843 28 | L1HS.1.chr1:84052389-84058406 29 | L1HS.1.chr1:104770247-104776278 30 | L1HS.1.chr6:86000000-86005073 31 | L1HS.1.chr22:28663283-28669315 32 | L1HS.1.chr11:78677772-78683802 33 | L1HS.1.chr10:19088601-19094618 34 | L1HS.1.chr9:90149604-90155634 35 | L1PA2.1.chr8:91558668-91564687 36 | L1HS.1.chr3:46783105-46789138 37 | L1HS.1.chr1:174590323-174596379 38 | L1HS.1.chr22:48985761-48991792 39 | L1HS.1.chr17:70544788-70550795 40 | L1HS.1.chrX:155516016-155522048 41 | L1HS.1.chrX:83059584-83065637 42 | L1HS.1.chr9:112798107-112804159 43 | L1HS.1.chr9:94113535-94119565 44 | L1HS.1.chr8:72875538-72881588 45 | L1HS.1.chr5:173402796-173408828 46 | L1HS.1.chr20:12801017-12807044 47 | L1HS.1.chr18:37819737-37825798 48 | L1HS.1.chr16:68583448-68589505 49 | L1HS.1.chr16:33952564-33958612 50 | L1HS.1.chrX:130517377-130523407 51 | L1HS.1.chrX:11707248-11713279 52 | L1HS.1.chr8:134070756-134076773 53 | L1HS.1.chr4:166569976-166576007 54 | L1HS.1.chr4:87347103-87353146 55 | L1HS.1.chr3:130628808-130634065 56 | L1HS.1.chr2:71411474-71417501 57 | L1HS.1.chr1:118852351-118858380 58 | L1HS.1.chr20:55859566-55865521 59 | L1HS.1.chr16:83637252-83643296 60 | L1HS.1.chr15:83450804-83456834 61 | L1HS.1.chr10:5245354-5251383 62 | L1HS.1.chr9:110791097-110797129 63 | L1HS.1.chr8:125582886-125588889 64 | L1HS.1.chr7:141920659-141926712 65 | L1HS.1.chr7:25041860-25047891 66 | L1HS.1.chr5:156061919-156067966 67 | L1HS.1.chr4:90675739-90681757 68 | L1HS.1.chr4:59078847-59084877 69 | L1HS.1.chr3:163236941-163242962 70 | L1HS.1.chr3:22050867-22053197 71 | 
L1HS.1.chr2:148188745-148194773 72 | L1HS.1.chr2:4733729-4739760 73 | L1HS.1.chr15:70729744-70735160 74 | L1HS.1.chr1:121532230-121538261 75 | L1HS.1.chr12:73283667-73289668 76 | L1HS.1.chr6:51874783-51880802 77 | L1HS.1.chr2:112503812-112509845 78 | L1HS.1.chr13:108510472-108516495 79 | L1HS.1.chr11:93136638-93142673 80 | L1HS.1.chr11:24327951-24334001 81 | L1HS.1.chr6:133020691-133026746 82 | L1HS.1.chr4:98592435-98598463 83 | L1HS.1.chr4:23614771-23620793 84 | L1HS.1.chr3:159095379-159101394 85 | L1HS.1.chr16:9584490-9590522 86 | L1HS.1.chr10:33510845-33516876 87 | L1HS.1.chrX:106469285-106475319 88 | L1HS.1.chr4:79704552-79710581 89 | L1HS.1.chr3:158019676-158025704 90 | L1PA2.1.chr3:63211708-63217714 91 | L1HS.1.chr2:166988454-166994509 92 | L1HS.1.chr13:31302314-31308370 93 | L1HS.1.chr12:74874868-74880901 94 | L1HS.1.chr7:30439242-30445274 95 | L1HS.1.chr6:72988654-72994686 96 | L1HS.1.chr4:166755895-166761908 97 | L1HS.1.chr4:79937715-79943746 98 | L1HS.1.chr2:102566355-102572385 99 | L1PA2.1.chr18:59403939-59409970 100 | L1HS.1.chr12:3500000-3505228 101 | L1HS.1.chr11:93420986-93427031 102 | L1HS.1.chr11:90400067-90406098 103 | L1HS.1.chr11:36551606-36557636 104 | L1HS.1.chr8:128453002-128459020 105 | L1HS.1.chr5:166966760-166972815 106 | L1HS.1.chr5:146609485-146615534 107 | L1HS.1.chr3:109199872-109205903 108 | L1PA3.1.chrX:64252345-64258375 109 | L1HS.1.chr6:2417774-2423803 110 | L1HS.1.chr5:102189483-102194435 111 | L1PA2.1.chr5:39787652-39793671 112 | L1HS.1.chr4:169515501-169521532 113 | L1HS.1.chr4:78105735-78111765 114 | L1PA2.1.chr4:55619153-55625181 115 | L1HS.1.chr3:136479056-136485103 116 | L1HS.1.chr3:116359999-116366026 117 | L1PA2.1.chr2:106130892-106136925 118 | L1PA2.1.chr1:71888203-71894235 119 | L1HS.1.chr15:87509891-87515920 120 | L1HS.1.chr10:109812437-109818457 121 | L1HS.1.chr10:105775520-105781551 122 | L1PA2.1.chrX:42888370-42894396 123 | L1HS.1.chr7:93787624-93793679 124 | L1HS.1.chr5:13416497-13422525 125 | 
L1HS.1.chr2:11000000-11002136 126 | L1PA2.1.chr10:78088450-78094479 127 | L1HS.1.chrX:73380991-73387013 128 | L1HS.1.chr7:97613656-97619688 129 | L1HS.1.chr5:79778884-79784938 130 | L1HS.1.chr4:61939927-61945962 131 | L1PA2.1.chr2:43660471-43666500 132 | L1PA2.1.chr1:93790652-93796681 133 | L1HS.1.chr1:68736693-68740136 134 | L1HS.1.chr14:30684809-30690837 135 | L1HS.1.chr12:54788573-54794627 136 | L1HS.1.chrX:26314417-26320446 137 | L1HS.1.chr6:112703745-112709778 138 | L1HS.1.chr6:70010347-70016552 139 | L1PA2.1.chr6:44870634-44876665 140 | L1HS.1.chr5:119684785-119690814 141 | L1HS.1.chr5:32824614-32827992 142 | L1HS.1.chr2:193212420-193218448 143 | L1HS.1.chr20:7116194-7122199 144 | L1PA2.1.chr11:116570827-116576273 145 | L1HS.1.chr11:82155865-82161891 146 | L1HS.1.chrY:5606144-5612199 147 | L1HS.1.chr4:78347980-78354013 148 | L1HS.1.chr4:15841546-15847572 149 | L1PA2.1.chr3:43064774-43070790 150 | L1HS.1.chr1:67078891-67084915 151 | L1HS.1.chr18:5684668-5687891 152 | L1HS.1.chr16:18821266-18827058 153 | L1HS.1.chr14:79308933-79314061 154 | L1HS.1.chr11:109177494-109183526 155 | L1HS.1.chr9:95697585-95703604 156 | L1HS.1.chr7:113776122-113782152 157 | L1HS.1.chr5:15906515-15912550 158 | L1HS.1.chr4:19077911-19083929 159 | L1HS.1.chr3:90169567-90175598 160 | L1HS.1.chr20:23426108-23432140 161 | L1HS.1.chr16:54042096-54048145 162 | L1HS.1.chrY:4948913-4954938 163 | L1HS.1.chrX:66180696-66186728 164 | L1HS.1.chrX:54118685-54124744 165 | L1HS.1.chr5:152886441-152892473 166 | L1HS.1.chr5:102131356-102137385 167 | L1HS.1.chr4:74717539-74723587 168 | L1HS.1.chr2:169248623-169254656 169 | L1HS.1.chr1:218009227-218015252 170 | L1PA2.1.chr1:176256085-176262110 171 | L1HS.1.chr1:34566055-34572105 172 | L1PA2.1.chr11:87047304-87053192 173 | L1HS.1.chrX:64013267-64019286 174 | L1HS.1.chr5:58384174-58390206 175 | L1HS.1.chr5:34147845-34154031 176 | L1HS.1.chr4:52538471-52544498 177 | L1HS.1.chr1:80939203-80945257 178 | L1HS.1.chr18:70746549-70752581 179 | 
L1PA2.1.chr15:71174139-71180152 180 | L1HS.1.chr7:110707004-110713024 181 | L1HS.1.chr6:117102131-117108163 182 | L1HS.1.chr4:91978211-91984413 183 | L1HS.1.chr1:197707714-197713746 184 | L1PA2.1.chr15:58125731-58131761 185 | L1HS.1.chrX:119435468-119441493 186 | L1HS.1.chr5:160709608-160715639 187 | L1HS.1.chr4:119948726-119954758 188 | L1HS.1.chr4:14755114-14761144 189 | L1HS.1.chr3:77763677-77769678 190 | L1HS.1.chr2:175481951-175487994 191 | L1HS.1.chr2:16593725-16599758 192 | L1HS.1.chr12:69773410-69779441 193 | L1PA2.1.chr9:120055235-120061264 194 | L1PA2.1.chr8:97295603-97301657 195 | L1PA2.1.chr8:58914690-58920717 196 | L1HS.1.chr7:63148831-63154859 197 | L1HS.1.chr7:61837998-61844054 198 | L1HS.1.chr4:111894801-111900831 199 | L1HS.1.chr3:103556537-103562569 200 | L1HS.1.chr3:79129777-79133955 201 | L1HS.1.chr3:26398017-26404045 202 | L1PA2.1.chr3:12028021-12033291 203 | L1HS.1.chr2:213567231-213573262 204 | L1HS.1.chr1:196219370-196225402 205 | L1HS.1.chr15:54926081-54932099 206 | L1HS.1.chr11:99602687-99608113 207 | L1HS.1.chr11:31315654-31321680 208 | L1HS.1.chr7:70197328-70203357 209 | L1PA2.1.chr3:137633714-137639732 210 | L1HS.1.chr2:196905587-196911636 211 | L1HS.1.chr2:86655238-86661268 212 | L1HS.1.chr1:187343764-187349794 213 | L1HS.1.chr1:71513698-71519742 214 | L1PA2.1.chr16:61801455-61807489 215 | L1PA2.1.chr11:14715908-14721938 216 | L1HS.1.chrX:83542396-83548420 217 | L1HS.1.chr1:193717837-193723892 218 | L1HS.1.chr1:113497220-113500000 219 | L1HS.1.chr1:86679080-86685111 220 | L1HS.1.chr16:16840517-16846556 221 | L1HS.1.chr5:133583288-133589299 222 | L1PA2.1.chr5:65164017-65170048 223 | L1HS.1.chr1:209913771-209919823 224 | L1PA2.1.chr12:112621197-112627228 225 | L1PA2.1.chr12:92313998-92320023 226 | L1HS.1.chr12:38799646-38805673 227 | L1PA2.1.chrY:17060920-17066963 228 | L1HS.1.chrX:96057824-96063842 229 | L1HS.1.chrX:50019456-50025505 230 | L1HS.1.chr5:152076868-152082891 231 | L1HS.1.chr3:108749400-108755425 232 | 
L1HS.1.chr18:47660373-47666427 233 | L1PA2.1.chr13:39000817-39006875 234 | L1HS.1.chr12:51562631-51568657 235 | L1PA2.1.chr8:93405812-93411825 236 | L1HS.1.chr5:166141191-166145692 237 | L1HS.1.chr5:153070982-153077008 238 | L1HS.1.chr5:81616090-81622140 239 | L1HS.1.chr4:93638307-93644337 240 | L1HS.1.chr2:153007766-153013796 241 | L1PA2.1.chrX:98687494-98693514 242 | L1HS.1.chr5:177772245-177778274 243 | L1PA2.1.chr3:158634523-158640540 244 | L1HS.1.chrX:23238516-23244575 245 | L1PA7.1.chr9:113437560-113443590 246 | L1HS.1.chr9:83049539-83055571 247 | L1PA2.1.chr8:40432212-40438240 248 | L1HS.1.chr6:156324980-156331010 249 | L1HS.1.chr4:93608283-93614338 250 | L1HS.1.chr4:57562316-57568347 251 | L1HS.1.chr18:50343959-50349987 252 | L1HS.1.chr18:535701-541755 253 | L1PA2.1.chr11:94232524-94238528 254 | L1HS.1.chrY:9941130-9947151 255 | L1PA2.1.chr6:99823597-99829594 256 | L1PA2.1.chr4:143100259-143106289 257 | L1HS.1.chr4:106571057-106577070 258 | L1PA2.1.chr21:35493766-35499791 259 | L1HS.1.chr14:70547290-70553322 260 | L1PA2.1.chr9:101102144-101108174 261 | L1HS.1.chr8:135875862-135881890 262 | L1PA2.1.chr8:68362478-68367911 263 | L1PA2.1.chr3:26384735-26390767 264 | L1HS.1.chr1:247687173-247693204 265 | L1PA2.1.chr1:174377791-174383815 266 | L1HS.1.chr12:44108220-44114234 267 | L1PA2.1.chr10:117832895-117838887 268 | L1HS.1.chr9:28111895-28117865 269 | L1PA2.1.chr3:53365276-53371325 270 | L1PA2.1.chr2:222149601-222155632 271 | L1PA2.1.chr2:165485934-165491963 272 | L1PA2.1.chr11:60532161-60538190 273 | L1HS.1.chrX:56695884-56701916 274 | L1HS.1.chr8:136438074-136444105 275 | L1PA2.1.chr5:152340020-152346052 276 | L1PA2.1.chr4:27375687-27381719 277 | L1PA2.1.chr1:242045561-242051585 278 | L1PA2.1.chr1:192500584-192506612 279 | L1PA2.1.chr1:78845456-78851474 280 | L1HS.1.chr17:9615985-9622015 281 | L1HS.1.chrX:142477849-142483853 282 | L1PA2.1.chrX:50060143-50066175 283 | L1PA3.1.chr8:61115375-61121394 284 | L1PA2.1.chr8:10932425-10938427 285 | 
L1HS.1.chr3:3963076-3969110 286 | L1PA2.1.chr2:158522617-158528649 287 | L1PA2.1.chr15:86528094-86534125 288 | L1PA2.1.chr6:104489393-104495424 289 | L1PA2.1.chr2:124139775-124145807 290 | L1PA2.1.chr2:72063975-72069997 291 | L1HS.1.chr18:62906292-62912314 292 | L1HS.1.chr11:125536609-125542640 293 | L1PA2.1.chr10:20751667-20757692 294 | L1PA2.1.chr9:19536200-19542230 295 | L1HS.1.chr8:91522091-91528121 296 | L1PA2.1.chr7:23035734-23039855 297 | L1PA2.1.chr18:8057452-8063463 298 | L1PA2.1.chr8:110952164-110957638 299 | L1PA2.1.chr4:75126805-75132838 300 | L1PA2.1.chr20:8595101-8601127 301 | L1PA2.1.chr4:119274113-119280127 302 | L1PA2.1.chr2:157368535-157374566 303 | L1PA2.1.chr20:42206269-42212317 304 | L1PA2.1.chr18:57719248-57725264 305 | L1PA2.1.chr18:24619042-24625072 306 | L1PA2.1.chr17:32887137-32893184 307 | L1PA2.1.chr14:26629268-26635299 308 | L1PA2.1.chrX:34249185-34253913 309 | L1PA2.1.chr8:98614445-98620471 310 | L1HS.1.chr7:141062014-141068042 311 | L1HS.1.chr5:111302238-111308262 312 | L1PA2.1.chr5:93213145-93219176 313 | L1PA2.1.chr1:40365613-40370869 314 | L1PA2.1.chr13:40734919-40740945 315 | L1PA2.1.chr13:82045349-82051380 316 | L1PA2.1.chr8:98260275-98266293 317 | L1HS.1.chr1:187597671-187603699 318 | L1HS.1.chr15:81995166-82000000 319 | L1HS.1.chr14:51794601-51800632 320 | L1HS.1.chr10:108310130-108316139 321 | L1HS.1.chr8:104739851-104745873 322 | L1PA2.1.chr6:69515143-69521169 323 | L1PA2.1.chr3:119001470-119007490 324 | L1PA2.1.chr18:39565592-39571600 325 | L1PA2.1.chr11:49775119-49781151 326 | L1PA2.1.chr4:100000524-100006553 327 | L1PA2.1.chr3:103220448-103226476 328 | L1PA2.1.chr2:219931818-219937838 329 | L1PA2.1.chr2:173699375-173705410 330 | L1PA2.1.chr15:44252034-44258049 331 | L1PA2.1.chr12:61941440-61947489 332 | L1PA2.1.chr10:80722509-80728544 333 | L1PA2.1.chrX:36465194-36471217 334 | L1PA2.1.chr9:100527228-100533251 335 | L1PA2.1.chr6:8770471-8776512 336 | L1PA2.1.chr4:160574032-160580085 337 | L1PA2.1.chr1:65558564-65564576 338 
| L1PA2.1.chr14:58032539-58038561 339 | L1PA2.1.chr13:73640527-73646551 340 | L1PA2.1.chrY:13179085-13185115 341 | L1PA2.1.chrX:130958931-130964957 342 | L1PA2.1.chr7:136414180-136420210 343 | L1PA3.1.chr5:93261035-93267065 344 | L1PA2.1.chr2:128858984-128865016 345 | L1PA2.1.chr1:91211587-91216947 346 | L1PA2.1.chr16:48768571-48774603 347 | L1PA2.1.chrX:76005216-76007849 348 | L1PA2.1.chr8:35528045-35534071 349 | L1PA3.1.chr7:37612053-37618072 350 | L1PA3.1.chr2:122046204-122052249 351 | L1PA2.1.chr2:122012673-122018708 352 | L1PA2.1.chr18:69449559-69455072 353 | L1PA2.1.chr18:24710814-24716841 354 | L1PA2.1.chr12:64195587-64201638 355 | L1PA2.1.chr8:135259101-135265130 356 | L1PA2.1.chr7:141032606-141038609 357 | L1PA2.1.chr3:141757129-141763153 358 | L1PA2.1.chr3:111018716-111024745 359 | L1HS.1.chr2:230337069-230342513 360 | L1HS.1.chr14:45477110-45483169 361 | L1HS.1.chr12:55096256-55102283 362 | L1HS.1.chr3:18516080-18520244 363 | L1PA2.1.chr2:174404458-174410482 364 | L1HS.1.chr19:43864494-43867300 365 | L1PA2.1.chrX:117873996-117880023 366 | L1PA2.1.chrX:107364336-107370349 367 | L1PA2.1.chr9:1223881-1229900 368 | L1PA2.1.chr6:9966044-9972049 369 | L1PA2.1.chr14:30363537-30369568 370 | L1PA2.1.chr12:57646479-57652498 371 | L1PA2.1.chrX:111588704-111594727 372 | L1PA3.1.chr2:799542-805567 373 | L1PA2.1.chr17:55501716-55507741 374 | L1PA3.1.chr12:88846991-88853008 375 | L1PA2.1.chr12:23070636-23076652 376 | L1PA2.1.chrX:19142668-19148700 377 | L1HS.1.chr9:20655632-20658802 378 | L1PA2.1.chr7:16216428-16222457 379 | L1HS.1.chr3:82337499-82339442 380 | L1PA7.1.chr3:27815983-27821983 381 | L1PA2.1.chr2:196521488-196527500 382 | L1PA2.1.chr16:60522745-60528760 383 | L1PA2.1.chr11:75748616-75754649 384 | L1PA2.1.chrX:104026871-104032902 385 | L1PA2.1.chr9:22348829-22354859 386 | L1PA2.1.chr8:1607577-1613555 387 | L1PA2.1.chr6:113531117-113537147 388 | L1PA2.1.chr6:39454557-39460534 389 | L1PA2.1.chr2:133910480-133916504 390 | L1PA2.1.chr17:12449903-12455932 391 | 
L1PA2.1.chr8:119159398-119165410 392 | L1PA2.1.chr6:113226487-113232515 393 | L1PA2.1.chr3:65509292-65515316 394 | L1PA2.1.chr18:60029678-60035707 395 | L1PA3.1.chr12:47426904-47432928 396 | L1PA2.1.chr10:106844583-106850610 397 | L1PA2.1.chrX:74788952-74795000 398 | L1PA2.1.chrX:65614680-65620735 399 | L1PA2.1.chr7:15547309-15553333 400 | L1PA2.1.chr3:111556203-111562234 401 | L1PA2.1.chr12:55484008-55490018 402 | L1PA2.1.chr8:74942646-74948678 403 | L1PA3.1.chr6:9810750-9816777 404 | L1PA2.1.chr4:186110393-186116420 405 | L1PA2.1.chr2:211219320-211225344 406 | L1PA2.1.chr2:182025225-182031239 407 | L1PA2.1.chr2:62835478-62841498 408 | L1PA2.1.chr12:55344249-55350274 409 | L1PA2.1.chr11:55685638-55691665 410 | L1PA2.1.chr10:111583927-111589948 411 | L1PA2.1.chr8:25730343-25736354 412 | L1PA2.1.chr2:30904198-30910223 413 | L1PA3.1.chr1:49006012-49012044 414 | L1PA2.1.chr13:40356290-40362321 415 | L1PA2.1.chr11:100475720-100481744 416 | L1PA2.1.chr6:100196461-100202490 417 | L1PA2.1.chr20:53503851-53509874 418 | L1PA2.1.chrX:100550262-100555448 419 | L1PA2.1.chrX:79765605-79771625 420 | L1PA2.1.chrX:47783671-47789697 421 | L1PA3.1.chrX:14103920-14109975 422 | L1PA2.1.chr9:119621304-119627350 423 | L1PA2.1.chr7:34169569-34175587 424 | L1HS.1.chr5:123933969-123935867 425 | L1PA2.1.chr5:78866805-78872828 426 | L1PA2.1.chr21:17376038-17381930 427 | L1PA2.1.chr8:35171472-35177508 428 | L1PA2.1.chr2:195067521-195073543 429 | L1PA2.1.chr2:191761849-191767876 430 | L1PA2.1.chr1:58269013-58275032 431 | L1PA2.1.chr15:83383651-83389616 432 | L1PA2.1.chr8:15555981-15562028 433 | L1PA2.1.chr6:156361254-156367276 434 | L1PA2.1.chr4:11235217-11241254 435 | L1PA2.1.chr8:83447142-83453169 436 | L1PA3.1.chr3:70285056-70291083 437 | L1PA2.1.chr22:32294665-32300684 438 | L1PA2.1.chr18:28416544-28422561 439 | L1PA2.1.chrX:117727933-117733746 440 | L1PA2.1.chr7:122278915-122284945 441 | L1PA2.1.chr7:111445694-111451725 442 | L1PA3.1.chr1:159452953-159458976 443 | 
L1PA2.1.chr13:42424880-42430912 444 | L1PA2.1.chrY:7249559-7255517 445 | L1PA2.1.chr9:70199056-70205081 446 | L1PA2.1.chr8:120644161-120650187 447 | L1PA4.1.chr5:90816413-90822442 448 | L1PA3.1.chr4:167343516-167349561 449 | L1PA3.1.chr4:11143617-11149645 450 | L1PA2.1.chr2:83768302-83774332 451 | L1PA2.1.chr1:115147821-115153959 452 | L1PA3.1.chr1:84228930-84234940 453 | L1HS.1.chr7:12497211-12500000 454 | L1PA3.1.chr6:102752889-102758907 455 | L1PA2.1.chr17:32678257-32684272 456 | L1PA2.1.chr15:93675399-93681428 457 | L1PA2.1.chr12:85312419-85318459 458 | L1PA3.1.chr10:120234299-120240325 459 | L1PA2.1.chr8:84419545-84425573 460 | L1PA3.1.chr6:48737348-48743377 461 | L1PA2.1.chr4:141877346-141883376 462 | L1PA3.1.chr3:40902374-40908432 463 | L1PA2.1.chr12:102827177-102833208 464 | L1PA2.1.chr12:88708857-88714885 465 | L1PA2.1.chr10:84756878-84762878 466 | L1PA2.1.chrX:150417926-150421451 467 | L1PA2.1.chr18:68784834-68790853 468 | L1PA2.1.chr16:36071922-36077950 469 | L1PA4.1.chr8:75278367-75284402 470 | L1PA2.1.chr7:85249854-85255879 471 | L1PA2.1.chr2:209746180-209752208 472 | L1PA3.1.chr2:72015949-72021960 473 | L1PA2.1.chr1:163639993-163646040 474 | L1PA2.1.chr15:81870667-81876699 475 | L1PA2.1.chr14:40326207-40332209 476 | L1PA2.1.chr9:14663995-14670015 477 | L1PA2.1.chr21:33925606-33931606 478 | L1PA3.1.chr11:4170252-4176276 479 | L1PA2.1.chrX:86969631-86975651 480 | L1PA2.1.chr3:122041911-122047938 481 | L1PA2.1.chr3:34413126-34419172 482 | L1PA2.1.chr3:55046104-55052129 483 | L1PA2.1.chr1:223395534-223401557 484 | L1PA2.1.chr15:49778509-49784518 485 | L1HS.1.chrX:92254241-92256469 486 | L1PA3.1.chr8:59168655-59174679 487 | L1PA2.1.chr7:96552195-96558214 488 | L1PA3.1.chr7:22528296-22534331 489 | L1PA2.1.chr6:82389419-82395425 490 | L1PA2.1.chr5:64709034-64715065 491 | L1PA3.1.chr5:26476048-26482063 492 | L1PA2.1.chr19:55822401-55828429 493 | L1PA2.1.chr6:141814814-141820842 494 | L1PA3.1.chr3:97904737-97910773 495 | L1PA3.1.chr11:32941250-32947256 496 | 
L1PA2.1.chr8:101678069-101684093 497 | L1PA3.1.chr8:87223230-87229276 498 | L1PA2.1.chr4:64859153-64865171 499 | L1PA2.1.chr20:53472644-53478653 500 | L1PA2.1.chr10:117079038-117085063 501 | L1PA3.1.chrX:80405241-80411257 502 | L1PA2.1.chr9:28348134-28354162 503 | L1PA3.1.chr6:44754479-44760501 504 | L1PA2.1.chr5:30817969-30823996 505 | L1PA2.1.chr2:158351231-158357242 506 | L1PA2.1.chr1:177633927-177639946 507 | L1PA2.1.chr18:34552378-34558395 508 | L1PA2.1.chr15:20311030-20317051 509 | L1PA3.1.chr10:35477-41492 510 | L1PA2.1.chrX:113337822-113343853 511 | L1PA3.1.chr8:129324372-129329792 512 | L1PA2.1.chr6:86806663-86812682 513 | L1PA2.1.chr6:22174840-22180874 514 | L1PA2.1.chr5:139005423-139011486 515 | L1PA2.1.chr4:75401194-75407226 516 | L1PA2.1.chr18:34331115-34337159 517 | L1PA2.1.chr15:81668513-81674504 518 | L1PA2.1.chr11:26965118-26971134 519 | L1PA2.1.chr8:40647061-40653096 520 | L1PA2.1.chr7:91903682-91909706 521 | L1PA3.1.chr7:34906239-34912252 522 | L1PA3.1.chr6:133142073-133148104 523 | L1PA2.1.chr6:112813021-112819047 524 | L1PA2.1.chr6:91047151-91053161 525 | L1PA2.1.chr5:78452185-78458210 526 | L1PA3.1.chr4:53564637-53570664 527 | L1PA3.1.chr3:164702605-164708638 528 | L1PA2.1.chr3:58816278-58822304 529 | L1PA3.1.chr2:188123561-188129537 530 | L1PA3.1.chr18:35807-41823 531 | L1PA2.1.chr14:57112210-57118214 532 | L1PA2.1.chr8:126313241-126319040 533 | L1PA3.1.chr4:93589053-93595080 534 | L1PA2.1.chr2:153719447-153725466 535 | L1PA2.1.chrX:88207885-88213919 536 | L1PA3.1.chr5:133180711-133186159 537 | L1PA3.1.chr3:176827330-176833078 538 | L1PA2.1.chr17:61110229-61116238 539 | L1PA3.1.chr9:9931213-9937220 540 | L1PA2.1.chr7:113742900-113748900 541 | L1PA2.1.chr3:53921747-53927805 542 | L1PA2.1.chr5:123236611-123242616 543 | L1PA2.1.chr1:30438491-30444125 544 | L1PA2.1.chr8:137558584-137564612 545 | L1PA2.1.chr5:58552269-58558283 546 | L1PA2.1.chr3:195087672-195093677 547 | L1PA2.1.chr16:48015465-48021489 548 | L1PA3.1.chr14:88443481-88448908 549 | 
L1PA3.1.chrX:125608955-125614969 550 | L1PA3.1.chr6:49791502-49797511 551 | L1PA2.1.chr5:43717953-43723974 552 | L1PA2.1.chr16:21042672-21048703 553 | L1PA2.1.chr14:53266406-53271579 554 | L1HS.1.chr13:76612823-76618851 555 | L1PA2.1.chr10:30948036-30953410 556 | L1HS.1.chr7:10168424-10171419 557 | L1PA2.1.chr5:103046752-103052769 558 | L1PA2.1.chr11:42429263-42435286 559 | L1PA3.1.chr9:79177278-79183283 560 | L1PA2.1.chr20:21910853-21916881 561 | L1PA2.1.chr15:100417071-100423096 562 | L1PA2.1.chr9:97580993-97587031 563 | L1PA2.1.chr6:82558403-82564423 564 | L1PA2.1.chr4:167276135-167282161 565 | L1PA2.1.chr11:37240712-37246716 566 | L1PA3.1.chr10:19581231-19587252 567 | L1PA2.1.chr8:120348977-120354404 568 | L1PA3.1.chr4:175019740-175025751 569 | L1PA2.1.chr3:156278973-156284990 570 | L1PA3.1.chr10:31469210-31475223 571 | L1PA2.1.chrX:110816764-110822773 572 | L1PA2.1.chr7:84062409-84068436 573 | L1PA2.1.chr4:104687024-104692901 574 | L1PA2.1.chr2:22984153-22990168 575 | L1PA2.1.chr1:247888322-247894338 576 | L1HS.1.chr1:216485645-216487614 577 | L1PA2.1.chrX:152235997-152242028 578 | L1PA2.1.chrX:87116306-87122337 579 | L1PA3.1.chr20:40619322-40625356 580 | L1PA3.1.chr15:51031621-51037649 581 | L1PA3.1.chr16:86233295-86239310 582 | L1PA3.1.chr10:126676471-126682458 583 | L1PA3.1.chr9:87041882-87047900 584 | L1PA3.1.chr8:86586338-86592366 585 | L1PA2.1.chr8:74866519-74872545 586 | L1PA2.1.chr5:16335410-16341440 587 | L1PA2.1.chr1:185251798-185257805 588 | L1PA2.1.chr5:67736897-67742931 589 | L1PA2.1.chr5:42734714-42740780 590 | L1PA2.1.chr4:104786988-104793014 591 | L1PA3.1.chrX:126104124-126110137 592 | L1PA2.1.chr16:63388077-63394106 593 | L1PA3.1.chr14:43698653-43704668 594 | L1PA2.1.chr9:31294065-31300087 595 | L1PA2.1.chr8:72147447-72153464 596 | L1PA3.1.chr5:79095899-79101910 597 | L1PA3.1.chr3:168764449-168770163 598 | L1PA5.1.chr6:30247968-30253370 599 | L1PA3.1.chr5:152762660-152768698 600 | L1PA3.1.chr8:31851611-31857633 601 | 
L1PA2.1.chr7:93586902-93591767 602 | L1PA3.1.chr6:6185423-6191251 603 | L1PA2.1.chr14:40348058-40354070 604 | L1PA3.1.chr9:4605799-4611811 605 | L1PA3.1.chr2:117649184-117655215 606 | L1PA2.1.chr1:98588661-98592036 607 | L1PA2.1.chr4:14480751-14486827 608 | L1PA2.1.chr7:50898842-50904874 609 | L1PA3.1.chr3:50973137-50979172 610 | L1PA3.1.chr15:87463738-87469749 611 | L1PA2.1.chr4:75270579-75276607 612 | L1PA2.1.chr11:108260626-108266643 613 | L1PA2.1.chr18:7398117-7403452 614 | L1PA3.1.chr6:160679719-160685751 615 | L1PA3.1.chr5:117338253-117344286 616 | L1PA2.1.chr4:132631286-132637304 617 | L1PA4.1.chr11:88986995-88993027 618 | L1PA2.1.chr8:87624830-87630833 619 | L1PA3.1.chr4:138881278-138887439 620 | L1PA3.1.chr3:159392310-159398352 621 | L1PA2.1.chr2:208811621-208817648 622 | L1HS.1.chr10:37995443-38000000 623 | L1PA2.1.chr2:124407870-124413893 624 | L1PA2.1.chr14:43597900-43602088 625 | L1PA3.1.chrX:69676829-69682848 626 | L1PA3.1.chrX:29981730-29987885 627 | L1PA2.1.chr9:32729016-32735047 628 | L1PA3.1.chr3:189017229-189025537 629 | L1PA3.1.chr15:49259578-49265724 630 | L1PA2.1.chrX:63597195-63603223 631 | L1PA3.1.chr9:79494547-79500000 632 | L1PA2.1.chr6:86021488-86027515 633 | L1PA3.1.chr3:22737735-22743754 634 | L1PA2.1.chr15:56311143-56317177 635 | L1P1.1.chr12:21083799-21088613 636 | L1PA3.1.chr9:139648-145712 637 | L1PA3.1.chr3:135025209-135031249 638 | L1PA3.1.chr5:122509616-122515639 639 | L1PA3.1.chr4:184208227-184214253 640 | L1HS.1.chr16:65690011-65696020 641 | L1PA3.1.chr9:25070956-25076968 642 | L1PA3.1.chrX:148123449-148129475 643 | L1PA2.1.chr8:15576322-15582323 644 | L1PA2.1.chr2:40008598-40014619 645 | L1PA3.1.chr12:87969763-87975178 646 | L1PA3.1.chr6:116638696-116644847 647 | L1PA2.1.chr1:68923486-68926929 648 | L1PA3.1.chr14:43262062-43268079 649 | L1PA3.1.chr11:13793800-13799246 650 | L1HS.1.chr17:69000000-69005148 651 | L1PA2.1.chr14:26673315-26679374 652 | L1PA3.1.chrX:91752800-91758828 653 | L1PA3.1.chr7:8726340-8732368 654 | 
L1PA3.1.chr10:98541327-98544428 655 | L1PA2.1.chr8:91869518-91875534 656 | L1PA2.1.chrY:22339612-22345645 657 | L1PA2.1.chrX:6933211-6938641 658 | L1PA3.1.chr9:28989909-28996028 659 | L1PA3.1.chr6:71826715-71832734 660 | L1PA3.1.chrX:130567667-130573661 661 | L1PA2.1.chr9:86606053-86612054 662 | L1PA2.1.chr6:102722105-102725773 663 | L1HS.1.chr3:135002247-135005976 664 | L1PA3.1.chrX:13132317-13138466 665 | L1PA3.1.chr8:120190619-120194194 666 | L1PA3.1.chr1:75677750-75683758 667 | L1PA3.1.chr19:42714368-42720306 668 | L1PA2.1.chr15:49354233-49360252 669 | L1PA3.1.chr12:87098406-87104423 670 | L1PA3.1.chr5:79902294-79908448 671 | L1HS.1.chr4:110690112-110693684 672 | L1PA3.1.chr11:4000000-4005742 673 | L1PA2.1.chr8:131770949-131776926 674 | L1PA3.1.chr7:45377515-45383528 675 | L1PA2.1.chr5:28798790-28804814 676 | L1PA3.1.chr4:117921983-117927986 677 | L1PA2.1.chr8:66949103-66955119 678 | L1PA3.1.chr6:54961429-54967437 679 | L1PA3.1.chr5:148610279-148615691 680 | L1PA3.1.chr7:96579968-96585972 681 | L1P1.1.chr6:68912761-68914683 682 | L1PA2.1.chr3:85289930-85295940 683 | L1PA3.1.chr1:94506993-94512223 684 | L1PA3.1.chr10:35858668-35864695 685 | L1PA3.1.chr6:75899741-75905745 686 | L1PA3.1.chr10:105735717-105741873 687 | L1PA3.1.chr9:65709351-65715424 688 | L1PA3.1.chr2:166702725-166708745 689 | L1PA3.1.chr10:9519878-9526029 690 | L1PA3.1.chr4:147434630-147440182 691 | L1PA3.1.chr4:11557447-11563586 692 | L1PA3.1.chr3:133551337-133557486 693 | L1PA4.1.chr3:29993917-29999918 694 | L1PA4.1.chr4:122939633-122945796 695 | L1PA3.1.chr2:113471481-113477545 696 | L1PA3.1.chr8:107889880-107895907 697 | L1P1.1.chr16:59363201-59365889 698 | L1PA2.1.chr13:95733699-95740837 699 | L1PA3.1.chr7:36552965-36559126 700 | L1PA3.1.chr7:87238750-87244161 701 | L1PA2.1.chr5:85657593-85663641 702 | L1HS.1.chr7:145561496-145564595 703 | L1PA3.1.chr14:93413032-93419170 704 | L1PA3.1.chr4:102697355-102702634 705 | L1PA3.1.chr5:75769521-75774874 706 | L1PA3.1.chr2:137309506-137315534 707 | 
L1PA3.1.chrX:51512321-51518344 708 | L1PA3.1.chrX:32730898-32736929 709 | L1PA3.1.chr6:115009066-115014475 710 | L1PA3.1.chr13:102522020-102528176 711 | L1HS.1.chr9:63913382-63916426 712 | L1PA3.1.chr4:127676608-127682756 713 | L1PA2.1.chr6:141916863-141922880 714 | L1PA3.1.chr3:67147643-67153797 715 | L1PA3.1.chr2:227064885-227070875 716 | L1PA3.1.chr2:154147919-154154081 717 | L1PA2.1.chr4:156284540-156290599 718 | L1PA3.1.chr4:119061901-119068062 719 | L1PA3.1.chr16:47182757-47188783 720 | L1PA3.1.chr4:187050521-187056650 721 | L1PA3.1.chr4:90152884-90159024 722 | L1PA3.1.chr11:113656606-113662757 723 | L1PA3.1.chr10:130316193-130319757 724 | L1PA2.1.chr10:37896970-37902988 725 | L1PA3.1.chr4:167055311-167058094 726 | L1PA3.1.chr9:92900099-92906066 727 | L1PA3.1.chr6:1428614-1434750 728 | L1PA3.1.chr3:809376-815406 729 | L1PA3.1.chr2:240544494-240550522 730 | L1PA3.1.chr20:31332715-31338865 731 | L1PA3.1.chr5:98342992-98349146 732 | L1PA3.1.chrX:56075957-56081346 733 | L1PA3.1.chr2:96378981-96385131 734 | L1PA3.1.chr8:138042576-138048582 735 | L1PA3.1.chrX:42202163-42208183 736 | L1PA3.1.chr8:118548537-118554688 737 | L1PA2.1.chr7:113533338-113538585 738 | L1PA3.1.chr6:94757768-94763915 739 | L1PA3.1.chr3:180448471-180452905 740 | L1HS.1.chr1:103922065-103925398 741 | L1PA2.1.chr7:39007527-39013552 742 | L1PA3.1.chr6:4831172-4836230 743 | L1PA3.1.chr2:172187264-172193414 744 | L1PA3.1.chr1:113633560-113639380 745 | L1PA3.1.chrX:75561270-75567303 746 | L1HS.1.chr6:121162716-121168725 747 | L1P1.1.chr2:31496361-31500000 748 | L1PA3.1.chr13:104340349-104346508 749 | L1PA3.1.chr5:44578944-44584955 750 | L1PA3.1.chr5:15291930-15298100 751 | L1PA3.1.chr4:127860152-127865687 752 | L1PA3.1.chr15:38834949-38841092 753 | L1PA3.1.chrX:116159201-116165340 754 | L1PA3.1.chr8:74594235-74600252 755 | L1PA3.1.chrX:16230649-16236856 756 | L1PA3.1.chr16:65356743-65362738 757 | L1PA3.1.chr7:90634134-90640272 758 | L1PA3.1.chr12:66578267-66584409 759 | 
L1PA4.1.chr6:122537169-122543352 760 | L1PA3.1.chr6:107854715-107860748 761 | L1PA3.1.chr3:23349243-23355385 762 | L1PA3.1.chrX:108277015-108283183 763 | L1PA3.1.chr5:126903810-126909817 764 | L1PA3.1.chr5:91762133-91768161 765 | L1PA3.1.chr3:63733312-63739364 766 | L1PA2.1.chr16:80086062-80091657 767 | L1PA3.1.chr3:171625080-171632749 768 | L1PA4.1.chr6:70782891-70788469 769 | L1PA4.1.chr2:149501242-149507406 770 | L1PA3.1.chr14:47171654-47177809 771 | L1PA3.1.chr12:105776064-105782085 772 | L1PA3.1.chr15:97372487-97377657 773 | L1HS.1.chrX:21330646-21331772 774 | L1PA3.1.chr8:126341585-126347597 775 | L1PA2.1.chr11:60435417-60441441 776 | L1PA2.1.chr8:48793905-48799930 777 | L1PA2.1.chr3:108257680-108263696 778 | L1PA4.1.chr7:111500001-111505980 779 | L1PA2.1.chr21:40022874-40028842 780 | L1PA2.1.chr8:95552232-95558265 781 | L1PA3.1.chrX:51644758-51650770 782 | L1PA2.1.chr20:39368228-39373484 783 | L1PA3.1.chrX:85146837-85153009 784 | L1PA3.1.chr7:23321058-23327074 785 | L1PA3.1.chr5:99339615-99345764 786 | L1HS.1.chr2:131815264-131816385 787 | L1PA2.1.chr9:74424274-74430304 788 | L1PA3.1.chr7:32708015-32714180 789 | L1PA3.1.chr6:23804047-23810075 790 | L1PA4.1.chr4:63948074-63953638 791 | L1PA2.1.chrY:20367992-20374018 792 | L1PA3.1.chr5:120176046-120182173 793 | L1PA3.1.chr10:45009173-45015196 794 | L1PA3.1.chr4:163207006-163212968 795 | L1PA2.1.chr2:236383666-236389689 796 | L1PA2.1.chr6:71138558-71146364 797 | L1PA3.1.chr3:94846027-94852063 798 | L1PA2.1.chrX:114720516-114726531 799 | L1PA3.1.chr9:12333994-12340033 800 | L1PA2.1.chr5:19161336-19167360 801 | L1PA2.1.chr21:40033161-40039180 802 | L1PA2.1.chr4:106935103-106941147 803 | L1PA3.1.chr12:55081365-55087407 804 | L1PA2.1.chr12:58109257-58115306 805 | L1PA3.1.chr14:39521539-39527458 806 | L1PA2.1.chr8:4854406-4860419 807 | L1PA3.1.chr13:60709752-60715784 808 | L1PA2.1.chr7:32682678-32688715 809 | L1PA3.1.chrX:36170297-36176324 810 | L1PA3.1.chr3:18328056-18331287 811 | L1PA3.1.chr12:59751112-59757266 
812 | L1PA3.1.chr3:26239082-26245212 813 | L1PA2.1.chr1:186637331-186643356 814 | L1PA3.1.chr3:61211021-61217154 815 | -------------------------------------------------------------------------------- /CGC/ORF2_list.txt: -------------------------------------------------------------------------------- 1 | L1HS.1.chr20:7116194-7122199 2 | L1HS.1.chr5:152886441-152892473 3 | L1HS.1.chr15:70729744-70735160 4 | L1HS.1.chr8:125582886-125588889 5 | L1HS.1.chr4:136293494-136299546 6 | L1HS.1.chrX:141421202-141427246 7 | L1HS.1.chr15:54926081-54932099 8 | L1HS.1.chr4:74717539-74723587 9 | L1HS.1.chr8:128453002-128459020 10 | L1HS.1.chr2:4733729-4739760 11 | L1HS.1.chr16:16840517-16846556 12 | L1HS.1.chr9:95697585-95703604 13 | L1HS.1.chr7:30439242-30445274 14 | L1HS.1.chr4:138547723-138552054 15 | L1HS.1.chr11:78677772-78683802 16 | L1HS.1.chr8:134070756-134076773 17 | L1HS.1.chr5:109259387-109265418 18 | L1HS.1.chr4:21159390-21165421 19 | L1HS.1.chr6:2417774-2423803 20 | L1HS.1.chrX:11935296-11941314 21 | L1HS.1.chrX:11707248-11713279 22 | L1HS.1.chr16:18821266-18827058 23 | L1HS.1.chr13:29641706-29647706 24 | L1HS.1.chr8:72875538-72881588 25 | L1HS.1.chr12:126299023-126305038 26 | L1HS.1.chr5:104518587-104524616 27 | L1HS.1.chr3:130628808-130634065 28 | L1HS.1.chr10:105377346-105383377 29 | L1HS.1.chr6:129000000-129004416 30 | L1HS.1.chr4:79937715-79943746 31 | L1HS.1.chr22:28663283-28669315 32 | L1HS.1.chr2:16593725-16599758 33 | L1HS.1.chr18:70746549-70752581 34 | L1HS.1.chr16:33952564-33958612 35 | L1HS.1.chr10:109812437-109818457 36 | L1HS.1.chr10:6369617-6375667 37 | L1HS.1.chr6:156034135-156040165 38 | L1HS.1.chr1:84052389-84058406 39 | L1HS.1.chr18:75846851-75852883 40 | L1HS.1.chr11:93420986-93427031 41 | L1HS.1.chr1:71513698-71519742 42 | L1HS.1.chrX:147653734-147659767 43 | L1HS.1.chr1:247687173-247693204 44 | L1HS.1.chr7:113776122-113782152 45 | L1HS.1.chr4:78347980-78354013 46 | L1HS.1.chr11:93136638-93142673 47 | L1HS.1.chr5:177772245-177778274 48 | 
L1HS.1.chr4:90675739-90681757 49 | L1HS.1.chr2:196905587-196911636 50 | L1HS.1.chr16:83637252-83643296 51 | L1HS.1.chr16:9584490-9590522 52 | L1HS.1.chr7:141920659-141926712 53 | L1HS.1.chr3:109199872-109205903 54 | L1HS.1.chr1:174590323-174596379 55 | L1HS.1.chr11:95436216-95442246 56 | L1HS.1.chr11:24327951-24334001 57 | L1HS.1.chr9:90149604-90155634 58 | L1HS.1.chr6:19764892-19770918 59 | L1HS.1.chr7:110707004-110713024 60 | L1HS.1.chr6:83333952-83339981 61 | L1HS.1.chr2:86655238-86661268 62 | L1HS.1.chr7:49680245-49686300 63 | L1HS.1.chr6:133020691-133026746 64 | L1HS.1.chr1:86679080-86685111 65 | L1HS.1.chr10:85355506-85361538 66 | L1HS.1.chr8:27113618-27119645 67 | L1HS.1.chr3:103556537-103562569 68 | L1HS.1.chr6:24811657-24817706 69 | L1PA2.1.chr5:132513964-132519996 70 | L1HS.1.chr5:79778884-79784938 71 | L1HS.1.chr3:120573021-120579186 72 | L1HS.1.chr2:175481951-175487994 73 | L1HS.1.chr1:239623498-239629523 74 | L1HS.1.chr14:70547290-70553322 75 | L1HS.1.chrX:54118685-54124744 76 | L1HS.1.chr13:92685561-92691592 77 | L1HS.1.chr1:237019467-237025494 78 | L1HS.1.chr1:80939203-80945257 79 | L1HS.1.chr5:58384174-58390206 80 | L1HS.1.chr5:173402796-173408828 81 | L1HS.1.chr4:16944926-16949113 82 | L1HS.1.chr4:93638307-93644337 83 | L1HS.1.chr3:77763677-77769678 84 | L1HS.1.chr17:9615985-9622015 85 | L1HS.1.chr6:121162716-121168725 86 | L1HS.1.chr22:48985761-48991792 87 | L1HS.1.chrX:23238516-23244575 88 | L1HS.1.chr2:166988454-166994509 89 | L1HS.1.chrX:81841153-81847184 90 | L1PA2.1.chr11:60532161-60538190 91 | L1HS.1.chr4:111894801-111900831 92 | L1HS.1.chr1:180866811-180872843 93 | L1HS.1.chr17:66596579-66602595 94 | L1HS.1.chr6:117102131-117108163 95 | L1PA2.1.chr5:39787652-39793671 96 | L1HS.1.chr4:59078847-59084877 97 | L1HS.1.chr9:28111895-28117865 98 | L1HS.1.chr7:111963193-111969223 99 | L1HS.1.chr5:146609485-146615534 100 | L1HS.1.chr3:159095379-159101394 101 | L1HS.1.chr2:180833661-180839689 102 | L1HS.1.chr7:111243515-111249546 103 | 
L1HS.1.chr15:87509891-87515920 104 | L1HS.1.chr11:85324758-85330821 105 | L1HS.1.chr10:98782941-98788971 106 | L1HS.1.chr1:187597671-187603699 107 | L1HS.1.chr14:63116706-63122735 108 | L1HS.1.chr1:187343764-187349794 109 | L1HS.1.chr18:13975860-13981891 110 | L1PA2.1.chr1:71888203-71894235 111 | L1HS.1.chr20:11632779-11638837 112 | L1HS.1.chrX:96057824-96063842 113 | L1HS.1.chr4:122652658-122656850 114 | L1HS.1.chr1:195925003-195929320 115 | L1HS.1.chr1:85927067-85933100 116 | L1HS.1.chr18:50343959-50349987 117 | L1HS.1.chr6:72988654-72994686 118 | L1HS.1.chr11:109177494-109183526 119 | L1HS.1.chr8:88685705-88691760 120 | L1HS.1.chr5:111302238-111308262 121 | L1HS.1.chr2:102566355-102572385 122 | L1HS.1.chr5:86510690-86516743 123 | L1HS.1.chr3:132946006-132952034 124 | L1HS.1.chr1:118852351-118858380 125 | L1HS.1.chr10:76586841-76591752 126 | L1HS.1.chrX:151330320-151336351 127 | L1HS.1.chr10:5245354-5251383 128 | L1PA2.1.chr6:115960032-115966060 129 | L1PA2.1.chr12:92313998-92320023 130 | L1HS.1.chrX:155516016-155522048 131 | L1HS.1.chr4:169515501-169521532 132 | L1HS.1.chr7:93787624-93793679 133 | L1HS.1.chr10:19088601-19094618 134 | L1HS.1.chrX:76322775-76328806 135 | L1PA2.1.chrX:28206791-28212789 136 | L1HS.1.chr5:102131356-102137385 137 | L1PA2.1.chr12:90536603-90542635 138 | L1HS.1.chr7:46820756-46825657 139 | L1PA2.1.chr19:37837502-37843533 140 | L1PA2.1.chr10:15915731-15921753 141 | L1HS.1.chr20:12801017-12807044 142 | L1HS.1.chr11:49793154-49797728 143 | L1HS.1.chr18:37819737-37825798 144 | L1HS.1.chrY:5606144-5612199 145 | L1HS.1.chr3:4916534-4922591 146 | L1PA2.1.chr18:59403939-59409970 147 | L1PA2.1.chr15:71174139-71180152 148 | L1HS.1.chrX:142477849-142483853 149 | L1HS.1.chr10:33510845-33516876 150 | L1HS.1.chr11:90400067-90406098 151 | L1HS.1.chr7:63148831-63154859 152 | L1PA2.1.chr5:83316287-83320401 153 | L1HS.1.chr1:209913771-209919823 154 | L1HS.1.chr11:36551606-36557636 155 | L1PA2.1.chr3:187412123-187418152 156 | 
L1HS.1.chr3:136479056-136485103 157 | L1PA2.1.chr3:81051389-81057413 158 | L1PA2.1.chr18:7966442-7972474 159 | L1PA2.1.chr8:91558668-91564687 160 | L1HS.1.chr3:89460825-89466856 161 | L1PA2.1.chr6:44870634-44876665 162 | L1PA2.1.chr5:45658440-45664470 163 | L1HS.1.chr3:54394322-54400323 164 | L1PA2.1.chr6:72570139-72576167 165 | L1HS.1.chr18:72966526-72972556 166 | L1HS.1.chr3:3963076-3969110 167 | L1PA2.1.chr2:128858984-128865016 168 | L1PA2.1.chr3:177388770-177394751 169 | L1PA2.1.chr10:11731436-11737465 170 | L1PA2.1.chr10:39466259-39470575 171 | L1PA2.1.chr9:19536200-19542230 172 | L1PA2.1.chr6:104489393-104495424 173 | L1HS.1.chrX:83059584-83065637 174 | L1HS.1.chr7:70197328-70203357 175 | L1PA2.1.chr2:173699375-173705410 176 | L1HS.1.chrX:64013267-64019286 177 | L1PA2.1.chrX:103891506-103897537 178 | L1PA2.1.chr4:164553492-164559523 179 | L1PA2.1.chr8:63797384-63803439 180 | L1HS.1.chr12:54788573-54794627 181 | L1PA2.1.chr10:106844583-106850610 182 | L1PA2.1.chr15:51173565-51179009 183 | L1PA2.1.chr8:75444000-75448442 184 | L1PA2.1.chr6:104452399-104457460 185 | L1PA3.1.chr3:137454947-137460983 186 | L1HS.1.chr5:122240435-122244924 187 | L1PA2.1.chr4:102204930-102210958 188 | L1HS.1.chr7:7465092-7471120 189 | L1PA2.1.chr3:155119416-155125444 190 | L1PA2.1.chr16:21042672-21048703 191 | L1PA3.1.chr3:187424407-187428816 192 | L1HS.1.chr16:35608475-35614501 193 | L1PA2.1.chr5:139005423-139011486 194 | L1PA2.1.chr15:93675399-93681428 195 | L1PA2.1.chr2:165485934-165491963 196 | L1PA2.1.chr18:24619042-24625072 197 | L1PA3.1.chr6:48363090-48369117 198 | L1PA2.1.chr3:65509292-65515316 199 | L1PA3.1.chr19:29225779-29231807 200 | L1PA2.1.chr8:120348977-120354404 201 | L1PA2.1.chr12:77173381-77179424 202 | L1PA2.1.chr13:100698082-100704117 203 | L1PA2.1.chr12:64195587-64201638 204 | L1PA2.1.chr2:174269465-174274464 205 | L1PA2.1.chr8:72479440-72485463 206 | L1PA2.1.chr4:14009454-14015486 207 | L1PA2.1.chr13:40356290-40362321 208 | L1PA2.1.chr6:156361254-156367276 209 | 
L1PA2.1.chr1:174377791-174383815 210 | L1PA2.1.chr4:145369388-145375369 211 | L1HS.1.chr1:104770247-104776278 212 | L1PA2.1.chr13:42424880-42430912 213 | L1PA2.1.chr14:101266199-101272227 214 | L1PA2.1.chr4:158084240-158090272 215 | L1PA2.1.chr5:21107412-21113430 216 | L1PA2.1.chr3:141757129-141763153 217 | L1PA2.1.chr1:49875162-49881175 218 | L1PA2.1.chr18:22529636-22535670 219 | L1PA2.1.chr1:25506585-25512707 220 | L1PA3.1.chr6:107854715-107860748 221 | L1HS.1.chr13:31302314-31308370 222 | L1PA2.1.chr14:26629268-26635299 223 | L1PA2.1.chrX:127116697-127122729 224 | L1PA2.1.chr5:57471563-57475609 225 | L1PA2.1.chr8:131770949-131776926 226 | L1PA2.1.chr1:178314791-178320818 227 | L1PA2.1.chr16:63388077-63394106 228 | L1HS.1.chr4:79704552-79710581 229 | L1PA2.1.chr3:178859948-178865979 230 | L1PA2.1.chr18:40187639-40193657 231 | L1PA3.1.chr18:27279551-27285578 232 | L1PA2.1.chr2:158351231-158357242 233 | L1PA3.1.chr4:120741413-120747472 234 | L1PA2.1.chr12:57646479-57652498 235 | L1PA2.1.chr7:29579936-29585963 236 | L1PA2.1.chr8:72147447-72153464 237 | L1HS.1.chr11:90966271-90972302 238 | L1PA2.1.chrX:47783671-47789697 239 | L1PA2.1.chrX:18105518-18110908 240 | L1PA2.1.chr4:4953897-4959919 241 | L1PA2.1.chr11:107361810-107367839 242 | L1PA2.1.chr1:75383477-75388208 243 | L1PA2.1.chr12:70013065-70019067 244 | L1PA2.1.chr2:76775758-76781758 245 | L1PA3.1.chr6:105716122-105722275 246 | L1PA2.1.chr18:41631742-41637769 247 | L1PA3.1.chr2:4157808-4163833 248 | L1PA3.1.chr2:57587069-57592429 249 | L1PA2.1.chrX:36465194-36471217 250 | L1PA2.1.chr2:192268435-192274450 251 | L1PA3.1.chrX:68736250-68742398 252 | L1PA3.1.chrX:137672117-137678261 253 | L1PA3.1.chr2:48739839-48745890 254 | L1PA2.1.chr2:195067521-195073543 255 | L1PA3.1.chr7:37612053-37618072 256 | L1PA3.1.chr5:3439025-3445063 257 | L1PA2.1.chr3:116203398-116209426 258 | L1PA3.1.chrX:125510365-125515187 259 | L1PA3.1.chr4:97633479-97639503 260 | L1PA2.1.chr20:53472644-53478653 261 | L1PA2.1.chr7:16216428-16222457 
262 | L1PA2.1.chr1:177633927-177639946 263 | L1HS.1.chrX:56695884-56701916 264 | L1PA2.1.chr18:35205779-35211809 265 | L1PA2.1.chr15:56311143-56317177 266 | L1PA2.1.chr20:24900605-24906618 267 | L1PA3.1.chr8:2413733-2419762 268 | L1PA3.1.chr8:2301964-2307993 269 | L1PA2.1.chr3:158634523-158640540 270 | L1HS.1.chr1:67078891-67084915 271 | L1PA2.1.chr2:124593139-124599168 272 | L1PA2.1.chr7:43059096-43065116 273 | L1PA3.1.chr5:35568225-35574246 274 | L1PA2.1.chr17:3176530-3182557 275 | L1PA3.1.chr4:65052166-65058198 276 | L1PA3.1.chr2:228229041-228234515 277 | L1PA2.1.chr20:18601523-18606939 278 | L1PA3.1.chr9:133310021-133316045 279 | L1PA2.1.chr2:151698868-151704889 280 | L1PA2.1.chr6:141566105-141572136 281 | L1HS.1.chr1:56365452-56369282 282 | L1HS.1.chr14:30684809-30690837 283 | L1PA2.1.chr16:61801455-61807489 284 | L1PA2.1.chr22:16021017-16027044 285 | L1PA3.1.chr2:188123561-188129537 286 | L1HS.1.chr4:15841546-15847572 287 | L1PA3.1.chr11:89744187-89750239 288 | L1HS.1.chr4:107206672-107210557 289 | L1PA2.1.chr8:58914690-58920717 290 | L1HS.1.chr1:237075264-237081293 291 | L1PA3.1.chr3:135025209-135031249 292 | L1PA2.1.chr5:75642235-75648286 293 | L1PA2.1.chr19:55822401-55828429 294 | L1PA2.1.chr6:103709031-103715056 295 | L1PA2.1.chr10:7137522-7142956 296 | L1PA2.1.chr12:106471865-106477891 297 | L1HS.1.chr20:55859566-55865521 298 | L1PA2.1.chr9:14663995-14670015 299 | L1HS.1.chr5:152076868-152082891 300 | L1PA2.1.chr14:55988182-55993244 301 | L1PA2.1.chr10:18030651-18036675 302 | L1PA2.1.chr2:204991072-204997106 303 | L1PA2.1.chr1:174233266-174239293 304 | L1PA2.1.chr13:82045349-82051380 305 | L1PA2.1.chr15:81797930-81803963 306 | L1PA3.1.chr7:14313260-14319290 307 | L1HS.1.chr18:62906292-62912314 308 | L1PA2.1.chr6:162989737-162995762 309 | L1PA2.1.chr9:1223881-1229900 310 | L1PA2.1.chrX:5480456-5486466 311 | L1PA2.1.chrX:98424325-98430357 312 | L1HS.1.chr2:193212420-193218448 313 | L1PA3.1.chr13:105383251-105388345 314 | L1PA2.1.chr12:80244169-80250184 315 
| L1PA2.1.chr1:91211587-91216947 316 | L1PA2.1.chr4:64859153-64865171 317 | L1PA2.1.chr9:21536697-21541948 318 | L1PA3.1.chrX:64252345-64258375 319 | L1PA3.1.chr11:127868667-127872497 320 | L1PA2.1.chr1:82250044-82256069 321 | L1PA2.1.chr3:111556203-111562234 322 | L1PA3.1.chr4:53564637-53570664 323 | L1PA3.1.chr6:136000726-136006368 324 | L1HS.1.chrX:127362223-127368248 325 | L1PA2.1.chr4:65288237-65294261 326 | L1PA3.1.chr10:126881867-126887893 327 | L1PA3.1.chr6:133142073-133148104 328 | L1PA3.1.chr15:97372487-97377657 329 | L1PA3.1.chr11:79552653-79558680 330 | L1PA3.1.chr10:60692960-60698897 331 | L1PA2.1.chr5:51250746-51256770 332 | L1PA2.1.chr11:40585472-40591501 333 | L1PA3.1.chr4:157174892-157180916 334 | L1PA4.1.chr16:72580798-72586932 335 | L1PA2.1.chr7:86340208-86346233 336 | L1HS.1.chr4:135178140-135183747 337 | L1PA3.1.chr8:118548537-118554688 338 | L1PA3.1.chrX:124582570-124588702 339 | L1PA2.1.chr2:137393160-137399190 340 | L1PA4.1.chr10:20291416-20297572 341 | -------------------------------------------------------------------------------- /CGC/make_ORF1_and_intact_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | allowed_runthrough_fraction = float(sys.argv[5]) 13 | 14 | output_orf1_name = sys.argv[6] 15 | output_intact_name = sys.argv[7] 16 | 17 | orf1_intact = set() 18 | for line in open(orf1_list): 19 | orf1_intact.add(line.strip()) 20 | orf2_intact = set() 21 | for line in open(orf2_list): 22 | orf2_intact.add(line.strip()) 23 | 24 | exp_probs = dict() 25 | seqs = set([]) 26 | 27 | for line in open(exp_prob_pkls_list): 28 | names_file, X_file = line.strip().split('\t') 29 | name = names_file.split('/')[-1][:-16] 30 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 31 | seqs = seqs | set(exp_probs[name].keys()) 32 | 33 | l1pa_pairs = dict() 34 | mapped_pairs = dict() 35 | 36 | for line in open(bam_info_list): 37 | name = line.strip().split('/')[-1][:-4] 38 | baminfo = open(line.strip()).readlines() 39 | mapped_pairs[name] = int(baminfo[1]) 40 | l1pa_pairs[name] = int(baminfo[2]) 41 | 42 | output_orf1 = open(output_orf1_name,'w') 43 | output_intact = open(output_intact_name,'w') 44 | 45 | print_string = "locus" 46 | for name in exp_probs: 47 | print_string += "\t"+name 48 | 49 | output_orf1.write (print_string+'\n') 50 | output_intact.write (print_string+'\n') 51 | 52 | completed = set() 53 | 54 | for name in seqs: 55 | seq_name = '_'.join(name.split('_')[:-1]) 56 | if seq_name in completed: 57 | continue 58 | else: 59 | completed.add(seq_name) 60 | print_string = seq_name.split('(')[0] 61 | only_name = seq_name+'_only' 62 | runon_name = seq_name+'_3prunon' 63 | runthrough_name = seq_name+'_runthrough' 64 | for name in exp_probs: 65 | FPM = 0.0 66 | runthrough_FPM = 0.0 67 | if only_name in exp_probs[name]: 68 | FPM += exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 69 | if runon_name in 
exp_probs[name]: 70 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 71 | if runthrough_name in exp_probs[name]: 72 | runthrough_FPM += exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 73 | if FPM>0 and FPM/(FPM+runthrough_FPM) > allowed_runthrough_fraction: 74 | print_string += '\t'+str(FPM) 75 | else: 76 | print_string += '\t0.0' 77 | if seq_name.split('(')[0][:-2] in orf1_intact: 78 | output_orf1.write(print_string+'\n') 79 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 80 | output_intact.write(print_string+'\n') 81 | 82 | output_orf1.close() 83 | output_intact.close() 84 | -------------------------------------------------------------------------------- /CGC/make_ORF1_and_intact_table_stranded.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | allowed_runthrough_fraction = float(sys.argv[5]) 13 | 14 | output_orf1_name = sys.argv[6] 15 | output_intact_name = sys.argv[7] 16 | 17 | orf1_intact = set() 18 | for line in open(orf1_list): 19 | orf1_intact.add(line.strip()) 20 | orf2_intact = set() 21 | for line in open(orf2_list): 22 | orf2_intact.add(line.strip()) 23 | 24 | exp_probs = dict() 25 | seqs = set([]) 26 | 27 | for line in open(exp_prob_pkls_list): 28 | names_file, X_file = line.strip().split('\t') 29 | name = names_file.split('/')[-1][:-16] 30 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 31 | seqs = seqs | set(exp_probs[name].keys()) 32 | 33 | l1pa_pairs = dict() 34 | mapped_pairs = dict() 35 | 36 | for line in open(bam_info_list): 37 | name = line.strip().split('/')[-1][:-4] 38 | baminfo = open(line.strip()).readlines() 39 | mapped_pairs[name] = int(baminfo[1]) 40 | l1pa_pairs[name] = int(baminfo[2]) 41 | 42 | output_orf1 = open(output_orf1_name,'w') 43 | output_intact = open(output_intact_name,'w') 44 | 45 | print_string = "locus" 46 | for name in exp_probs: 47 | print_string += "\t"+name 48 | 49 | output_orf1.write (print_string+'\n') 50 | output_intact.write (print_string+'\n') 51 | 52 | completed = set() 53 | 54 | for name in seqs: 55 | seq_name = '_'.join(name.split('_')[:-1]) 56 | if seq_name in completed: 57 | continue 58 | else: 59 | completed.add(seq_name) 60 | print_string = seq_name.split('(')[0] 61 | only_name = seq_name+'_only' 62 | runon_name = seq_name+'_3prunon' 63 | senserunthrough_name = seq_name+'_senserunthrough' 64 | antisenserunthrough_name = seq_name+'_antisenserunthrough' 65 | for name in exp_probs: 66 | FPM = 0.0 67 | runthrough_FPM = 0.0 68 | if only_name in exp_probs[name]: 69 | FPM += 
exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 70 | if runon_name in exp_probs[name]: 71 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 72 | if senserunthrough_name in exp_probs[name]: 73 | runthrough_FPM += exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 74 | if FPM>0 and FPM/(FPM+runthrough_FPM) > allowed_runthrough_fraction: 75 | print_string += '\t'+str(FPM) 76 | else: 77 | print_string += '\t0.0' 78 | if seq_name.split('(')[0][:-2] in orf1_intact: 79 | output_orf1.write(print_string+'\n') 80 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 81 | output_intact.write(print_string+'\n') 82 | 83 | output_orf1.close() 84 | output_intact.close() 85 | -------------------------------------------------------------------------------- /CGC/make_l1pa1to4table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | 11 | exp_probs = dict() 12 | seqs = set([]) 13 | 14 | for line in open(exp_prob_pkls_list): 15 | names_file, X_file = line.strip().split('\t') 16 | name = names_file.split('/')[-1][:-16] 17 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 18 | seqs = seqs | set(exp_probs[name].keys()) 19 | 20 | l1pa_pairs = dict() 21 | mapped_pairs = dict() 22 | 23 | for line in open(bam_info_list): 24 | name = line.strip().split('/')[-1][:-4] 25 | baminfo = open(line.strip()).readlines() 26 | mapped_pairs[name] = int(baminfo[1]) 27 | l1pa_pairs[name] = int(baminfo[2]) 28 | 29 | print_string = "locus" 30 | for name in exp_probs: 31 | print_string += "\t"+name+'-active'+"\t"+name+'-passive' 32 | 33 | print(print_string) 34 | 35 | completed = set() 36 | 37 | for name in seqs: 38 | if name.split('.')[0] not in ['L1HS','L1PA2','L1PA3','L1PA4']: 39 | continue 40 | seq_name = '_'.join(name.split('_')[:-1]) 41 | if seq_name in completed: 42 | continue 43 | else: 44 | completed.add(seq_name) 45 | print_string = seq_name.split('(')[0] 46 | only_name = seq_name+'_only' 47 | runon_name = seq_name+'_3prunon' 48 | runthrough_name = seq_name+'_runthrough' 49 | for name in exp_probs: 50 | FPM = 0.0 51 | runthrough_FPM = 0.0 52 | if only_name in exp_probs[name]: 53 | FPM += exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 54 | if runon_name in exp_probs[name]: 55 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 56 | if runthrough_name in exp_probs[name]: 57 | runthrough_FPM += exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 58 | print_string += '\t'+str(FPM)+'\t'+str(runthrough_FPM) 59 | print(print_string) 60 | -------------------------------------------------------------------------------- 
/CGC/make_l1pa1to4table_stranded.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | allowed_rt_fraction = float(sys.argv[3]) 11 | 12 | exp_probs = dict() 13 | seqs = set([]) 14 | 15 | for line in open(exp_prob_pkls_list): 16 | names_file, X_file = line.strip().split('\t') 17 | name = names_file.split('/')[-1][:-16] 18 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 19 | seqs = seqs | set(exp_probs[name].keys()) 20 | 21 | l1pa_pairs = dict() 22 | mapped_pairs = dict() 23 | 24 | for line in open(bam_info_list): 25 | name = line.strip().split('/')[-1][:-4] 26 | baminfo = open(line.strip()).readlines() 27 | mapped_pairs[name] = int(baminfo[1]) 28 | l1pa_pairs[name] = int(baminfo[2]) 29 | 30 | print_string = "locus" 31 | for name in exp_probs: 32 | print_string += "\t"+name 33 | 34 | print(print_string) 35 | 36 | completed = set() 37 | 38 | for name in seqs: 39 | if name.split('.')[0] not in ['L1HS','L1PA2','L1PA3','L1PA4']: 40 | continue 41 | seq_name = '_'.join(name.split('_')[:-1]) 42 | if seq_name in completed: 43 | continue 44 | else: 45 | completed.add(seq_name) 46 | print_string = seq_name.split('(')[0] 47 | only_name = seq_name+'_only' 48 | runon_name = seq_name+'_3prunon' 49 | runthrough_name = seq_name+'_senserunthrough' 50 | for name in exp_probs: 51 | FPM = 0.0 52 | runthrough_FPM = 0.0 53 | if only_name in exp_probs[name]: 54 | FPM += exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 55 | if runon_name in exp_probs[name]: 56 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 57 | if runthrough_name in exp_probs[name]: 58 | runthrough_FPM += 
exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 59 | if runthrough_FPM < allowed_rt_fraction*FPM: 60 | print_string += '\t'+str(FPM) 61 | else: 62 | print_string += '\t0.0' 63 | print(print_string) 64 | -------------------------------------------------------------------------------- /CGC/median_template_and_pairs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pysam 3 | import random 4 | import numpy 5 | 6 | """ 7 | Estimate median template length of a bam file. 8 | 9 | Part of the L1-EM package. 10 | 11 | Copyright (C) 2019 Wilson McKerrow 12 | 13 | This program is free software: you can redistribute it and/or modify 14 | it under the terms of the GNU General Public License as published by 15 | the Free Software Foundation, either version 3 of the License, or 16 | (at your option) any later version. 17 | 18 | This program is distributed in the hope that it will be useful, 19 | but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 | GNU General Public License for more details. 22 | 23 | You should have received a copy of the GNU General Public License 24 | along with this program. If not, see . 25 | 26 | """ 27 | 28 | bamfile = sys.argv[1] 29 | fraction = float(sys.argv[2]) 30 | 31 | tlens = list() 32 | n_proper_reads = 0 33 | 34 | for read in pysam.AlignmentFile(bamfile): 35 | if read.is_proper_pair: 36 | n_proper_reads += 1 37 | if random.random() < fraction: 38 | tlens.append(read.template_length) 39 | 40 | print(numpy.median(numpy.abs(tlens))) 41 | print(n_proper_reads/2) 42 | -------------------------------------------------------------------------------- /CGC/read_or_pair_overlap_bed_and_unmapped.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | 4 | """ 5 | Extract reads or pairs of reads that overlap a bed file. 
6 | 7 | Part of the L1-EM package. 8 | 9 | Copyright (C) 2019 Wilson McKerrow 10 | 11 | This program is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with this program. If not, see . 23 | 24 | """ 25 | 26 | def main(): 27 | bedfile = sys.argv[1] 28 | bamfile = sys.argv[2] 29 | outbamfile = sys.argv[3] 30 | outunmappedbamfile = sys.argv[4] 31 | if len(sys.argv) > 5: 32 | flanking = int(sys.argv[5]) 33 | else: 34 | flanking = 400 35 | if len(sys.argv) > 6: 36 | maxNM = int(sys.argv[6]) 37 | else: 38 | maxNM = 4 39 | 40 | inbam = pysam.AlignmentFile(bamfile,'rb') 41 | outbam = pysam.AlignmentFile(outbamfile,'wb',template=inbam) 42 | outunmappedbam = pysam.AlignmentFile(outunmappedbamfile,'wb',template=inbam) 43 | 44 | read_ids = set() 45 | for line in open(bedfile): 46 | chrom,start,stop = line.strip().split('\t')[:3] 47 | start = int(start)+flanking 48 | stop = int(stop)-flanking 49 | if chrom in inbam.references: 50 | for read in inbam.fetch(chrom,start,stop): 51 | if not read.is_unmapped: 52 | if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and (not read.has_tag('NM') or read.get_tag('NM')<=maxNM): 53 | read_ids.add(read.query_name) 54 | # if chrom[3:] in inbam.references: 55 | # for read in inbam.fetch(chrom[3:],start,stop): 56 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and 
read.get_tag('NM')<=3: 57 | # read_ids.add(read.query_name) 58 | # if '_' in chrom and chrom.split('_')[1].upper()+'.1' in inbam.references: 59 | # for read in inbam.fetch(chrom.split('_')[1].upper()+'.1',start,stop): 60 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and read.get_tag('NM')<=3: 61 | # read_ids.add(read.query_name) 62 | 63 | inbam.close() 64 | inbam = pysam.AlignmentFile(bamfile,'rb') 65 | 66 | for read in inbam: 67 | if read.query_name in read_ids: 68 | if not read.is_secondary and not read.is_supplementary: 69 | outbam.write(read) 70 | elif read.is_unmapped or read.mate_is_unmapped: 71 | if not read.is_secondary and not read.is_supplementary: 72 | outunmappedbam.write(read) 73 | 74 | inbam.close() 75 | outbam.close() 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /CGC/report_l1_exp_counts.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 
23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon\tpassive_sense\tpassive_antisense\tantisense") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | print_string += '\t'+str(total*X_est[only_name]/proper_pairs_in_original_bam*10**6) 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | print_string += '\t'+str(total*X_est[runon_name]/proper_pairs_in_original_bam*10**6) 56 | runthroughS_name = seq_name+'_senserunthrough' 57 | if runthroughS_name not in X_est: 58 | X_est[runthroughS_name]=0.0 59 | print_string += '\t'+str(total*X_est[runthroughS_name]/proper_pairs_in_original_bam*10**6) 60 | runthroughA_name = seq_name+'_antisenserunthrough' 61 | if runthroughA_name not in X_est: 62 | X_est[runthroughA_name]=0.0 63 | print_string += '\t'+str(total*X_est[runthroughA_name]/proper_pairs_in_original_bam*10**6) 64 | antisense_name = seq_name+'_antisense' 65 | if antisense_name not in X_est: 66 | X_est[antisense_name]=0.0 67 | print_string += '\t'+str(total*X_est[antisense_name]/proper_pairs_in_original_bam*10**6) 68 | print(print_string) 69 | -------------------------------------------------------------------------------- /CGC/report_l1_exp_counts_unstranded.py: -------------------------------------------------------------------------------- 1 | # On 
Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon\tpassive") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | print_string += '\t'+str(total*X_est[only_name]/proper_pairs_in_original_bam*10**6) 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | print_string += '\t'+str(total*X_est[runon_name]/proper_pairs_in_original_bam*10**6) 56 | runthrough_name = seq_name+'_runthrough' 57 | if runthrough_name not in X_est: 58 | X_est[runthrough_name]=0.0 59 | print_string += '\t'+str(total*X_est[runthrough_name]/proper_pairs_in_original_bam*10**6) 60 | print(print_string) 61 | -------------------------------------------------------------------------------- /CGC/total_orf1_and_orf2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | min_FPM = float(sys.argv[5]) 13 | allowed_runthrough_fraction = float(sys.argv[6]) 14 | 15 | l1pa_pairs = dict() 16 | mapped_pairs = dict() 17 | 18 | orf1_intact = set() 19 | for line in open(orf1_list): 20 | orf1_intact.add(line.strip()) 21 | orf2_intact = set() 22 | for line in open(orf2_list): 23 | orf2_intact.add(line.strip()) 24 | 25 | for line in open(bam_info_list): 26 | name = line.strip().split('/')[-1][:-4] 27 | baminfo = open(line.strip()).readlines() 28 | mapped_pairs[name] = int(baminfo[1]) 29 | l1pa_pairs[name] = int(baminfo[2]) 30 | 31 | print('name\torf1_FPM\tORF2_FPM\tboth_FPM\tL1HS_expression_FPM\tL1HS_all_FPM') 32 | 33 | for line in open(exp_prob_pkls_list): 34 | names_file, X_file = line.strip().split('\t') 35 | sample_name = names_file.split('/')[-1][:-16] 36 | exp_prob = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 37 | orf1 = 0.0 38 | orf2 = 0.0 39 | both = 0.0 40 | L1HS_exp = 0.0 41 | L1HS_all = 0.0 42 | for transcript in exp_prob: 43 | if 'L1HS' in transcript: 44 | L1HS_all += exp_prob[transcript]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 45 | if 'only' not in transcript: 46 | continue 47 | seq_name = '_'.join(transcript.split('_')[:-1]) 48 | only_name = seq_name+'_only' 49 | runon_name = seq_name+'_3prunon' 50 | runthrough_name = seq_name+'_runthrough' 51 | FPM = 0.0 52 | FPM += exp_prob[only_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 53 | if runon_name in exp_prob: 54 | FPM += exp_prob[runon_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 55 | if runthrough_name in exp_prob: 56 | runthrough_FPM = exp_prob[runthrough_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 57 | else: 58 | runthrough_FPM = 0.0 59 | FPM *= FPM >= min_FPM and 
runthrough_FPM/(runthrough_FPM+FPM) <= allowed_runthrough_fraction 60 | if seq_name.split('(')[0][:-2] in orf1_intact: 61 | orf1 += FPM 62 | if seq_name.split('(')[0][:-2] in orf2_intact: 63 | orf2 += FPM 64 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 65 | both += FPM 66 | if 'L1HS' in seq_name: 67 | L1HS_exp += FPM 68 | print(sample_name +'\t'+ str(orf1) +'\t'+ str(orf2) +'\t'+ str(both) +'\t'+ str(L1HS_exp) +'\t'+ str(L1HS_all)) 69 | -------------------------------------------------------------------------------- /CGC/total_orf1_and_orf2_stranded.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | min_FPM = float(sys.argv[5]) 13 | allowed_runthrough_fraction = float(sys.argv[6]) 14 | 15 | l1pa_pairs = dict() 16 | mapped_pairs = dict() 17 | 18 | orf1_intact = set() 19 | for line in open(orf1_list): 20 | orf1_intact.add(line.strip()) 21 | orf2_intact = set() 22 | for line in open(orf2_list): 23 | orf2_intact.add(line.strip()) 24 | 25 | for line in open(bam_info_list): 26 | name = line.strip().split('/')[-1][:-4] 27 | baminfo = open(line.strip()).readlines() 28 | mapped_pairs[name] = int(baminfo[1]) 29 | l1pa_pairs[name] = int(baminfo[2]) 30 | 31 | print('name\torf1_FPM\tORF2_FPM\tboth_FPM\tL1HS_expression_FPM\tL1HS_all_FPM') 32 | 33 | for line in open(exp_prob_pkls_list): 34 | names_file, X_file = line.strip().split('\t') 35 | sample_name = names_file.split('/')[-1][:-16] 36 | exp_prob = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 37 | orf1 = 0.0 38 | orf2 = 0.0 39 | both = 0.0 40 | L1HS_exp = 0.0 41 | L1HS_all 
= 0.0 42 | for transcript in exp_prob: 43 | if 'L1HS' in transcript: 44 | L1HS_all += exp_prob[transcript]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 45 | if 'only' not in transcript: 46 | continue 47 | seq_name = '_'.join(transcript.split('_')[:-1]) 48 | only_name = seq_name+'_only' 49 | runon_name = seq_name+'_3prunon' 50 | runthrough_name = seq_name+'_senserunthrough' 51 | FPM = 0.0 52 | FPM += exp_prob[only_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 53 | if runon_name in exp_prob: 54 | FPM += exp_prob[runon_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 55 | if runthrough_name in exp_prob: 56 | runthrough_FPM = exp_prob[runthrough_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 57 | else: 58 | runthrough_FPM = 0.0 59 | FPM *= FPM >= min_FPM and runthrough_FPM/(runthrough_FPM+FPM) <= allowed_runthrough_fraction 60 | if seq_name.split('(')[0][:-2] in orf1_intact: 61 | orf1 += FPM 62 | if seq_name.split('(')[0][:-2] in orf2_intact: 63 | orf2 += FPM 64 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 65 | both += FPM 66 | if 'L1HS' in seq_name: 67 | L1HS_exp += FPM 68 | print(sample_name +'\t'+ str(orf1) +'\t'+ str(orf2) +'\t'+ str(both) +'\t'+ str(L1HS_exp) +'\t'+ str(L1HS_all)) 69 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.5.12 2 | 3 | WORKDIR / 4 | 5 | RUN conda install -y --override-channels -c bioconda -c conda-forge -c defaults python=2.7.15 bwa=0.7.17 samtools=1.9 numpy=1.14.3 scipy=1.1.0 pysam=0.15.0 bedtools=2.27.1 6 | RUN git clone https://github.com/FenyoLab/L1EM/ 7 | 8 | -------------------------------------------------------------------------------- /L1EM.yml: -------------------------------------------------------------------------------- 1 | name: L1EM 2 | channels: 3 | - 
bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=2.7.15 8 | - bwa=0.7.17 9 | - samtools=1.9 10 | - numpy=1.14.3 11 | - scipy=1.1.0 12 | - pysam=0.15.0 13 | - bedtools=2.27.1 14 | 15 | -------------------------------------------------------------------------------- /L1EM/G_of_R.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | import numpy 4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | from scipy import sparse 10 | import datetime 11 | import argparse 12 | 13 | """ 14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference. 15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts 16 | and the entries are the likelihood of that read arising from that transcript. 17 | The matrix is pickled and saved. The column names are writted to a text file. 18 | 19 | Part of the L1-EM package. 20 | 21 | Copyright (C) 2019 Wilson McKerrow 22 | 23 | This program is free software: you can redistribute it and/or modify 24 | it under the terms of the GNU General Public License as published by 25 | the Free Software Foundation, either version 3 of the License, or 26 | (at your option) any later version. 27 | 28 | This program is distributed in the hope that it will be useful, 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 | GNU General Public License for more details. 32 | 33 | You should have received a copy of the GNU General Public License 34 | along with this program. If not, see . 
35 | 36 | """ 37 | 38 | """ 39 | This class stores relevant information about a read's potential alignment as a dictionary 40 | with references names as keys and as list of potential alignments to that reference name 41 | as values. 42 | """ 43 | class read_alignments(object): 44 | def __init__(self, alignment,rnames,P): 45 | self.alignments = dict() 46 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 47 | # Add a new alignment, passing a pysam aligned_segnment object. 48 | def add(self, alignment,rnames,P): 49 | if rnames[alignment.rname] not in self.alignments: 50 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 51 | else: 52 | self.alignments[rnames[alignment.rname]].append(alignment_at_name(alignment.reference_start,alignment.is_reverse,P)) 53 | # Add a new alignment, passing the output of parseXA. 54 | def addXA(self,refname,start,is_reverse,P): 55 | if refname not in self.alignments: 56 | self.alignments[refname] = [alignment_at_name(start,is_reverse,P)] 57 | else: 58 | self.alignments[refname].append(alignment_at_name(start,is_reverse,P)) 59 | 60 | # Stores position, strand and likelihood for an alignment. 61 | class alignment_at_name(object): 62 | def __init__(self,start,is_reverse,P): 63 | self.start = start 64 | self.is_reverse = is_reverse 65 | self.P = P 66 | 67 | # Read command line arguments 68 | def GetArgs(): 69 | 70 | def ParseArgs(parser): 71 | class Parser(argparse.ArgumentParser): 72 | def error(self, message): 73 | sys.stderr.write('error: %s\n' % message) 74 | self.print_help() 75 | sys.exit(2) 76 | 77 | parser.add_argument('-b', '--bamfile', 78 | type=str, 79 | required=True, 80 | help='Bam to generate alignments from. Required.') 81 | parser.add_argument('-e', '--error_prob', 82 | required=False, 83 | default=0.01, 84 | type=float, 85 | help='Probability of an alignment mismatch. 
[0.01]') 86 | parser.add_argument('-m', '--max_start2start_len', 87 | required=False, 88 | default=500, 89 | type=int, 90 | help='Maximium distance between read starts to be considered concordant. [500]') 91 | parser.add_argument('-r', '--reads_per_pickle', 92 | required=False, 93 | default=12500, 94 | type=int, 95 | help='Split output into chunks of this many reads. [12500]') 96 | parser.add_argument('-p', '--prefix', 97 | required=False, 98 | default='G_of_R', 99 | type=str, 100 | help='Prefix for output file(s) [G_of_R]') 101 | parser.add_argument('-n', '--NMdiff', 102 | required=False, 103 | default=2, 104 | type=int, 105 | help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]') 106 | parser.add_argument('-i', '--insert_mean', 107 | required=True, 108 | type=float, 109 | help='Median template length. Required.') 110 | parser.add_argument('--flanking', 111 | required=False, 112 | default=400, 113 | type=int, 114 | help='Number of flanking bases included on each end of repeats in reference fasta. [400]') 115 | parser.add_argument('--as_start', 116 | required=False, 117 | default=500, 118 | type=int, 119 | help='Position of the antisense TSS in L1. [500]') 120 | parser.add_argument('-w', '--wiggle', 121 | required=False, 122 | default=20, 123 | type=int, 124 | help='Extend L1 annotation this many bases in both directions. [20]') 125 | parser.add_argument('--min_len', 126 | required=False, 127 | default=500, 128 | type=int, 129 | help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]') 130 | parser.add_argument('--min_exon_len', 131 | required=False, 132 | default=100, 133 | type=int, 134 | help='When alignments probabilities are normalized for exon length take max of elements length and this value. 
[100]') 135 | return parser.parse_args() 136 | 137 | parser = argparse.ArgumentParser() 138 | args = ParseArgs(parser) 139 | 140 | return args.bamfile, args.error_prob, args.max_start2start_len, args.reads_per_pickle, args.prefix, args.NMdiff, args.insert_mean, args.flanking, args.as_start,args.wiggle, args.min_len, args.min_exon_len 141 | 142 | """ 143 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse 144 | row matrix with the likelihoods of all properly paired alignments. 145 | """ 146 | def get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len): 147 | this_G_of_R = numpy.zeros(5*nreps) 148 | for refname in alignments1.alignments: 149 | if refname not in alignments2.alignments: 150 | continue 151 | for aln1 in alignments1.alignments[refname]: 152 | for aln2 in alignments2.alignments[refname]: 153 | if aln1.is_reverse == aln2.is_reverse: 154 | continue 155 | if max(aln1.start,aln2.start)-min(aln1.start,aln2.start) <= max_start2start_len: 156 | has_5pUTR = refname.split('.')[1]=='1' 157 | if refname.split('.')[1]=='2': 158 | this_G_of_R[2*nreps+rnames_index[refname]] += aln1.P*aln2.P/(max(rlens[rnames_index[refname]]-insert_mean,min_exon_len)) 159 | continue 160 | is_sense = not aln2.is_reverse 161 | within_5p = min(aln1.start,aln2.start) > flanking -wiggle 162 | within_3p = max(aln1.start,aln2.start)+read_length < rlens[rnames_index[refname]]-flanking +wiggle 163 | overlap_element = max(aln1.start,aln2.start)+read_length > flanking and min(aln1.start,aln2.start) < rlens[rnames_index[refname]]-flanking 164 | if not overlap_element: 165 | continue 166 | if is_sense: 167 | this_G_of_R[rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+insert_mean+2*wiggle) 168 | if not is_sense: 169 | this_G_of_R[nreps+rnames_index[refname]] += 
aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+insert_mean+2*wiggle) 170 | if within_5p and within_3p and is_sense and has_5pUTR: 171 | this_G_of_R[2*nreps+rnames_index[refname]] += aln1.P*aln2.P/max(rlens[rnames_index[refname]]-2*flanking-insert_mean+2*wiggle,min_len) 172 | if within_5p and is_sense and has_5pUTR: 173 | this_G_of_R[3*nreps+rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+2*wiggle) 174 | if has_5pUTR and rlens[rnames_index[refname]] > flanking+as_start and max(aln1.start,aln2.start)+read_length < flanking+as_start and (not is_sense) and rlens[rnames_index[refname]] > flanking+as_start: 175 | this_G_of_R[4*nreps+rnames_index[refname]] += aln1.P*aln2.P/(as_start+insert_mean+wiggle) 176 | return sparse.csr_matrix(this_G_of_R) 177 | 178 | # Parse secondary alignments in the XA tag from bwa aln. 179 | def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed): 180 | for aln in [x.split(',') for x in XAtagdict.split(';')[:-1]]: 181 | refname = aln[0] 182 | #if not reversed: 183 | # is_reverse = aln[1][0] == '-' 184 | #else: 185 | # is_reverse = aln[1][0] == '+' 186 | is_reverse = aln[1][0] == '-' 187 | start = int(aln[1][1:]) 188 | cigarstring = aln[2] 189 | NM = int(aln[3]) 190 | if NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring: 191 | P = error_prob**NM 192 | alignments.addXA(refname,start,is_reverse,P) 193 | return alignments 194 | 195 | def main(): 196 | bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len = GetArgs() 197 | 198 | pickle_num = 0 199 | 200 | bam = pysam.Samfile(bamfile, "rb") 201 | rnames = bam.references 202 | rlens = bam.lengths 203 | nreps = len(rnames) 204 | rnames_index = dict() 205 | for i in range(nreps): 206 | rnames_index[rnames[i]] = i 207 | 208 | # Write transcript (column) names 209 | TEnamefile = open(prefix+'_TE_list.txt','w') 210 | for i in range(nreps): 211 | 
TEnamefile.write(rnames[i]+'_senserunthrough'+'\t'+str(rlens[i]+2*flanking)+'\n') 212 | for i in range(nreps): 213 | TEnamefile.write(rnames[i]+'_antisenserunthrough'+'\t'+str(rlens[i]+2*flanking)+'\n') 214 | for i in range(nreps): 215 | TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n') 216 | for i in range(nreps): 217 | TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n') 218 | for i in range(nreps): 219 | TEnamefile.write(rnames[i]+'_antisense'+'\t'+str(flanking+as_start)+'\n') 220 | TEnamefile.close() 221 | 222 | read_id = None 223 | 224 | G_of_R = None 225 | G_of_R_list_file = open(prefix+'_list.txt','w') 226 | G_of_R_row = 0 227 | 228 | starttime = datetime.datetime.now() 229 | 230 | # Read through the name sorted bam file 231 | for alignment in bam: 232 | read_length = alignment.query_length 233 | # Throw out alignments that are unmapped, clipped or low quality 234 | if alignment.is_unmapped: 235 | continue 236 | if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring: 237 | continue 238 | if numpy.mean(alignment.query_qualities) < 30: 239 | continue 240 | 241 | if not read_id: 242 | read_id = alignment.qname 243 | new_read_id1 = True 244 | new_read_id2 = True 245 | 246 | # Once we have read all entries for a given query name, create a row for that fragment 247 | if read_id != alignment.qname: 248 | if not (new_read_id1 or new_read_id2): 249 | this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len) 250 | if this_G_of_R.nnz > 0: 251 | if G_of_R_row > 0: 252 | G_of_R = sparse.vstack([G_of_R,this_G_of_R]) 253 | else: 254 | G_of_R = this_G_of_R 255 | G_of_R_row += 1 256 | # If necessary, break up matrix into multiple pickle files. 
257 | if G_of_R_row >= reads_per_pickle: 258 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 259 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 260 | pickle_num += 1 261 | G_of_R_row = 0 262 | G_of_R = None 263 | print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime)) 264 | starttime = datetime.datetime.now() 265 | 266 | read_id = alignment.qname 267 | new_read_id1 = True 268 | new_read_id2 = True 269 | 270 | # Parse primary alignment 271 | # There's a bug in bwa samse (0.7.17) when writing NM tag for overlapping read pairs 272 | NMtag = dict(alignment.tags)['XM'] 273 | for pair in alignment.cigartuples: 274 | NMtag += (pair[0]>0)*pair[1] 275 | P = error_prob**NMtag 276 | 277 | if alignment.is_read1: 278 | if new_read_id1: 279 | alignments1 = read_alignments(alignment,rnames,P) 280 | new_read_id1 = False 281 | else: 282 | alignments1.add(alignment,rnames,P) 283 | else: 284 | if new_read_id2: 285 | alignments2 = read_alignments(alignment,rnames,P) 286 | new_read_id2 = False 287 | else: 288 | alignments2.add(alignment,rnames,P) 289 | 290 | # Parse secondary alignments 291 | if 'XA' in dict(alignment.tags): 292 | if alignment.is_read1: 293 | alignments1 = parseXA(alignments1,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse) 294 | else: 295 | alignments2 = parseXA(alignments2,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse) 296 | 297 | # Make row for last read 298 | if read_id and not (new_read_id1 or new_read_id2): 299 | this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len) 300 | if this_G_of_R.nnz > 0: 301 | if G_of_R_row > 0: 302 | G_of_R = sparse.vstack([G_of_R,this_G_of_R]) 303 | else: 304 | G_of_R = this_G_of_R 305 | 306 | # Write matrix to disk. 
307 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 308 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 309 | print(G_of_R_row+reads_per_pickle*pickle_num) 310 | 311 | if __name__ == '__main__': 312 | main() 313 | -------------------------------------------------------------------------------- /L1EM/G_of_R_single_unstranded.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | import numpy 4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | from scipy import sparse 10 | import datetime 11 | import argparse 12 | 13 | """ 14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference. 15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts 16 | and the entries are the likelihood of that read arising from that transcript. 17 | The matrix is pickled and saved. The column names are writted to a text file. 18 | 19 | Copyright (C) 2019 Wilson McKerrow 20 | 21 | This program is free software: you can redistribute it and/or modify 22 | it under the terms of the GNU General Public License as published by 23 | the Free Software Foundation, either version 3 of the License, or 24 | (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, 27 | but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | GNU General Public License for more details. 30 | 31 | You should have received a copy of the GNU General Public License 32 | along with this program. If not, see . 
33 | 34 | """ 35 | 36 | """ 37 | This class stores relevant information about a read's potential alignment as a dictionary 38 | with references names as keys and as list of potential alignments to that reference name 39 | as values. 40 | """ 41 | class read_alignments(object): 42 | def __init__(self, alignment,rnames,P): 43 | self.alignments = dict() 44 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 45 | # Add a new alignment, passing a pysam aligned_segnment object. 46 | def add(self, alignment,rnames,P): 47 | if rnames[alignment.rname] not in self.alignments: 48 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 49 | else: 50 | self.alignments[rnames[alignment.rname]].append(alignment_at_name(alignment.reference_start,alignment.is_reverse,P)) 51 | # Add a new alignment, passing the output of parseXA. 52 | def addXA(self,refname,start,is_reverse,P): 53 | if refname not in self.alignments: 54 | self.alignments[refname] = [alignment_at_name(start,is_reverse,P)] 55 | else: 56 | self.alignments[refname].append(alignment_at_name(start,is_reverse,P)) 57 | 58 | # Stores position, strand and likelihood for an alignment. 59 | class alignment_at_name(object): 60 | def __init__(self,start,is_reverse,P): 61 | self.start = start 62 | self.is_reverse = is_reverse 63 | self.P = P 64 | 65 | # Read command line arguments 66 | def GetArgs(): 67 | 68 | def ParseArgs(parser): 69 | class Parser(argparse.ArgumentParser): 70 | def error(self, message): 71 | sys.stderr.write('error: %s\n' % message) 72 | self.print_help() 73 | sys.exit(2) 74 | 75 | parser.add_argument('-b', '--bamfile', 76 | type=str, 77 | required=True, 78 | help='Bam to generate alignments from. Required.') 79 | parser.add_argument('-e', '--error_prob', 80 | required=False, 81 | default=0.01, 82 | type=float, 83 | help='Probability of an alignment mismatch. 
[0.01]') 84 | parser.add_argument('-r', '--reads_per_pickle', 85 | required=False, 86 | default=12500, 87 | type=int, 88 | help='Split output into chunks of this many reads. [12500]') 89 | parser.add_argument('-p', '--prefix', 90 | required=False, 91 | default='G_of_R', 92 | type=str, 93 | help='Prefix for output file(s) [G_of_R]') 94 | parser.add_argument('-n', '--NMdiff', 95 | required=False, 96 | default=2, 97 | type=int, 98 | help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]') 99 | parser.add_argument('--flanking', 100 | required=False, 101 | default=400, 102 | type=int, 103 | help='Number of flanking bases included on each end of repeats in reference fasta. [400]') 104 | parser.add_argument('-w', '--wiggle', 105 | required=False, 106 | default=20, 107 | type=int, 108 | help='Extend L1 annotation this many bases in both directions. [20]') 109 | parser.add_argument('--min_len', 110 | required=False, 111 | default=500, 112 | type=int, 113 | help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]') 114 | parser.add_argument('--min_exon_len', 115 | required=False, 116 | default=100, 117 | type=int, 118 | help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]') 119 | return parser.parse_args() 120 | 121 | parser = argparse.ArgumentParser() 122 | args = ParseArgs(parser) 123 | 124 | return args.bamfile, args.error_prob, args.reads_per_pickle, args.prefix, args.NMdiff, args.flanking, args.wiggle, args.min_len, args.min_exon_len 125 | 126 | """ 127 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse 128 | row matrix with the likelihoods of all properly paired alignments. 
# NOTE(review): the docstring fragment above was copy-pasted from the
# paired-end version ("two paired reads ... properly paired alignments");
# the accurate single-end contract is documented on the function itself.
def make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len):
    """Build one sparse likelihood row for a single (unpaired) read.

    Takes the alignments (read_alignments class) of ONE read and returns a
    1 x (3*nreps) scipy CSR row of alignment likelihoods, where columns
    [0, nreps) are the '_runthrough' transcripts, [nreps, 2*nreps) the
    '_only' transcripts and [2*nreps, 3*nreps) the '_3prunon' transcripts
    (matching the column order written to the *_TE_list.txt file).
    Each likelihood is the alignment probability aln.P divided by the
    effective length of the candidate transcript.
    """
    this_G_of_R = numpy.zeros(3*nreps)
    for refname in alignments.alignments:
        for aln in alignments.alignments[refname]:
            # A '.1' in the second dot-field of the reference name flags an
            # element with an intact 5' UTR (eligible for only/3prunon).
            has_5pUTR = refname.split('.')[1]=='1'
            # Read must start past the 5' flank and end before the 3' flank
            # (each relaxed by 'wiggle' bases) to count as within the element.
            within_5p = aln.start > flanking -wiggle
            within_3p = aln.start+read_length < rlens[rnames_index[refname]]-flanking +wiggle
            # Reads entirely inside the flanking sequence are discarded.
            overlap_element = aln.start+read_length > flanking and aln.start < rlens[rnames_index[refname]]-flanking
            if not overlap_element:
                continue
            # Runthrough: any overlapping read, normalized by full span.
            this_G_of_R[rnames_index[refname]] += aln.P/(rlens[rnames_index[refname]]-2*flanking+read_length+2*wiggle)
            # Only (5' and 3' intact): floor the denominator at min_len.
            if within_5p and within_3p and has_5pUTR:
                this_G_of_R[1*nreps+rnames_index[refname]] += aln.P/max(rlens[rnames_index[refname]]-2*flanking-read_length+2*wiggle,min_len)
            # 3' runon: 5' end intact, may extend past the 3' boundary.
            if within_5p and has_5pUTR:
                this_G_of_R[2*nreps+rnames_index[refname]] += aln.P/(rlens[rnames_index[refname]]-2*flanking+2*wiggle)
    return sparse.csr_matrix(this_G_of_R)

# Parse secondary alignments in the XA tag from bwa aln.
148 | def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed): 149 | for aln in [x.split(',') for x in XAtagdict.split(';')[:-1]]: 150 | refname = aln[0] 151 | #if not reversed: 152 | # is_reverse = aln[1][0] == '-' 153 | #else: 154 | # is_reverse = aln[1][0] == '+' 155 | is_reverse = aln[1][0] == '-' 156 | start = int(aln[1][1:]) 157 | cigarstring = aln[2] 158 | NM = int(aln[3]) 159 | if NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring: 160 | P = error_prob**NM 161 | alignments.addXA(refname,start,is_reverse,P) 162 | return alignments 163 | 164 | def main(): 165 | bamfile, error_prob, reads_per_pickle, prefix, NMdiff, flanking, wiggle, min_len, min_exon_len = GetArgs() 166 | 167 | pickle_num = 0 168 | 169 | bam = pysam.Samfile(bamfile, "rb") 170 | rnames = bam.references 171 | rlens = bam.lengths 172 | nreps = len(rnames) 173 | rnames_index = dict() 174 | for i in range(nreps): 175 | rnames_index[rnames[i]] = i 176 | 177 | # Write transcript (column) names 178 | TEnamefile = open(prefix+'_TE_list.txt','w') 179 | for i in range(nreps): 180 | TEnamefile.write(rnames[i]+'_runthrough'+'\t'+str(rlens[i]+2*flanking)+'\n') 181 | for i in range(nreps): 182 | TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n') 183 | for i in range(nreps): 184 | TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n') 185 | TEnamefile.close() 186 | 187 | read_id = None 188 | 189 | G_of_R = None 190 | G_of_R_list_file = open(prefix+'_list.txt','w') 191 | G_of_R_row = 0 192 | 193 | starttime = datetime.datetime.now() 194 | 195 | # Read through the name sorted bam file 196 | for alignment in bam: 197 | read_length = alignment.query_length 198 | # Throw out alignments that are unmapped, clipped or low quality 199 | if alignment.is_unmapped: 200 | continue 201 | if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring: 
202 | continue 203 | if numpy.mean(alignment.query_qualities) < 30: 204 | continue 205 | 206 | if not read_id: 207 | read_id = alignment.qname 208 | new_read_id = True 209 | 210 | # Once we have read all entries for a given query name, create a row for that fragment 211 | if read_id != alignment.qname: 212 | if not new_read_id: 213 | this_G_of_R = make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len) 214 | # Don't add row if its empty 215 | if this_G_of_R.nnz > 0: 216 | if G_of_R_row > 0: 217 | G_of_R = sparse.vstack([G_of_R,this_G_of_R]) 218 | else: 219 | G_of_R = this_G_of_R 220 | G_of_R_row += 1 221 | # If necessary, break up matrix into multiple pickle files. 222 | if G_of_R_row >= reads_per_pickle: 223 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 224 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 225 | pickle_num += 1 226 | G_of_R_row = 0 227 | G_of_R = None 228 | print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime)) 229 | starttime = datetime.datetime.now() 230 | 231 | read_id = alignment.qname 232 | new_read_id = True 233 | 234 | # Parse primary alignment 235 | # There's a bug in bwa samse (0.7.17) when writing NM tag for overlapping read pairs 236 | NMtag = dict(alignment.tags)['NM'] 237 | P = error_prob**NMtag 238 | 239 | if new_read_id: 240 | alignments = read_alignments(alignment,rnames,P) 241 | new_read_id = False 242 | else: 243 | alignments.add(alignment,rnames,P) 244 | 245 | # Parse secondary alignments 246 | if 'XA' in dict(alignment.tags): 247 | alignments = parseXA(alignments,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse) 248 | 249 | # Make row for last read 250 | if not new_read_id: 251 | this_G_of_R = make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len) 252 | if this_G_of_R.nnz > 0: 253 | if G_of_R_row > 0: 254 | G_of_R 
= sparse.vstack([G_of_R,this_G_of_R]) 255 | else: 256 | G_of_R = this_G_of_R 257 | 258 | # Write matrix to disk. 259 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 260 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 261 | print(G_of_R_row+reads_per_pickle*pickle_num) 262 | 263 | if __name__ == '__main__': 264 | main() 265 | -------------------------------------------------------------------------------- /L1EM/G_of_R_unstranded.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | import numpy 4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | from scipy import sparse 10 | import datetime 11 | import argparse 12 | 13 | """ 14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference. 15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts 16 | and the entries are the likelihood of that read arising from that transcript. 17 | The matrix is pickled and saved. The column names are writted to a text file. 18 | 19 | Copyright (C) 2019 Wilson McKerrow 20 | 21 | This program is free software: you can redistribute it and/or modify 22 | it under the terms of the GNU General Public License as published by 23 | the Free Software Foundation, either version 3 of the License, or 24 | (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, 27 | but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | GNU General Public License for more details. 30 | 31 | You should have received a copy of the GNU General Public License 32 | along with this program. If not, see . 
33 | 34 | """ 35 | 36 | """ 37 | This class stores relevant information about a read's potential alignment as a dictionary 38 | with references names as keys and as list of potential alignments to that reference name 39 | as values. 40 | """ 41 | class read_alignments(object): 42 | def __init__(self, alignment,rnames,P): 43 | self.alignments = dict() 44 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 45 | # Add a new alignment, passing a pysam aligned_segnment object. 46 | def add(self, alignment,rnames,P): 47 | if rnames[alignment.rname] not in self.alignments: 48 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 49 | else: 50 | self.alignments[rnames[alignment.rname]].append(alignment_at_name(alignment.reference_start,alignment.is_reverse,P)) 51 | # Add a new alignment, passing the output of parseXA. 52 | def addXA(self,refname,start,is_reverse,P): 53 | if refname not in self.alignments: 54 | self.alignments[refname] = [alignment_at_name(start,is_reverse,P)] 55 | else: 56 | self.alignments[refname].append(alignment_at_name(start,is_reverse,P)) 57 | 58 | # Stores position, strand and likelihood for an alignment. 59 | class alignment_at_name(object): 60 | def __init__(self,start,is_reverse,P): 61 | self.start = start 62 | self.is_reverse = is_reverse 63 | self.P = P 64 | 65 | # Read command line arguments 66 | def GetArgs(): 67 | 68 | def ParseArgs(parser): 69 | class Parser(argparse.ArgumentParser): 70 | def error(self, message): 71 | sys.stderr.write('error: %s\n' % message) 72 | self.print_help() 73 | sys.exit(2) 74 | 75 | parser.add_argument('-b', '--bamfile', 76 | type=str, 77 | required=True, 78 | help='Bam to generate alignments from. Required.') 79 | parser.add_argument('-e', '--error_prob', 80 | required=False, 81 | default=0.01, 82 | type=float, 83 | help='Probability of an alignment mismatch. 
# Read command line arguments
def GetArgs():
    """Parse command line options and return them as a flat tuple.

    NOTE(review): the original defined an unused argparse.ArgumentParser
    subclass with a custom error() that was never instantiated, so its
    print-help-on-error behavior never ran; the dead class is removed.
    """

    def ParseArgs(parser):
        parser.add_argument('-b', '--bamfile',
                            type=str,
                            required=True,
                            help='Bam to generate alignments from. Required.')
        parser.add_argument('-e', '--error_prob',
                            required=False,
                            default=0.01,
                            type=float,
                            help='Probability of an alignment mismatch. [0.01]')
        parser.add_argument('-m', '--max_start2start_len',
                            required=False,
                            default=500,
                            type=int,
                            help='Maximium distance between read starts to be considered concordant. [500]')
        parser.add_argument('-r', '--reads_per_pickle',
                            required=False,
                            default=12500,
                            type=int,
                            help='Split output into chunks of this many reads. [12500]')
        parser.add_argument('-p', '--prefix',
                            required=False,
                            default='G_of_R',
                            type=str,
                            help='Prefix for output file(s) [G_of_R]')
        parser.add_argument('-n', '--NMdiff',
                            required=False,
                            default=2,
                            type=int,
                            help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]')
        parser.add_argument('-i', '--insert_mean',
                            required=True,
                            type=float,
                            help='Median template length. Required.')
        parser.add_argument('--flanking',
                            required=False,
                            default=400,
                            type=int,
                            help='Number of flanking bases included on each end of repeats in reference fasta. [400]')
        parser.add_argument('--as_start',
                            required=False,
                            default=500,
                            type=int,
                            help='Position of the antisense TSS in L1. [500]')
        parser.add_argument('-w', '--wiggle',
                            required=False,
                            default=20,
                            type=int,
                            help='Extend L1 annotation this many bases in both directions. [20]')
        parser.add_argument('--min_len',
                            required=False,
                            default=500,
                            type=int,
                            help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]')
        parser.add_argument('--min_exon_len',
                            required=False,
                            default=100,
                            type=int,
                            help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]')
        return parser.parse_args()

    parser = argparse.ArgumentParser()
    args = ParseArgs(parser)

    return args.bamfile, args.error_prob, args.max_start2start_len, args.reads_per_pickle, args.prefix, args.NMdiff, args.insert_mean, args.flanking, args.as_start, args.wiggle, args.min_len, args.min_exon_len


"""
Takes as input alignments (read_alignments class) of two paired reads and returns a sparse
row matrix with the likelihoods of all properly paired alignments.
"""
def get_concardant_alignments(alignments1, alignments2, max_start2start_len, rnames_index,
                              rlens, insert_mean, nreps, read_length, flanking, as_start,
                              wiggle, min_len, min_exon_len):
    """Build one row of G(R) for a concordant read pair.

    The row has 3*nreps entries: columns [0, nreps) are the 'runthrough'
    transcripts, [nreps, 2*nreps) the 'only' transcripts and
    [2*nreps, 3*nreps) the "3' runon" transcripts, matching the order the
    TE list file is written in.  Each entry is the pair likelihood divided
    by the effective length of the corresponding transcript.

    NOTE(review): as_start is accepted for interface compatibility but is
    not used by this (unstranded) variant.
    """
    this_G_of_R = numpy.zeros(3 * nreps)
    for refname in alignments1.alignments:
        # Both mates must hit the same reference.
        if refname not in alignments2.alignments:
            continue
        for aln1 in alignments1.alignments[refname]:
            for aln2 in alignments2.alignments[refname]:
                # Proper pairs have mates on opposite strands.
                if aln1.is_reverse == aln2.is_reverse:
                    continue
                if max(aln1.start, aln2.start) - min(aln1.start, aln2.start) <= max_start2start_len:
                    # Reference names look like FAMILY.<flag>.locus; flag '1'
                    # marks an element with an intact 5' UTR, flag '2' an exon.
                    has_5pUTR = refname.split('.')[1] == '1'
                    if refname.split('.')[1] == '2':
                        this_G_of_R[2 * nreps + rnames_index[refname]] += aln1.P * aln2.P / (max(rlens[rnames_index[refname]] - insert_mean, min_exon_len))
                        continue
                    within_5p = min(aln1.start, aln2.start) > flanking - wiggle
                    within_3p = max(aln1.start, aln2.start) + read_length < rlens[rnames_index[refname]] - flanking + wiggle
                    overlap_element = max(aln1.start, aln2.start) + read_length > flanking and min(aln1.start, aln2.start) < rlens[rnames_index[refname]] - flanking
                    if not overlap_element:
                        continue
                    # Runthrough transcription: anywhere over the element.
                    this_G_of_R[rnames_index[refname]] += aln1.P * aln2.P / (rlens[rnames_index[refname]] - 2 * flanking + insert_mean + 2 * wiggle)
                    # Proper (element-only) transcription: fully inside.
                    if within_5p and within_3p and has_5pUTR:
                        this_G_of_R[1 * nreps + rnames_index[refname]] += aln1.P * aln2.P / max(rlens[rnames_index[refname]] - 2 * flanking - insert_mean + 2 * wiggle, min_len)
                    # 3' runon: starts inside the element, may run past the 3' end.
                    if within_5p and has_5pUTR:
                        this_G_of_R[2 * nreps + rnames_index[refname]] += aln1.P * aln2.P / (rlens[rnames_index[refname]] - 2 * flanking + 2 * wiggle)
    return sparse.csr_matrix(this_G_of_R)
# Parse secondary alignments in the XA tag from bwa aln.
def parseXA(alignments, XAtagdict, error_prob, maxNM, reversed):
    """Add each usable XA-tag secondary alignment to *alignments*.

    XA entries look like 'refname,[+-]pos,cigar,NM;...'.  Entries that are
    clipped ('S' or 'H' in the cigar) or whose edit distance exceeds maxNM
    are skipped.  The *reversed* flag of the primary alignment is accepted
    for interface compatibility but unused: the XA strand sign is taken at
    face value.  Returns *alignments*.
    """
    # The tag ends with a trailing ';', so the final split field is empty.
    for fields in (entry.split(',') for entry in XAtagdict.split(';')[:-1]):
        cigarstring = fields[2]
        edit_dist = int(fields[3])
        if edit_dist > maxNM or 'S' in cigarstring or 'H' in cigarstring:
            continue
        # fields[1] is the strand sign followed by the 1-based position.
        alignments.addXA(fields[0], int(fields[1][1:]), fields[1][0] == '-',
                         error_prob ** edit_dist)
    return alignments
def main():
    """Stream a name-sorted bwa bam and build the G(R) likelihood matrix.

    Rows are read pairs, columns are the 3*nreps candidate transcripts
    (runthrough / only / 3' runon for each reference).  The matrix is
    pickled to disk in chunks of reads_per_pickle rows; chunk paths are
    listed in <prefix>_list.txt and column names in <prefix>_TE_list.txt.
    """
    bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len = GetArgs()

    pickle_num = 0

    bam = pysam.Samfile(bamfile, "rb")
    rnames = bam.references
    rlens = bam.lengths
    nreps = len(rnames)
    rnames_index = dict()
    for i in range(nreps):
        rnames_index[rnames[i]] = i

    # Write transcript (column) names: one runthrough, one element-only and
    # one 3' runon entry per reference, matching the G(R) column blocks.
    TEnamefile = open(prefix+'_TE_list.txt','w')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_runthrough'+'\t'+str(rlens[i]+2*flanking)+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n')
    TEnamefile.close()

    read_id = None

    G_of_R = None
    G_of_R_list_file = open(prefix+'_list.txt','w')
    G_of_R_row = 0

    starttime = datetime.datetime.now()

    # Read through the name sorted bam file
    for alignment in bam:
        read_length = alignment.query_length
        # Throw out alignments that are unmapped, clipped or low quality
        if alignment.is_unmapped:
            continue
        if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring:
            continue
        if numpy.mean(alignment.query_qualities) < 30:
            continue

        if not read_id:
            read_id = alignment.qname
            new_read_id1 = True
            new_read_id2 = True

        # Once we have read all entries for a given query name, create a row
        # for that fragment.
        if read_id != alignment.qname:
            if not (new_read_id1 or new_read_id2):
                this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
                if this_G_of_R.nnz > 0:
                    if G_of_R_row > 0:
                        G_of_R = sparse.vstack([G_of_R,this_G_of_R])
                    else:
                        G_of_R = this_G_of_R
                    G_of_R_row += 1
            # If necessary, break up matrix into multiple pickle files.
            if G_of_R_row >= reads_per_pickle:
                pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
                G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
                pickle_num += 1
                G_of_R_row = 0
                G_of_R = None
                print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime))
                starttime = datetime.datetime.now()

            read_id = alignment.qname
            new_read_id1 = True
            new_read_id2 = True

        # Parse primary alignment.
        # There's a bug in bwa samse (0.7.17) when writing the NM tag for
        # overlapping read pairs, so recompute edit distance as XM plus
        # indel lengths from the cigar.
        tags = dict(alignment.tags)  # hoisted: was rebuilt three times per record
        NMtag = tags['XM']
        for pair in alignment.cigartuples:
            NMtag += (pair[0]>0)*pair[1]
        P = error_prob**NMtag

        if alignment.is_read1:
            if new_read_id1:
                alignments1 = read_alignments(alignment,rnames,P)
                new_read_id1 = False
            else:
                alignments1.add(alignment,rnames,P)
        else:
            if new_read_id2:
                alignments2 = read_alignments(alignment,rnames,P)
                new_read_id2 = False
            else:
                alignments2.add(alignment,rnames,P)

        # Parse secondary alignments listed in bwa's XA tag.
        if 'XA' in tags:
            if alignment.is_read1:
                alignments1 = parseXA(alignments1,tags['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)
            else:
                alignments2 = parseXA(alignments2,tags['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)

    # Make row for last read
    if read_id is not None and not (new_read_id1 or new_read_id2):
        this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
        if this_G_of_R.nnz > 0:
            if G_of_R_row > 0:
                G_of_R = sparse.vstack([G_of_R,this_G_of_R])
            else:
                G_of_R = this_G_of_R

    # Write the final (partial) matrix to disk.  Guard on the matrix itself
    # rather than the running read count so that an empty tail chunk no
    # longer pickles None (downstream tolerated it, but it wrote junk files).
    if G_of_R is not None:
        pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
        G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
    G_of_R_list_file.close()
    print(G_of_R_row+reads_per_pickle*pickle_num)

if __name__ == '__main__':
    main()
295 | if G_of_R_row+reads_per_pickle*pickle_num >0: 296 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 297 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 298 | print(G_of_R_row+reads_per_pickle*pickle_num) 299 | 300 | if __name__ == '__main__': 301 | main() 302 | -------------------------------------------------------------------------------- /L1EM/L1EM.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import numpy 7 | import sys 8 | import datetime 9 | from scipy import sparse 10 | from multiprocessing import Pool 11 | import argparse 12 | 13 | """ 14 | This code takes as input the output of G_of_R.py and runs the EM algorithm to estimate 15 | transcript abundances. 16 | 17 | Part of the L1-EM package. 18 | 19 | Copyright (C) 2019 Wilson McKerrow 20 | 21 | This program is free software: you can redistribute it and/or modify 22 | it under the terms of the GNU General Public License as published by 23 | the Free Software Foundation, either version 3 of the License, or 24 | (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, 27 | but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | GNU General Public License for more details. 30 | 31 | You should have received a copy of the GNU General Public License 32 | along with this program. If not, see . 
33 | """ 34 | 35 | # Main calculation for the E step 36 | def calculate_expcounts(G_of_R_pkl,X): 37 | G_of_R_file = open(G_of_R_pkl,'rb') 38 | G_of_R = pickle.load(G_of_R_file) 39 | G_of_R_file.close() 40 | if G_of_R == None: 41 | return 0.0,0.0 42 | L_of_R_mat = G_of_R.multiply(X) 43 | L_of_R = numpy.array(L_of_R_mat.sum(1)) 44 | L_of_R_mat = L_of_R_mat[L_of_R[:,0]>=10**-200,:] 45 | L_of_R = L_of_R[L_of_R>=10**-200] 46 | L_of_R_inv = sparse.csr_matrix(1.0/L_of_R).transpose() 47 | exp_counts = L_of_R_mat.multiply(L_of_R_inv).sum(0) 48 | loglik = numpy.sum(numpy.log(L_of_R)) 49 | if numpy.isfinite(loglik): 50 | return exp_counts,loglik 51 | else: 52 | return numpy.zeros(G_of_R.shape[1]),0.0 53 | 54 | # Divide send each thread a chunk of the G_of_R pkl files. 55 | def calculate_expcounts_chunk(input): 56 | G_of_R_pkl_list,X_len = input 57 | exp_counts = numpy.zeros(X_len.shape,dtype=numpy.float64) 58 | loglik = 0.0 59 | for G_of_R_pkl in G_of_R_pkl_list: 60 | this_exp_counts,this_loglik = calculate_expcounts(G_of_R_pkl,X_len) 61 | exp_counts += this_exp_counts 62 | loglik += this_loglik 63 | return exp_counts,loglik 64 | 65 | # Parse commandline arguments 66 | def GetArgs(): 67 | 68 | def ParseArgs(parser): 69 | class Parser(argparse.ArgumentParser): 70 | def error(self, message): 71 | sys.stderr.write('error: %s\n' % message) 72 | self.print_help() 73 | sys.exit(2) 74 | 75 | parser.add_argument('-g', '--G_of_R_list', 76 | type=str, 77 | required=True, 78 | help='Text file listing paths to chunks of the G(R) matrix.') 79 | parser.add_argument('-l', '--TE_list', 80 | required=True, 81 | type=str, 82 | help='Text file listing the names of all transcripts. Output of G_of_R.py.') 83 | parser.add_argument('-s', '--stop_thresh', 84 | required=False, 85 | default=10**-7, 86 | type=float, 87 | help='Continue EM iterations until no transcription expression fraction (X_i) changes by more than this value. 
# Parse commandline arguments
def GetArgs():
    """Parse command line options for the EM driver and return a flat tuple.

    NOTE(review): the original defined an unused argparse.ArgumentParser
    subclass with a custom error() that was never instantiated; the dead
    class is removed.
    """

    def ParseArgs(parser):
        parser.add_argument('-g', '--G_of_R_list',
                            type=str,
                            required=True,
                            help='Text file listing paths to chunks of the G(R) matrix.')
        parser.add_argument('-l', '--TE_list',
                            required=True,
                            type=str,
                            help='Text file listing the names of all transcripts. Output of G_of_R.py.')
        parser.add_argument('-s', '--stop_thresh',
                            required=False,
                            default=10**-7,
                            type=float,
                            help='Continue EM iterations until no transcription expression fraction (X_i) changes by more than this value. [1e-7]')
        parser.add_argument('-r', '--report_every',
                            required=False,
                            default=100,
                            type=int,
                            help='Write X every 100 steps. [100]')
        parser.add_argument('-m', '--max_nEMsteps',
                            required=False,
                            default=10000,
                            type=int,
                            help='Terminate if threshold has not been reached after this many EM steps [10000]')
        parser.add_argument('-t', '--nThreads',
                            required=False,
                            default=16,
                            type=int,
                            help='Divide E step into this many threads. [16]')
        parser.add_argument('-p', '--prefix',
                            required=False,
                            type=str,
                            default='',
                            help='If specified, this prefix will be used for output files.')
        return parser.parse_args()

    parser = argparse.ArgumentParser()
    args = ParseArgs(parser)

    return args.G_of_R_list, args.TE_list, args.stop_thresh, args.report_every, args.max_nEMsteps, args.nThreads, args.prefix


def main():
    """Run EM until convergence and pickle the expression estimates."""
    G_of_R_list, TE_list, stop_thresh, report_every, max_nEMsteps, nThreads, prefix = GetArgs()

    # All the transcripts names in the same order as the G_of_R matrix columns
    TE_names = list()
    for name in open(TE_list):
        TE_names.append(name.strip().split('\t')[0])

    # Initial guess: uniform expression over all transcripts.
    X = sparse.csr_matrix(numpy.ones((1,len(TE_names)),dtype=numpy.float64)/len(TE_names))

    # Split up the pickle files into a (nearly equal) set for each thread:
    # the first nlistsp1 threads get one extra file.
    G_of_R_pkl_fulllist = list()
    for G_of_R_pkl in open(G_of_R_list):
        G_of_R_pkl_fulllist.append(G_of_R_pkl.strip())
    G_of_R_pkl_lists = list()
    listsize = len(G_of_R_pkl_fulllist)//nThreads
    nlistsp1 = len(G_of_R_pkl_fulllist)%nThreads
    k = 0
    for i in range(nlistsp1):
        G_of_R_pkl_lists.append(G_of_R_pkl_fulllist[k:k+listsize+1])
        k+=listsize+1
    for i in range(nlistsp1,nThreads):
        G_of_R_pkl_lists.append(G_of_R_pkl_fulllist[k:k+listsize])
        k+=listsize

    masterPool = Pool(processes = nThreads)

    # Run the EM steps
    for step in range(max_nEMsteps):
        starttime = datetime.datetime.now()
        exp_counts = numpy.zeros((1,len(TE_names)),dtype=numpy.float64)
        loglik = 0.0

        # E step: expected counts summed over all chunks, in parallel.
        outputs = masterPool.map(calculate_expcounts_chunk,zip(G_of_R_pkl_lists,[X]*nThreads))
        for output in outputs:
            this_exp_counts,this_loglik = output
            exp_counts += this_exp_counts
            loglik += this_loglik

        # M step: new expression fractions proportional to expected counts.
        last_X = X.copy()
        X = sparse.csr_matrix(exp_counts/numpy.sum(exp_counts))
        print(str(step)+" "+str(numpy.max(numpy.abs(X.toarray()-last_X.toarray())))+" "+str(loglik)+" "+str(datetime.datetime.now()-starttime))

        # Periodic checkpoint of the nonzero entries of X.
        if (step+1) % report_every == 0:
            pickle.dump(X.toarray()[X.toarray() > 10**-10],open(prefix+'X_step_'+str(step+1)+'.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(numpy.array(TE_names)[X.toarray()[0,:] > 10**-10],open(prefix+'names_step_'+str(step+1)+'.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)

        if numpy.max(numpy.abs(X.toarray()-last_X.toarray())) < stop_thresh:
            break

    # Release the worker processes (the original leaked the pool).
    masterPool.close()
    masterPool.join()

    # Output the final results (nonzero entries only)
    pickle.dump(X.toarray()[X.toarray() > 10**-10],open(prefix+'X_final.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(numpy.array(TE_names)[X.toarray()[0,:] > 10**-10],open(prefix+'names_final.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)

if __name__ == '__main__':
    main()
'__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | TERMS AND CONDITIONS 2 | 3 | 0. Definitions. 4 | 5 | “This License” refers to version 3 of the GNU General Public License. 6 | 7 | “Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. 8 | 9 | “The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. 10 | 11 | To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. 12 | 13 | A “covered work” means either the unmodified Program or a work based on the Program. 14 | 15 | To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. 16 | 17 | To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 
18 | 19 | An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 20 | 21 | 1. Source Code. 22 | 23 | The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. 24 | 25 | A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 26 | 27 | The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 28 | 29 | The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. 30 | 31 | The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. 32 | 33 | The Corresponding Source for a work in source code form is that same work. 34 | 35 | 2. Basic Permissions. 36 | 37 | All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. 38 | 39 | You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. 
Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 40 | 41 | Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 42 | 43 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 44 | 45 | No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 46 | 47 | When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 48 | 49 | 4. Conveying Verbatim Copies. 50 | 51 | You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. 52 | 53 | You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 54 | 55 | 5. Conveying Modified Source Versions. 
56 | 57 | You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: 58 | 59 | a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 60 | b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. 61 | c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. 62 | d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 63 | A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 64 | 65 | 6. Conveying Non-Source Forms. 
66 | 67 | You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: 68 | 69 | a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 70 | b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. 71 | c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 72 | d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. 
If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. 73 | e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 74 | A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. 75 | 76 | A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. 77 | 78 | “Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. 
The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 79 | 80 | If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). 81 | 82 | The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 83 | 84 | Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 85 | 86 | 7. Additional Terms. 87 | 88 | “Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. 
If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 89 | 90 | When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 91 | 92 | Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: 93 | 94 | a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or 95 | b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or 96 | c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or 97 | d) Limiting the use for publicity purposes of names of licensors or authors of the material; or 98 | e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or 99 | f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. 100 | All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. 
If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 101 | 102 | If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. 103 | 104 | Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 105 | 106 | 8. Termination. 107 | 108 | You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). 109 | 110 | However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. 111 | 112 | Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. 
113 | 114 | Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 115 | 116 | 9. Acceptance Not Required for Having Copies. 117 | 118 | You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 119 | 120 | 10. Automatic Licensing of Downstream Recipients. 121 | 122 | Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. 123 | 124 | An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. 
125 | 126 | You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 127 | 128 | 11. Patents. 129 | 130 | A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's “contributor version”. 131 | 132 | A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. 133 | 134 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. 135 | 136 | In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
137 | 138 | If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. “Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. 139 | 140 | If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. 141 | 142 | A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. 143 | 144 | Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 145 | 146 | 12. No Surrender of Others' Freedom. 147 | 148 | If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 149 | 150 | 13. Use with the GNU Affero General Public License. 151 | 152 | Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 153 | 154 | 14. Revised Versions of this License. 155 | 156 | The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. 157 | 158 | Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 159 | 160 | If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. 161 | 162 | Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 163 | 164 | 15. Disclaimer of Warranty. 165 | 166 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 167 | 168 | 16. Limitation of Liability. 169 | 170 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 171 | 172 | 17. Interpretation of Sections 15 and 16. 173 | 174 | If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | ### conda way 3 | You will need 4 | 1. git (https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 5 | 2. 
anaconda (https://docs.anaconda.com/anaconda/install/) 6 | 7 | Download from github 8 | ``` 9 | git clone https://github.com/FenyoLab/L1EM 10 | ``` 11 | Create conda environment 12 | ``` 13 | cd L1EM 14 | conda env create -f L1EM.yml 15 | ``` 16 | 17 | Before running L1EM, activate the environment: 18 | ``` 19 | source activate L1EM 20 | ``` 21 | 22 | When finished, deactivate the environment: 23 | ``` 24 | source deactivate L1EM 25 | ``` 26 | 27 | ### old way 28 | Alternatively you can install the following dependencies yourself: 29 | * python version 2.7+ (version 2.7 tested) 30 | * bwa (version 0.7.17 tested) 31 | * samtools (version 1.9 tested) 32 | * numpy (version 1.14.3 tested) 33 | * scipy (version 1.1.0 tested) 34 | * pysam (version 0.15.0 tested) 35 | * bedtools (version 2.27.1 tested) 36 | 37 | No compiling of L1EM is necessary. Python scripts will be called from inside the L1EM 38 | directory. 39 | 40 | If necessary, you can specify the path for bwa and samtools in the run\_L1EM.sh script. 41 | You must use samtools >=1.0. Early version of pysam will not work. I highly recommend 42 | that you use bwa 0.7.17. Earlier versions may differ in how they write the XA tag. This 43 | will lead to inaccurate results without throwing an error. 44 | 45 | ## Quick guide 46 | ### First time: build L1EM reference 47 | You will need the hg38 reference genome in fasta format, with bwa index. 48 | Downloaded from UCSC genome browser: 49 | ``` 50 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz 51 | zcat hg38.fa.gz > hg38.fa 52 | bwa index hg38.fa 53 | ``` 54 | Note: this will take some time. 55 | 56 | Then you can build the L1EM reference using the provided shell script: 57 | ``` 58 | bash generate_L1EM_fasta_and_index.sh /fullpathto/hg38.fa 59 | ``` 60 | This should be done inside the L1EM directory 61 | 62 | ### Executing the L1-EM pipeline 63 | You will need a bam file with strand specific paired end read alignments to hg38. 
You can 64 | use any aligner, but make sure that all reads from the original fastq files are present. 65 | Trimming should be okay, but is not tested. Filtering reads will potentially break the pipeline. 66 | 67 | First move to an empty directory and then execute the shell script: 68 | ``` 69 | bash -e /fullpathto/run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 70 | ``` 71 | L1EM will write files with specific names, so do NOT run two instances of L1EM in the same 72 | directory. 73 | 74 | At the end of the run\_L1EM.sh script are a commented set of commands to delete all the 75 | intermediate files. If you wish to automatically delete intermediate files, you can delete 76 | these comments. 77 | 78 | ### Output 79 | At completion, three tab delimited tables will be written. 80 | 1. full\_counts.txt: raw count estimates for each L1HS/L1PA\* element with any aligned read pairs 81 | 2. l1hs\_transcript\_counts.txt: expression estimates for L1HS elements, reported as raw counts 82 | 3. filter\_L1HS\_FPM.txt: L1HS whose expression is supported by at least 100 read pairs, reported as FPM (read pairs per million properly aligned) 83 | 84 | The rows of all files are L1 loci. 85 | 86 | For full\_counts.txt each of the five transcript types: 87 | only, runon, passive (sense), passive (antisense), antisense 88 | are reported. 89 | 90 | For l1hs\_transcript\_counts.txt and filter\_L1HS\_FPM.txt only proper transcription from L1HS elements starting at the 91 | 5' UTR is reported. 92 | 93 | The results are also written as pickle files to facilitate further analysis in python. 
To 94 | generate a python dictionary with keys being the transcript names and values being the 95 | relative expression: 96 | ``` 97 | X_est = dict(zip(pickle.load(open('names_final.pkl')),pickle.load(open('X_final.pkl')))) 98 | ``` 99 | 100 | ## Additional details 101 | * Our Bioinformatics paper introducing L1EM: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btz724/5581349 102 | * More details can be found in manual.md 103 | 104 | ## Mouse Version 105 | Scripts and annotation to measure the expression of LINE-1 loci in mm39 has been added. The mouse version uses all the same methodology as the human version, but has not been as rigorously tested. 106 | 1. Download and index the mm39 reference genome (UCSC genome browser version) 107 | ``` 108 | wget http://hgdownload.cse.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz 109 | zcat mm39.fa.gz > mm39.fa 110 | bwa index mm39.fa 111 | ``` 112 | 2. Build the mm39 L1EM reference. 113 | ``` 114 | bash generate_mm39_L1EM_fasta_and_index.sh /fullpathto/mm39.fa 115 | ``` 116 | 3. Run L1EM. 117 | ``` 118 | bash /fullpathto/run_L1EM_mm39.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/mm39.fa 119 | ``` 120 | All L1Md loci are quantified in full\_counts.txt. Normalized expression of 5' UTR intact young (L1Md\_Tf I/II/II, L1Md\_Gf I/II, L1Md\_A I/II/III) LINE-1 loci supported by at least 100 reads can be found in filter\_active\_L1Md\_FPM.txt. 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /generate_L1EM_fasta_and_index.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # If you need to specify package directories 4 | bedtools=$(which bedtools) 5 | bwa=$(which bwa) 6 | 7 | # Command line 8 | hg38=$1 9 | 10 | $bedtools getfasta -s -name -fi $hg38 -bed annotation/L1EM.400.bed > annotation/L1EM.400.fa 11 | $bwa index annotation/L1EM.400.fa -------------------------------------------------------------------------------- /generate_mm39_L1EM_fasta_and_index.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # If you need to specify package directories 4 | bedtools=$(which bedtools) 5 | bwa=$(which bwa) 6 | 7 | # Command line 8 | mm39=$1 9 | 10 | $bedtools getfasta -s -name -fi $mm39 -bed annotation/mm39.L1EM.bed > annotation/mm39.L1EM.400.fa 11 | $bwa index annotation/mm39.L1EM.400.fa 12 | -------------------------------------------------------------------------------- /manual.md: -------------------------------------------------------------------------------- 1 | ## Pipeline Parameters 2 | 3 | The key parameters for L1EM are listed at the beginning of the run\_L1EM.sh file. Default parameters should work well in most cases, but advanced users may wish to tinker. 4 | 1. threads. Dictates the number of threads that L1EM will spawn. More threads will improve parallel performance, but memory usage scales linearly with number of threads. 5 | 2. realignNM. The number of mismatches to allow when trying to realign reads that do not align as proper pairs in the bam file provided. Default is 3, but you might want to increase for longer reads. 6 | 3. L1EM_NM. As above, but for the generation of candidate alignments to the L1EM reference. Including more candidate alignments will slow the computation, but too few candidate alignments could yield less accurate results. 7 | 4. NMdiff. Only consider alignments with at most this many more mismatches than the primary alignment. 
Because read likelihood diminishes exponentially with additional mismatches, increasing this parameter is unlikely to affect results but will slow the EM steps. 8 | 5. bwa\_i. By default bwa will create a large number of alignments with indels near the edge of the read. This parameter will prevent this behavior. You may wish to decrease this parameter for shorter reads. 9 | 6. error\_prob. Probability of an error. Error probability is chosen to be constant because computing the read likelihood from base quality scores is slow. 10 | 7. max\_start2start\_len=500. Maximum allowed fragment/template length. Increase if you are using data with very large fragments. 11 | 8. reads\_per\_pickle. The G(R) matrix is split into a number of pickle files, so the entire matrix doesn't need to sit in memory. Decreasing this parameter will free up memory at the G(R) construction and EM steps. 12 | 9. EM\_threshold. Run EM steps until no entry in X changes by more than this value. The parameter is chosen to be small by default to ensure convergence. Increasing the parameter modestly will improve run time. 13 | 10. template\_fraction. When computing median template length, subsample reads to this fraction. You only need about 10,000 proper pairs to get a good estimate. 14 | 15 | ## Generating new annotations 16 | If you wish to run L1-EM for another retrotransposon or for another model organism, you will need to generate a new annotation. 17 | 1. Create a bedfile with the following naming scheme: 18 | family.category.region.strand 19 | Where family is the name of the repeat family, 20 | category is 1 if the element has a promoter and 0 otherwise 21 | region is the genome region (chrom:start-stop) of the element 22 | strand is +/- depending on which strand the element falls on 23 | The bedfile must have the six required fields: chrom, start, stop, name, score, strand 24 | The start and stop coordinates should include 400 positions of flanking sequence on either end. 
25 | Exons overlapping the annotation can also be included. 26 | 2. Create a fasta file from your bed file and index it with bwa: 27 | ``` 28 | bedtools getfasta -s -name -fi reference.fa -bed annotation.bed > annotation.fa 29 | bwa index annotation.fa 30 | ``` 31 | 3. Update lines 27 and 28 to point toward your new annotation. 32 | 33 | ## Pipeline steps 34 | ### STEP 1: realign 35 | In this step reads that are not properly paired are extracted and realigned with bwa. Many aligners do not bother with highly redundant reads, so this step is included to ensure that LINE-1 aligning reads are identified. 36 | 37 | ### STEP 2: extract 38 | In this step, L1HS/L1PA reads are extracted. Any read pair for which either end overlaps an entry in the L1EM.400.bed annotation is considered. 39 | 40 | ### STEP 3: candidate alignments 41 | The extracted reads are aligned to L1EM.400.fa, all secondary alignments with up to L1EM_NM mismatches are found. The candidate alignments fastqs are split for parallelization. It is vitally important that all candidate alignments are identified. Missing some of these alignments will drastically hurt accuracy. For this reason bwa aln is used. Do not use bwa mem or STAR as these aligners do not provide a complete enumeration of secondary alignments for highly repetitive elements (like L1HS). 42 | 43 | ### STEP 4: G(R) matrix construction 44 | 45 | The bam files of candidate alignments are read by the script G\_of\_R.py. The likelihood of each candidate alignment is calculated and added to the G(R) matrix. 46 | 47 | The following options are additional parameters that can be accessed at this step: 48 | 1. -f/--flanking specifies the amount of flanking sequence in the annotation. If you created your own annotation with more or less than 400 bases of flanking sequence specify that here. 49 | 2. --as\_start. If you wish to change the TSS for antisense transcription do that here. 50 | 3. -w/--wiggle. 
Some proper LINE-1 transcripts start slightly before the annotation start of the 5'UTR. This parameter extends the annotated element this many bases in either direction (default is 20). 51 | 4. --min\_len. Puts a floor on transcript effective length to prevent cases where transcription of very short elements is over predicted. Default is 500. 52 | 5. --min\_exon\_len. Corresponding minimum effective length for exon annotations. Default is 100. 53 | 54 | ### STEP 5: Expectation maximization 55 | In this step, the expectation maximization algorithm is used to compute a maximum likelihood estimate of relative expression, using the G(R) matrix output in the previous step as input. 56 | The following options are additional parameters that can be accessed at this step: 57 | 1. -r/--report\_every. Write the estimate every n steps. 58 | 2. -m/--max\_nEMsteps. By default EM stops if convergence has not been achieved after 10000 steps. Change that value here. 59 | 60 | ### STEP 6: Writing results 61 | At completion, three tab delimited tables will be written. 62 | 1. full\_counts.txt: raw count estimates for each L1HS/L1PA\* element with any aligned read pairs 63 | 2. l1hs\_transcript\_counts.txt: expression estimates for L1HS elements, reported as raw counts 64 | 3. filter\_L1HS\_FPM.txt: L1HS whose expression is supported by at least 100 read pairs, reported as FPM (read pairs per million properly aligned) 65 | 66 | ### STEP 7: Clean up 67 | All the intermediate files are deleted at this step. Comment out these lines if you want to keep them. 68 | 69 | The rows of both files are L1 loci. 70 | 71 | For full\_counts.txt each of the five transcript types: 72 | only, runon, passive (sense), passive (antisense), antisense 73 | are reported. 74 | 75 | For l1hs\_transcript_counts.txt only proper transcription from L1HS elements starting at the 76 | 5' UTR is reported. 77 | 78 | The results are also written as pickle files to facilitate further analysis in python. 
To 79 | generate a python dictionary with keys being the transcript names and values being the 80 | relative expression: 81 | ``` 82 | X_est = dict(zip(pickle.load(open('names_final.pkl')),pickle.load(open('X_final.pkl')))) 83 | ``` 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /parameters.sh: -------------------------------------------------------------------------------- 1 | # Parameters 2 | export threads=16 #How many threads to use for samtools, bwa and L1EM 3 | export realignNM=3 #Number of mismatches allowed in bwa realignment 4 | export L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 5 | export NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 6 | export bwa_i=20 #bwa i parameter prevents indels near the edges of a read 7 | export error_prob=0.01 #Probability of a read error at a given position 8 | export max_start2start_len=500 #Max allowed template/fragment length 9 | export reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 10 | export EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 11 | export template_fraction=1 #Fraction of reads to consider when calculated median template length. 12 | -------------------------------------------------------------------------------- /run_L1EM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=3 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 
32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | # rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 
74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | done 87 | for name in *.fq1.* 88 | do reads1=$name 89 | reads2=$(echo $name|sed 's/fq1/fq2/g') 90 | ref=$L1EM_fa 91 | base=$(echo $name|sed 's/.fq1//g') 92 | $bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam & 93 | done 94 | wait 95 | 96 | # Make G_of_R matrix 97 | echo 'STEP 4: G(R) matrix construction' 98 | mkdir ../G_of_R 99 | cd ../G_of_R 100 | $python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 101 | medianinsert=$(head -1 ../baminfo.txt) 102 | for bam in ../split_fqs/*.bam 103 | do $python ${L1EM_code_dir}G_of_R.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 104 | done 105 | wait 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt 121 
| $python ${L1EM_utilities_dir}filtered_and_normalized_l1hs.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 122 | 123 | #Clean up 124 | echo 'STEP 7: Clean up' 125 | cp *final.pkl ../ 126 | cd .. 127 | 128 | # rm idL1reads/* 129 | # rmdir idL1reads 130 | # rm split_fqs/* 131 | # rmdir split_fqs 132 | # rm G_of_R/* 133 | # rmdir G_of_R 134 | # rm L1EM/* 135 | # rmdir L1EM 136 | -------------------------------------------------------------------------------- /run_L1EM_fortcga.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/L1HS.fa 4 | 5 | # Parameters 6 | threads=16 #How many threads to use for samtools, bwa and L1EM 7 | realignNM=3 #Number of mismatches allowed in bwa realignment 8 | L1EM_NM=2 # Number of mismatches allowed when enumerated candidate alignments 9 | NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment 10 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 11 | error_prob=0.01 #Probability of a read error at a given position 12 | max_start2start_len=500 #Max allowed template/fragment length 13 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 14 | EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 15 | template_fraction=0.0001 #Fraction of reads to consider when calculated median template length. 
16 | 17 | # If you need to specify paths to required packages 18 | bwa=$(which bwa) # version 0.7.17 tested 19 | samtools=$(which samtools) # version 1.9 tested 20 | python=$(which python) # use version 2.7 21 | 22 | # Command line arguments 23 | bamfile=$1 24 | L1EM_directory=$2 25 | L1HS=$3 26 | 27 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 28 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 29 | L1EM_code_dir=$L1EM_directory'/L1EM/' 30 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 31 | 32 | # Try to realign unaligned reads using bwa aln. 33 | echo 'STEP 1: realign' 34 | mkdir idL1reads 35 | cd idL1reads 36 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 37 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $L1HS unaligned.fq1 > 1.sai 38 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $L1HS unaligned.fq2 > 2.sai 39 | $bwa sampe $L1HS 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -f 2 -@ $threads - | $samtools sort -@ $threads - > realigned.bam 40 | 41 | # Extract L1HS/L1PA* aligning reads. 42 | echo 'STEP 2: extract' 43 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 44 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 45 | $samtools fastq realigned.bam -1 temp.fq1 -2 temp.fq2 46 | cat temp.fq1 >> L1.fq1 47 | cat temp.fq2 >> L1.fq2 48 | rm temp* 49 | 50 | # Split the L1 fastq files for parallel execution 51 | cd .. 52 | mkdir split_fqs 53 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 54 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 55 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 
56 | cd split_fqs 57 | 58 | # Generate candidate alignments 59 | echo 'STEP 3: candidate alignments' 60 | for name in *.fq1.* 61 | do reads1=$name 62 | reads2=$(echo $name|sed 's/fq1/fq2/g') 63 | ref=$L1EM_fa 64 | base=$(echo $name|sed 's/.fq1//g') 65 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 66 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 67 | done 68 | for name in *.fq1.* 69 | do reads1=$name 70 | reads2=$(echo $name|sed 's/fq1/fq2/g') 71 | ref=$L1EM_fa 72 | base=$(echo $name|sed 's/.fq1//g') 73 | $bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam & 74 | done 75 | wait 76 | 77 | # Make G_of_R matrix 78 | echo 'STEP 4: G(R) matrix construction' 79 | mkdir ../G_of_R 80 | cd ../G_of_R 81 | medianinsert=$($python ${L1EM_utilities_dir}median_template.py $bamfile $template_fraction) 82 | for bam in ../split_fqs/*.bam 83 | do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 84 | done 85 | wait 86 | 87 | # RUN EM 88 | echo 'STEP 5: Expectation maximization' 89 | mkdir ../L1EM/ 90 | cd ../L1EM/ 91 | ls ../G_of_R/*pk2 > G_of_R_list.txt 92 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 93 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 94 | 95 | #Write results as text file 96 | echo 'STEP 6: Writing results' 97 | 98 | $python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt 99 | $python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt 100 | 101 | #Clean up 102 | echo 'STEP 7: Clean up' 103 | cp *final.pkl ../ 104 | cd .. 
105 | 106 | #rm idL1reads/* 107 | #rmdir idL1reads 108 | #rm split_fqs/* 109 | #rmdir split_fqs 110 | #rm G_of_R/* 111 | #rmdir G_of_R 112 | #rm L1EM/* 113 | #rmdir L1EM 114 | 115 | -------------------------------------------------------------------------------- /run_L1EM_mm39.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=2 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=2 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. 
Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/mm39.L1EM.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/mm39.L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 
71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*10*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam 87 | samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam 88 | samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam 89 | rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai 90 | done 91 | 92 | # Make G_of_R matrix 93 | echo 'STEP 4: G(R) matrix construction' 94 | mkdir ../G_of_R 95 | cd ../G_of_R 96 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 97 | medianinsert=$(head -1 ../baminfo.txt) 98 | ls ../split_fqs/*.bam > list_of_bams.txt 99 | split -l $threads list_of_bams.txt list_of_bams.txt. 
100 | for bamlist in list_of_bams.txt.* 101 | do for bam in $(cat $bamlist) 102 | do $python ${L1EM_code_dir}G_of_R.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 103 | done 104 | wait 105 | done 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | $python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}filtered_and_normalized_active_l1md.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_active_L1Md_FPM.txt 121 | 122 | #Clean up 123 | echo 'STEP 7: Clean up' 124 | cp *final.pkl ../ 125 | cd .. 126 | 127 | # rm idL1reads/* 128 | # rmdir idL1reads 129 | # rm split_fqs/* 130 | # rmdir split_fqs 131 | # rm G_of_R/* 132 | # rmdir G_of_R 133 | # rm L1EM/* 134 | # rmdir L1EM 135 | -------------------------------------------------------------------------------- /run_L1EM_mm39_unstranded.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=2 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=2 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 
32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/mm39.L1EM.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/mm39.L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*10*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 
74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam 87 | samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam 88 | samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam 89 | rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai 90 | done 91 | 92 | # Make G_of_R matrix 93 | echo 'STEP 4: G(R) matrix construction' 94 | mkdir ../G_of_R 95 | cd ../G_of_R 96 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 97 | medianinsert=$(head -1 ../baminfo.txt) 98 | ls ../split_fqs/*.bam > list_of_bams.txt 99 | split -l $threads list_of_bams.txt list_of_bams.txt. 
100 | for bamlist in list_of_bams.txt.* 101 | do for bam in $(cat $bamlist) 102 | do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 103 | done 104 | wait 105 | done 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | $python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}filtered_and_normalized_active_l1md_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_active_L1Md_FPM.txt 121 | 122 | #Clean up 123 | echo 'STEP 7: Clean up' 124 | cp *final.pkl ../ 125 | cd .. 126 | 127 | # rm idL1reads/* 128 | # rmdir idL1reads 129 | # rm split_fqs/* 130 | # rmdir split_fqs 131 | # rm G_of_R/* 132 | # rmdir G_of_R 133 | # rm L1EM/* 134 | # rmdir L1EM 135 | -------------------------------------------------------------------------------- /run_L1EM_unstranded.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=3 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 
32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | # rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 
74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | done 87 | for name in *.fq1.* 88 | do reads1=$name 89 | reads2=$(echo $name|sed 's/fq1/fq2/g') 90 | ref=$L1EM_fa 91 | base=$(echo $name|sed 's/.fq1//g') 92 | $bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam & 93 | done 94 | wait 95 | 96 | # Make G_of_R matrix 97 | echo 'STEP 4: G(R) matrix construction' 98 | mkdir ../G_of_R 99 | cd ../G_of_R 100 | $python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 101 | medianinsert=$(head -1 ../baminfo.txt) 102 | for bam in ../split_fqs/*.bam 103 | do $python ${L1EM_code_dir}G_of_R_unstranded.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 104 | done 105 | wait 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > 
../l1hs_transcript_counts.txt 121 | $python ${L1EM_utilities_dir}filtered_and_normalized_l1hs_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 122 | 123 | #Clean up 124 | echo 'STEP 7: Clean up' 125 | cp *final.pkl ../ 126 | cd .. 127 | 128 | # rm idL1reads/* 129 | # rmdir idL1reads 130 | # rm split_fqs/* 131 | # rmdir split_fqs 132 | # rm G_of_R/* 133 | # rmdir G_of_R 134 | # rm L1EM/* 135 | # rmdir L1EM 136 | -------------------------------------------------------------------------------- /run_L1EM_unstranded_fromdocker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh parameters.sh /fullpathto/alignments.bam /fullpathto/hg38.fa 20 | 21 | # Command line arguments 22 | source $1 # was 'bash $1': running the parameters file in a child shell cannot set $threads, $realignNM, $bwa_i etc. for this script; it must be sourced 23 | bamfile=$2 24 | hg38=$3 25 | 26 | # Locations within L1EM directory (NOTE(review): bed/fa are at container root /annotation, unlike code dirs under /L1EM -- confirm against Dockerfile) 27 | L1EM_bed=/annotation/L1EM.400.bed 28 | L1EM_fa=/annotation/L1EM.400.fa 29 | L1EM_code_dir=/L1EM/L1EM/ 30 | L1EM_utilities_dir=/L1EM/utilities/ 31 | L1EM_CGC_dir=/L1EM/CGC/ 32 | 33 | # Try to realign unaligned reads using bwa aln.
34 | echo 'STEP 1: realign' 35 | mkdir idL1reads 36 | cd idL1reads 37 | samtools view -@ $threads -b -F 2 $bamfile | samtools sort -@ $threads -n - | samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 38 | bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 39 | bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 40 | bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | samtools view -b -@ $threads - | samtools sort -@ $threads - > realigned.bam 41 | samtools index realigned.bam 42 | 43 | # Extract L1HS/L1PA* aligning reads. 44 | echo 'STEP 2: extract' 45 | python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 46 | samtools sort -@ $threads -n temp.bam | samtools fastq - -1 L1.fq1 -2 L1.fq2 47 | python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 48 | samtools sort -@ $threads -n temp.bam | samtools fastq - -1 temp.fq1 -2 temp.fq2 49 | cat temp.fq1 >> L1.fq1 50 | cat temp.fq2 >> L1.fq2 51 | # rm temp* 52 | 53 | # Split the L1 fastq files for parallel execution 54 | cd .. 55 | mkdir split_fqs 56 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 57 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 58 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
59 | cd split_fqs 60 | 61 | # Generate candidate alignments 62 | echo 'STEP 3: candidate alignments' 63 | for name in *.fq1.* 64 | do reads1=$name 65 | reads2=$(echo $name|sed 's/fq1/fq2/g') 66 | ref=$L1EM_fa 67 | base=$(echo $name|sed 's/.fq1//g') 68 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 69 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 70 | done 71 | for name in *.fq1.* 72 | do reads1=$name 73 | reads2=$(echo $name|sed 's/fq1/fq2/g') 74 | ref=$L1EM_fa 75 | base=$(echo $name|sed 's/.fq1//g') 76 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | samtools view -bS - | samtools sort -n - > $base.aln.bam & 77 | done 78 | wait 79 | 80 | # Make G_of_R matrix 81 | echo 'STEP 4: G(R) matrix construction' 82 | mkdir ../G_of_R 83 | cd ../G_of_R 84 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 85 | medianinsert=$(head -1 ../baminfo.txt) 86 | for bam in ../split_fqs/*.bam 87 | do python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 88 | done 89 | wait 90 | 91 | # RUN EM 92 | echo 'STEP 5: Expectation maximization' 93 | mkdir ../L1EM/ 94 | cd ../L1EM/ 95 | ls ../G_of_R/*pk2 > G_of_R_list.txt 96 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 97 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 98 | 99 | #Write results as text file 100 | echo 'STEP 6: Writing results' 101 | 102 | python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 103 | python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt 104 | python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > ../l1hs_transcript_counts.txt 105 | python
${L1EM_utilities_dir}filtered_and_normalized_l1hs_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 106 | 107 | #Clean up 108 | echo 'STEP 7: Clean up' 109 | cp *final.pkl ../ 110 | cd .. 111 | 112 | # rm idL1reads/* 113 | # rmdir idL1reads 114 | # rm split_fqs/* 115 | # rmdir split_fqs 116 | # rm G_of_R/* 117 | # rmdir G_of_R 118 | # rm L1EM/* 119 | # rmdir L1EM 120 | -------------------------------------------------------------------------------- /run_L1EM_withlessmemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 
18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=3 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 
50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | $samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | # rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam 87 | samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam 88 | samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam 89 | rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai 90 | done 91 | 92 | # Make G_of_R matrix 93 | echo 'STEP 4: G(R) matrix construction' 94 | mkdir ../G_of_R 95 | cd ../G_of_R 96 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 97 | medianinsert=$(head -1 ../baminfo.txt) 98 | ls ../split_fqs/*.bam > list_of_bams.txt 99 | split -l $threads list_of_bams.txt list_of_bams.txt. 
100 | for bamlist in list_of_bams.txt.* 101 | do for bam in $(cat $bamlist) 102 | do python ${L1EM_code_dir}G_of_R.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 103 | done 104 | wait 105 | done 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt 121 | $python ${L1EM_utilities_dir}filtered_and_normalized_l1hs.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 122 | 123 | #Clean up 124 | echo 'STEP 7: Clean up' 125 | cp *final.pkl ../ 126 | cd .. 127 | 128 | # rm idL1reads/* 129 | # rmdir idL1reads 130 | # rm split_fqs/* 131 | # rmdir split_fqs 132 | # rm G_of_R/* 133 | # rmdir G_of_R 134 | # rm L1EM/* 135 | # rmdir L1EM 136 | -------------------------------------------------------------------------------- /utilities/L1EM_readpairs.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Report the total number of read pairs passed to L1EM 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | print(total) 34 | -------------------------------------------------------------------------------- /utilities/filtered_and_normalized_active_l1md.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from mm39 version of L1EM. 10 | 11 | Copyright (C) 2021 Wilson McKerrow 12 | 13 | This program is free software: you can redistribute it and/or modify 14 | it under the terms of the GNU General Public License as published by 15 | the Free Software Foundation, either version 3 of the License, or 16 | (at your option) any later version.
17 | 18 | This program is distributed in the hope that it will be useful, 19 | but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 | GNU General Public License for more details. 22 | 23 | You should have received a copy of the GNU General Public License 24 | along with this program. If not, see . 25 | 26 | """ 27 | 28 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 29 | 30 | proper_pairs_in_original_bam = float(sys.argv[3]) 31 | 32 | total = float(sys.argv[4]) 33 | 34 | written_seqs = set([]) 35 | 36 | print("family.category.locus.strand\tonly\t3prunon") 37 | 38 | names = list(X_est.keys()) 39 | 40 | for name in names: 41 | if 'L1MdTf_' in name or 'L1MdGf_' in name or 'L1MdA_I' in name or 'L1MdA_II' in name or 'L1MdA_III' in name: 42 | seq_name = '_'.join(name.split('_')[:-1]) 43 | if seq_name in written_seqs: 44 | continue 45 | written_seqs.add(seq_name) 46 | print_string = seq_name.split('(')[0] 47 | only_name = seq_name+'_only' 48 | if only_name not in X_est: 49 | X_est[only_name]=0.0 50 | only_pairs = total*X_est[only_name] 51 | runon_name = seq_name+'_3prunon' 52 | if runon_name not in X_est: 53 | X_est[runon_name]=0.0 54 | runon_pairs = total*X_est[runon_name] 55 | runthroughS_name = seq_name+'_senserunthrough' 56 | if runthroughS_name not in X_est: 57 | X_est[runthroughS_name]=0.0 58 | runthrough_pairs = total*X_est[runthroughS_name] 59 | runthroughA_name = seq_name+'_antisenserunthrough' 60 | if runthroughA_name not in X_est: 61 | X_est[runthroughA_name]=0.0 62 | runthrough_pairs += total*X_est[runthroughA_name] 63 | if (only_pairs+runon_pairs > 10*runthrough_pairs) & (only_pairs+runon_pairs>100): 64 | print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 65 | -------------------------------------------------------------------------------- 
/utilities/filtered_and_normalized_active_l1md_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2021 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1MdTf_' in name or 'L1MdGf_' in name or 'L1MdA_I' in name or 'L1MdA_II' in name or 'L1MdA_III' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | only_pairs = total*X_est[only_name] 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | runon_pairs = total*X_est[runon_name] 56 | runthrough_name = seq_name+'_runthrough' 57 | if runthrough_name not in X_est: 58 | X_est[runthrough_name]=0.0 59 | runthrough_pairs = total*X_est[runthrough_name] 60 | if (only_pairs+runon_pairs > 10*runthrough_pairs) & (only_pairs+runon_pairs>100): 61 | print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 62 | -------------------------------------------------------------------------------- /utilities/filtered_and_normalized_l1hs.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data.
11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | only_pairs = total*X_est[only_name] 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | runon_pairs = total*X_est[runon_name] 56 | runthroughS_name = seq_name+'_senserunthrough' 57 | if runthroughS_name not in X_est: 58 | X_est[runthroughS_name]=0.0 59 | runthrough_pairs = total*X_est[runthroughS_name] 60 | runthroughA_name = seq_name+'_antisenserunthrough' 61 | if runthroughA_name not in X_est: 62 | X_est[runthroughA_name]=0.0 63 | runthrough_pairs += total*X_est[runthroughA_name] 64 | if (only_pairs+runon_pairs > 3*runthrough_pairs) & (only_pairs+runon_pairs>100): 65 | 
print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 66 | -------------------------------------------------------------------------------- /utilities/filtered_and_normalized_l1hs_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | only_pairs = total*X_est[only_name] 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | runon_pairs = total*X_est[runon_name] 56 | runthrough_name = seq_name+'_runthrough' 57 | if runthrough_name not in X_est: 58 | X_est[runthrough_name]=0.0 59 | runthrough_pairs = total*X_est[runthrough_name] 60 | if (only_pairs+runon_pairs > 3*runthrough_pairs) & (only_pairs+runon_pairs>100): 61 | print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 62 | -------------------------------------------------------------------------------- /utilities/median_template.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pysam 3 | import random 4 | import numpy 5 | 6 | """ 7 | Estimate median template length of a bam file. 8 | 9 | Part of the L1-EM package. 10 | 11 | Copyright (C) 2019 Wilson McKerrow 12 | 13 | This program is free software: you can redistribute it and/or modify 14 | it under the terms of the GNU General Public License as published by 15 | the Free Software Foundation, either version 3 of the License, or 16 | (at your option) any later version. 
17 | 18 | This program is distributed in the hope that it will be useful, 19 | but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 | GNU General Public License for more details. 22 | 23 | You should have received a copy of the GNU General Public License 24 | along with this program. If not, see . 25 | 26 | """ 27 | 28 | bamfile = sys.argv[1] 29 | fraction = float(sys.argv[2]) 30 | 31 | tlens = list() 32 | 33 | for read in pysam.AlignmentFile(bamfile): 34 | if not read.is_unmapped and random.random() < fraction: 35 | tlens.append(read.template_length) 36 | 37 | print(numpy.median(numpy.abs(tlens))) 38 | -------------------------------------------------------------------------------- /utilities/read_or_pair_overlap_bed.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | 4 | """ 5 | Extract reads or pairs of reads that overlap a bed file. 6 | 7 | Part of the L1-EM package. 8 | 9 | Copyright (C) 2019 Wilson McKerrow 10 | 11 | This program is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with this program. If not, see . 
23 | 24 | """ 25 | 26 | def main(): 27 | bedfile = sys.argv[1] 28 | bamfile = sys.argv[2] 29 | outbamfile = sys.argv[3] 30 | if len(sys.argv) > 4: 31 | flanking = int(sys.argv[4]) 32 | else: 33 | flanking = 400 34 | if len(sys.argv) > 5: 35 | maxNM = int(sys.argv[5]) 36 | else: 37 | maxNM = 4 38 | 39 | inbam = pysam.AlignmentFile(bamfile,'rb') 40 | outbam = pysam.AlignmentFile(outbamfile,'wb',template=inbam) 41 | 42 | read_ids = set() 43 | for line in open(bedfile): 44 | chrom,start,stop = line.strip().split('\t')[:3] 45 | start = int(start)+flanking 46 | stop = int(stop)-flanking 47 | if chrom in inbam.references: 48 | for read in inbam.fetch(chrom,start,stop): 49 | if not read.is_unmapped: 50 | if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and (not read.has_tag('NM') or read.get_tag('NM')<=maxNM): 51 | read_ids.add(read.query_name) 52 | # if chrom[3:] in inbam.references: 53 | # for read in inbam.fetch(chrom[3:],start,stop): 54 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and read.get_tag('NM')<=3: 55 | # read_ids.add(read.query_name) 56 | # if '_' in chrom and chrom.split('_')[1].upper()+'.1' in inbam.references: 57 | # for read in inbam.fetch(chrom.split('_')[1].upper()+'.1',start,stop): 58 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and read.get_tag('NM')<=3: 59 | # read_ids.add(read.query_name) 60 | 61 | inbam.close() 62 | inbam = pysam.AlignmentFile(bamfile,'rb') 63 | 64 | for read in inbam: 65 | if read.query_name in read_ids: 66 | if not read.is_secondary and not read.is_supplementary: 67 | outbam.write(read) 68 | 69 | inbam.close() 70 | outbam.close() 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /utilities/report_l1_exp_counts.py: 
-------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 
24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon\tpassive_sense\tpassive_antisense\tantisense") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | print_string += '\t'+str(total*X_est[only_name]) 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | print_string += '\t'+str(total*X_est[runon_name]) 56 | senserunthrough_name = seq_name+'_senserunthrough' 57 | if senserunthrough_name not in X_est: 58 | X_est[senserunthrough_name]=0.0 59 | print_string += '\t'+str(total*X_est[senserunthrough_name]) 60 | antisenserunthrough_name = seq_name+'_antisenserunthrough' 61 | if antisenserunthrough_name not in X_est: 62 | X_est[antisenserunthrough_name]=0.0 63 | print_string += '\t'+str(total*X_est[antisenserunthrough_name]) 64 | antisense_name = seq_name+'_antisense' 65 | if antisense_name not in X_est: 66 | X_est[antisense_name]=0.0 67 | print_string += '\t'+str(total*X_est[antisense_name]) 68 | print(print_string) 69 | -------------------------------------------------------------------------------- /utilities/report_l1_exp_counts_clip.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tsense\tantisense") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | sense_name = seq_name+'_sense' 49 | if sense_name not in X_est: 50 | X_est[sense_name]=0.0 51 | print_string += '\t'+str(total*X_est[sense_name]) 52 | antisense_name = seq_name+'_antisense' 53 | if antisense_name not in X_est: 54 | X_est[antisense_name]=0.0 55 | print_string += '\t'+str(total*X_est[antisense_name]) 56 | print(print_string) 57 | --------------------------------------------------------------------------------
/utilities/report_l1_exp_counts_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | total = 0 30 | for line in open('G_of_R_list.txt'): 31 | G_of_R = pickle.load(open(line.strip(),'rb')) 32 | if G_of_R is not None: 33 | total += G_of_R.shape[0] 34 | 35 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 36 | 37 | written_seqs = set([]) 38 | 39 | print("family.category.locus.strand\tonly\t3prunon\tpassive") 40 | 41 | names = list(X_est.keys()) 42 | 43 | for name in names: 44 | if 'exon' not in name: 45 | seq_name = '_'.join(name.split('_')[:-1]) 46 | if seq_name in written_seqs: 47 | continue 48 | written_seqs.add(seq_name) 49 | print_string = seq_name.split('(')[0] 50 | only_name = seq_name+'_only' 51 | if only_name not in X_est: 52 | X_est[only_name]=0.0 53 | print_string += '\t'+str(total*X_est[only_name]) 54 | runon_name = seq_name+'_3prunon' 55 | if runon_name not in X_est: 56 | X_est[runon_name]=0.0 57 | print_string += '\t'+str(total*X_est[runon_name]) 58 | runthrough_name = seq_name+'_runthrough' 59 | if runthrough_name not in X_est: 60 | X_est[runthrough_name]=0.0 61 | print_string += '\t'+str(total*X_est[runthrough_name]) 62 | print(print_string) 63 | -------------------------------------------------------------------------------- /utilities/report_l1hs_transcription.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version.
16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | 49 | total_proper = 0.0 50 | total_passive = 0.0 51 | 52 | only_name = seq_name+'_only' 53 | if only_name not in X_est: 54 | X_est[only_name]=0.0 55 | print_string += '\t'+str(total*X_est[only_name]) 56 | total_proper += total*X_est[only_name] 57 | runon_name = seq_name+'_3prunon' 58 | if runon_name not in X_est: 59 | X_est[runon_name]=0.0 60 | print_string += '\t'+str(total*X_est[runon_name]) 61 | total_proper += total*X_est[runon_name] 62 | senserunthrough_name = seq_name+'_senserunthrough' 63 | if senserunthrough_name not in X_est: 64 | X_est[senserunthrough_name]=0.0 65 | total_passive += total*X_est[senserunthrough_name] 66 | antisenserunthrough_name = seq_name+'_antisenserunthrough' 67 | if antisenserunthrough_name not in X_est: 68 | X_est[antisenserunthrough_name]=0.0 69 | total_passive += total*X_est[antisenserunthrough_name] 70 | if total_proper > 3*total_passive: 71 | print(print_string) 72 |
-------------------------------------------------------------------------------- /utilities/report_l1hs_transcription_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 
24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R != None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | 49 | total_proper = 0.0 50 | total_passive = 0.0 51 | 52 | only_name = seq_name+'_only' 53 | if only_name not in X_est: 54 | X_est[only_name]=0.0 55 | print_string += '\t'+str(total*X_est[only_name]) 56 | total_proper += total*X_est[only_name] 57 | runon_name = seq_name+'_3prunon' 58 | if runon_name not in X_est: 59 | X_est[runon_name]=0.0 60 | print_string += '\t'+str(total*X_est[runon_name]) 61 | total_proper += total*X_est[runon_name] 62 | senserunthrough_name = seq_name+'_runthrough' 63 | if senserunthrough_name not in X_est: 64 | X_est[senserunthrough_name]=0.0 65 | total_passive += total*X_est[senserunthrough_name] 66 | if total_proper > 3*total_passive: 67 | print(print_string) 68 | --------------------------------------------------------------------------------