├── CGC ├── ORF1_list.txt ├── ORF2_list.txt ├── make_ORF1_and_intact_table.py ├── make_ORF1_and_intact_table_stranded.py ├── make_l1pa1to4table.py ├── make_l1pa1to4table_stranded.py ├── median_template_and_pairs.py ├── read_or_pair_overlap_bed_and_unmapped.py ├── report_l1_exp_counts.py ├── report_l1_exp_counts_unstranded.py ├── total_orf1_and_orf2.py └── total_orf1_and_orf2_stranded.py ├── Dockerfile ├── L1EM.yml ├── L1EM ├── G_of_R.py ├── G_of_R_single_unstranded.py ├── G_of_R_unstranded.py └── L1EM.py ├── LICENSE.txt ├── README.md ├── annotation ├── L1EM.400.bed └── mm39.L1EM.bed ├── generate_L1EM_fasta_and_index.sh ├── generate_mm39_L1EM_fasta_and_index.sh ├── manual.md ├── parameters.sh ├── run_L1EM.sh ├── run_L1EM_fortcga.sh ├── run_L1EM_mm39.sh ├── run_L1EM_mm39_unstranded.sh ├── run_L1EM_unstranded.sh ├── run_L1EM_unstranded_fromdocker.sh ├── run_L1EM_withlessmemory.sh └── utilities ├── L1EM_readpairs.py ├── filtered_and_normalized_active_l1md.py ├── filtered_and_normalized_active_l1md_unstranded.py ├── filtered_and_normalized_l1hs.py ├── filtered_and_normalized_l1hs_unstranded.py ├── median_template.py ├── read_or_pair_overlap_bed.py ├── report_l1_exp_counts.py ├── report_l1_exp_counts_clip.py ├── report_l1_exp_counts_unstranded.py ├── report_l1hs_transcription.py └── report_l1hs_transcription_unstranded.py /CGC/ORF1_list.txt: -------------------------------------------------------------------------------- 1 | L1HS.1.chrX:141421202-141427246 2 | L1HS.1.chr2:172315270-172321297 3 | L1HS.1.chr17:70458956-70464987 4 | L1HS.1.chr15:82882881-82888919 5 | L1HS.1.chr14:63116706-63122735 6 | L1HS.1.chr13:29641706-29647706 7 | L1HS.1.chr12:126299023-126305038 8 | L1PA3.1.chr12:13391606-13397632 9 | L1HS.1.chr11:95436216-95442246 10 | L1HS.1.chr10:98782941-98788971 11 | L1HS.1.chrX:11935296-11941314 12 | L1HS.1.chr7:111243515-111249546 13 | L1HS.1.chr7:96846650-96852680 14 | L1HS.1.chr7:66286853-66292884 15 | L1HS.1.chr7:49680245-49686300 16 | 
L1HS.1.chr6:24811657-24817706 17 | L1HS.1.chr5:109259387-109265418 18 | L1HS.1.chr5:104518587-104524616 19 | L1HS.1.chr4:136293494-136299546 20 | L1PA2.1.chr4:128213789-128219796 21 | L1HS.1.chr4:79966907-79972933 22 | L1HS.1.chr4:70328906-70334307 23 | L1HS.1.chr4:21159390-21165421 24 | L1HS.1.chr3:89460825-89466856 25 | L1PA2.1.chr3:81051389-81057413 26 | L1HS.1.chr1:237019467-237025494 27 | L1HS.1.chr1:180866811-180872843 28 | L1HS.1.chr1:84052389-84058406 29 | L1HS.1.chr1:104770247-104776278 30 | L1HS.1.chr6:86000000-86005073 31 | L1HS.1.chr22:28663283-28669315 32 | L1HS.1.chr11:78677772-78683802 33 | L1HS.1.chr10:19088601-19094618 34 | L1HS.1.chr9:90149604-90155634 35 | L1PA2.1.chr8:91558668-91564687 36 | L1HS.1.chr3:46783105-46789138 37 | L1HS.1.chr1:174590323-174596379 38 | L1HS.1.chr22:48985761-48991792 39 | L1HS.1.chr17:70544788-70550795 40 | L1HS.1.chrX:155516016-155522048 41 | L1HS.1.chrX:83059584-83065637 42 | L1HS.1.chr9:112798107-112804159 43 | L1HS.1.chr9:94113535-94119565 44 | L1HS.1.chr8:72875538-72881588 45 | L1HS.1.chr5:173402796-173408828 46 | L1HS.1.chr20:12801017-12807044 47 | L1HS.1.chr18:37819737-37825798 48 | L1HS.1.chr16:68583448-68589505 49 | L1HS.1.chr16:33952564-33958612 50 | L1HS.1.chrX:130517377-130523407 51 | L1HS.1.chrX:11707248-11713279 52 | L1HS.1.chr8:134070756-134076773 53 | L1HS.1.chr4:166569976-166576007 54 | L1HS.1.chr4:87347103-87353146 55 | L1HS.1.chr3:130628808-130634065 56 | L1HS.1.chr2:71411474-71417501 57 | L1HS.1.chr1:118852351-118858380 58 | L1HS.1.chr20:55859566-55865521 59 | L1HS.1.chr16:83637252-83643296 60 | L1HS.1.chr15:83450804-83456834 61 | L1HS.1.chr10:5245354-5251383 62 | L1HS.1.chr9:110791097-110797129 63 | L1HS.1.chr8:125582886-125588889 64 | L1HS.1.chr7:141920659-141926712 65 | L1HS.1.chr7:25041860-25047891 66 | L1HS.1.chr5:156061919-156067966 67 | L1HS.1.chr4:90675739-90681757 68 | L1HS.1.chr4:59078847-59084877 69 | L1HS.1.chr3:163236941-163242962 70 | L1HS.1.chr3:22050867-22053197 71 | 
L1HS.1.chr2:148188745-148194773 72 | L1HS.1.chr2:4733729-4739760 73 | L1HS.1.chr15:70729744-70735160 74 | L1HS.1.chr1:121532230-121538261 75 | L1HS.1.chr12:73283667-73289668 76 | L1HS.1.chr6:51874783-51880802 77 | L1HS.1.chr2:112503812-112509845 78 | L1HS.1.chr13:108510472-108516495 79 | L1HS.1.chr11:93136638-93142673 80 | L1HS.1.chr11:24327951-24334001 81 | L1HS.1.chr6:133020691-133026746 82 | L1HS.1.chr4:98592435-98598463 83 | L1HS.1.chr4:23614771-23620793 84 | L1HS.1.chr3:159095379-159101394 85 | L1HS.1.chr16:9584490-9590522 86 | L1HS.1.chr10:33510845-33516876 87 | L1HS.1.chrX:106469285-106475319 88 | L1HS.1.chr4:79704552-79710581 89 | L1HS.1.chr3:158019676-158025704 90 | L1PA2.1.chr3:63211708-63217714 91 | L1HS.1.chr2:166988454-166994509 92 | L1HS.1.chr13:31302314-31308370 93 | L1HS.1.chr12:74874868-74880901 94 | L1HS.1.chr7:30439242-30445274 95 | L1HS.1.chr6:72988654-72994686 96 | L1HS.1.chr4:166755895-166761908 97 | L1HS.1.chr4:79937715-79943746 98 | L1HS.1.chr2:102566355-102572385 99 | L1PA2.1.chr18:59403939-59409970 100 | L1HS.1.chr12:3500000-3505228 101 | L1HS.1.chr11:93420986-93427031 102 | L1HS.1.chr11:90400067-90406098 103 | L1HS.1.chr11:36551606-36557636 104 | L1HS.1.chr8:128453002-128459020 105 | L1HS.1.chr5:166966760-166972815 106 | L1HS.1.chr5:146609485-146615534 107 | L1HS.1.chr3:109199872-109205903 108 | L1PA3.1.chrX:64252345-64258375 109 | L1HS.1.chr6:2417774-2423803 110 | L1HS.1.chr5:102189483-102194435 111 | L1PA2.1.chr5:39787652-39793671 112 | L1HS.1.chr4:169515501-169521532 113 | L1HS.1.chr4:78105735-78111765 114 | L1PA2.1.chr4:55619153-55625181 115 | L1HS.1.chr3:136479056-136485103 116 | L1HS.1.chr3:116359999-116366026 117 | L1PA2.1.chr2:106130892-106136925 118 | L1PA2.1.chr1:71888203-71894235 119 | L1HS.1.chr15:87509891-87515920 120 | L1HS.1.chr10:109812437-109818457 121 | L1HS.1.chr10:105775520-105781551 122 | L1PA2.1.chrX:42888370-42894396 123 | L1HS.1.chr7:93787624-93793679 124 | L1HS.1.chr5:13416497-13422525 125 | 
L1HS.1.chr2:11000000-11002136 126 | L1PA2.1.chr10:78088450-78094479 127 | L1HS.1.chrX:73380991-73387013 128 | L1HS.1.chr7:97613656-97619688 129 | L1HS.1.chr5:79778884-79784938 130 | L1HS.1.chr4:61939927-61945962 131 | L1PA2.1.chr2:43660471-43666500 132 | L1PA2.1.chr1:93790652-93796681 133 | L1HS.1.chr1:68736693-68740136 134 | L1HS.1.chr14:30684809-30690837 135 | L1HS.1.chr12:54788573-54794627 136 | L1HS.1.chrX:26314417-26320446 137 | L1HS.1.chr6:112703745-112709778 138 | L1HS.1.chr6:70010347-70016552 139 | L1PA2.1.chr6:44870634-44876665 140 | L1HS.1.chr5:119684785-119690814 141 | L1HS.1.chr5:32824614-32827992 142 | L1HS.1.chr2:193212420-193218448 143 | L1HS.1.chr20:7116194-7122199 144 | L1PA2.1.chr11:116570827-116576273 145 | L1HS.1.chr11:82155865-82161891 146 | L1HS.1.chrY:5606144-5612199 147 | L1HS.1.chr4:78347980-78354013 148 | L1HS.1.chr4:15841546-15847572 149 | L1PA2.1.chr3:43064774-43070790 150 | L1HS.1.chr1:67078891-67084915 151 | L1HS.1.chr18:5684668-5687891 152 | L1HS.1.chr16:18821266-18827058 153 | L1HS.1.chr14:79308933-79314061 154 | L1HS.1.chr11:109177494-109183526 155 | L1HS.1.chr9:95697585-95703604 156 | L1HS.1.chr7:113776122-113782152 157 | L1HS.1.chr5:15906515-15912550 158 | L1HS.1.chr4:19077911-19083929 159 | L1HS.1.chr3:90169567-90175598 160 | L1HS.1.chr20:23426108-23432140 161 | L1HS.1.chr16:54042096-54048145 162 | L1HS.1.chrY:4948913-4954938 163 | L1HS.1.chrX:66180696-66186728 164 | L1HS.1.chrX:54118685-54124744 165 | L1HS.1.chr5:152886441-152892473 166 | L1HS.1.chr5:102131356-102137385 167 | L1HS.1.chr4:74717539-74723587 168 | L1HS.1.chr2:169248623-169254656 169 | L1HS.1.chr1:218009227-218015252 170 | L1PA2.1.chr1:176256085-176262110 171 | L1HS.1.chr1:34566055-34572105 172 | L1PA2.1.chr11:87047304-87053192 173 | L1HS.1.chrX:64013267-64019286 174 | L1HS.1.chr5:58384174-58390206 175 | L1HS.1.chr5:34147845-34154031 176 | L1HS.1.chr4:52538471-52544498 177 | L1HS.1.chr1:80939203-80945257 178 | L1HS.1.chr18:70746549-70752581 179 | 
L1PA2.1.chr15:71174139-71180152 180 | L1HS.1.chr7:110707004-110713024 181 | L1HS.1.chr6:117102131-117108163 182 | L1HS.1.chr4:91978211-91984413 183 | L1HS.1.chr1:197707714-197713746 184 | L1PA2.1.chr15:58125731-58131761 185 | L1HS.1.chrX:119435468-119441493 186 | L1HS.1.chr5:160709608-160715639 187 | L1HS.1.chr4:119948726-119954758 188 | L1HS.1.chr4:14755114-14761144 189 | L1HS.1.chr3:77763677-77769678 190 | L1HS.1.chr2:175481951-175487994 191 | L1HS.1.chr2:16593725-16599758 192 | L1HS.1.chr12:69773410-69779441 193 | L1PA2.1.chr9:120055235-120061264 194 | L1PA2.1.chr8:97295603-97301657 195 | L1PA2.1.chr8:58914690-58920717 196 | L1HS.1.chr7:63148831-63154859 197 | L1HS.1.chr7:61837998-61844054 198 | L1HS.1.chr4:111894801-111900831 199 | L1HS.1.chr3:103556537-103562569 200 | L1HS.1.chr3:79129777-79133955 201 | L1HS.1.chr3:26398017-26404045 202 | L1PA2.1.chr3:12028021-12033291 203 | L1HS.1.chr2:213567231-213573262 204 | L1HS.1.chr1:196219370-196225402 205 | L1HS.1.chr15:54926081-54932099 206 | L1HS.1.chr11:99602687-99608113 207 | L1HS.1.chr11:31315654-31321680 208 | L1HS.1.chr7:70197328-70203357 209 | L1PA2.1.chr3:137633714-137639732 210 | L1HS.1.chr2:196905587-196911636 211 | L1HS.1.chr2:86655238-86661268 212 | L1HS.1.chr1:187343764-187349794 213 | L1HS.1.chr1:71513698-71519742 214 | L1PA2.1.chr16:61801455-61807489 215 | L1PA2.1.chr11:14715908-14721938 216 | L1HS.1.chrX:83542396-83548420 217 | L1HS.1.chr1:193717837-193723892 218 | L1HS.1.chr1:113497220-113500000 219 | L1HS.1.chr1:86679080-86685111 220 | L1HS.1.chr16:16840517-16846556 221 | L1HS.1.chr5:133583288-133589299 222 | L1PA2.1.chr5:65164017-65170048 223 | L1HS.1.chr1:209913771-209919823 224 | L1PA2.1.chr12:112621197-112627228 225 | L1PA2.1.chr12:92313998-92320023 226 | L1HS.1.chr12:38799646-38805673 227 | L1PA2.1.chrY:17060920-17066963 228 | L1HS.1.chrX:96057824-96063842 229 | L1HS.1.chrX:50019456-50025505 230 | L1HS.1.chr5:152076868-152082891 231 | L1HS.1.chr3:108749400-108755425 232 | 
L1HS.1.chr18:47660373-47666427 233 | L1PA2.1.chr13:39000817-39006875 234 | L1HS.1.chr12:51562631-51568657 235 | L1PA2.1.chr8:93405812-93411825 236 | L1HS.1.chr5:166141191-166145692 237 | L1HS.1.chr5:153070982-153077008 238 | L1HS.1.chr5:81616090-81622140 239 | L1HS.1.chr4:93638307-93644337 240 | L1HS.1.chr2:153007766-153013796 241 | L1PA2.1.chrX:98687494-98693514 242 | L1HS.1.chr5:177772245-177778274 243 | L1PA2.1.chr3:158634523-158640540 244 | L1HS.1.chrX:23238516-23244575 245 | L1PA7.1.chr9:113437560-113443590 246 | L1HS.1.chr9:83049539-83055571 247 | L1PA2.1.chr8:40432212-40438240 248 | L1HS.1.chr6:156324980-156331010 249 | L1HS.1.chr4:93608283-93614338 250 | L1HS.1.chr4:57562316-57568347 251 | L1HS.1.chr18:50343959-50349987 252 | L1HS.1.chr18:535701-541755 253 | L1PA2.1.chr11:94232524-94238528 254 | L1HS.1.chrY:9941130-9947151 255 | L1PA2.1.chr6:99823597-99829594 256 | L1PA2.1.chr4:143100259-143106289 257 | L1HS.1.chr4:106571057-106577070 258 | L1PA2.1.chr21:35493766-35499791 259 | L1HS.1.chr14:70547290-70553322 260 | L1PA2.1.chr9:101102144-101108174 261 | L1HS.1.chr8:135875862-135881890 262 | L1PA2.1.chr8:68362478-68367911 263 | L1PA2.1.chr3:26384735-26390767 264 | L1HS.1.chr1:247687173-247693204 265 | L1PA2.1.chr1:174377791-174383815 266 | L1HS.1.chr12:44108220-44114234 267 | L1PA2.1.chr10:117832895-117838887 268 | L1HS.1.chr9:28111895-28117865 269 | L1PA2.1.chr3:53365276-53371325 270 | L1PA2.1.chr2:222149601-222155632 271 | L1PA2.1.chr2:165485934-165491963 272 | L1PA2.1.chr11:60532161-60538190 273 | L1HS.1.chrX:56695884-56701916 274 | L1HS.1.chr8:136438074-136444105 275 | L1PA2.1.chr5:152340020-152346052 276 | L1PA2.1.chr4:27375687-27381719 277 | L1PA2.1.chr1:242045561-242051585 278 | L1PA2.1.chr1:192500584-192506612 279 | L1PA2.1.chr1:78845456-78851474 280 | L1HS.1.chr17:9615985-9622015 281 | L1HS.1.chrX:142477849-142483853 282 | L1PA2.1.chrX:50060143-50066175 283 | L1PA3.1.chr8:61115375-61121394 284 | L1PA2.1.chr8:10932425-10938427 285 | 
L1HS.1.chr3:3963076-3969110 286 | L1PA2.1.chr2:158522617-158528649 287 | L1PA2.1.chr15:86528094-86534125 288 | L1PA2.1.chr6:104489393-104495424 289 | L1PA2.1.chr2:124139775-124145807 290 | L1PA2.1.chr2:72063975-72069997 291 | L1HS.1.chr18:62906292-62912314 292 | L1HS.1.chr11:125536609-125542640 293 | L1PA2.1.chr10:20751667-20757692 294 | L1PA2.1.chr9:19536200-19542230 295 | L1HS.1.chr8:91522091-91528121 296 | L1PA2.1.chr7:23035734-23039855 297 | L1PA2.1.chr18:8057452-8063463 298 | L1PA2.1.chr8:110952164-110957638 299 | L1PA2.1.chr4:75126805-75132838 300 | L1PA2.1.chr20:8595101-8601127 301 | L1PA2.1.chr4:119274113-119280127 302 | L1PA2.1.chr2:157368535-157374566 303 | L1PA2.1.chr20:42206269-42212317 304 | L1PA2.1.chr18:57719248-57725264 305 | L1PA2.1.chr18:24619042-24625072 306 | L1PA2.1.chr17:32887137-32893184 307 | L1PA2.1.chr14:26629268-26635299 308 | L1PA2.1.chrX:34249185-34253913 309 | L1PA2.1.chr8:98614445-98620471 310 | L1HS.1.chr7:141062014-141068042 311 | L1HS.1.chr5:111302238-111308262 312 | L1PA2.1.chr5:93213145-93219176 313 | L1PA2.1.chr1:40365613-40370869 314 | L1PA2.1.chr13:40734919-40740945 315 | L1PA2.1.chr13:82045349-82051380 316 | L1PA2.1.chr8:98260275-98266293 317 | L1HS.1.chr1:187597671-187603699 318 | L1HS.1.chr15:81995166-82000000 319 | L1HS.1.chr14:51794601-51800632 320 | L1HS.1.chr10:108310130-108316139 321 | L1HS.1.chr8:104739851-104745873 322 | L1PA2.1.chr6:69515143-69521169 323 | L1PA2.1.chr3:119001470-119007490 324 | L1PA2.1.chr18:39565592-39571600 325 | L1PA2.1.chr11:49775119-49781151 326 | L1PA2.1.chr4:100000524-100006553 327 | L1PA2.1.chr3:103220448-103226476 328 | L1PA2.1.chr2:219931818-219937838 329 | L1PA2.1.chr2:173699375-173705410 330 | L1PA2.1.chr15:44252034-44258049 331 | L1PA2.1.chr12:61941440-61947489 332 | L1PA2.1.chr10:80722509-80728544 333 | L1PA2.1.chrX:36465194-36471217 334 | L1PA2.1.chr9:100527228-100533251 335 | L1PA2.1.chr6:8770471-8776512 336 | L1PA2.1.chr4:160574032-160580085 337 | L1PA2.1.chr1:65558564-65564576 338 
| L1PA2.1.chr14:58032539-58038561 339 | L1PA2.1.chr13:73640527-73646551 340 | L1PA2.1.chrY:13179085-13185115 341 | L1PA2.1.chrX:130958931-130964957 342 | L1PA2.1.chr7:136414180-136420210 343 | L1PA3.1.chr5:93261035-93267065 344 | L1PA2.1.chr2:128858984-128865016 345 | L1PA2.1.chr1:91211587-91216947 346 | L1PA2.1.chr16:48768571-48774603 347 | L1PA2.1.chrX:76005216-76007849 348 | L1PA2.1.chr8:35528045-35534071 349 | L1PA3.1.chr7:37612053-37618072 350 | L1PA3.1.chr2:122046204-122052249 351 | L1PA2.1.chr2:122012673-122018708 352 | L1PA2.1.chr18:69449559-69455072 353 | L1PA2.1.chr18:24710814-24716841 354 | L1PA2.1.chr12:64195587-64201638 355 | L1PA2.1.chr8:135259101-135265130 356 | L1PA2.1.chr7:141032606-141038609 357 | L1PA2.1.chr3:141757129-141763153 358 | L1PA2.1.chr3:111018716-111024745 359 | L1HS.1.chr2:230337069-230342513 360 | L1HS.1.chr14:45477110-45483169 361 | L1HS.1.chr12:55096256-55102283 362 | L1HS.1.chr3:18516080-18520244 363 | L1PA2.1.chr2:174404458-174410482 364 | L1HS.1.chr19:43864494-43867300 365 | L1PA2.1.chrX:117873996-117880023 366 | L1PA2.1.chrX:107364336-107370349 367 | L1PA2.1.chr9:1223881-1229900 368 | L1PA2.1.chr6:9966044-9972049 369 | L1PA2.1.chr14:30363537-30369568 370 | L1PA2.1.chr12:57646479-57652498 371 | L1PA2.1.chrX:111588704-111594727 372 | L1PA3.1.chr2:799542-805567 373 | L1PA2.1.chr17:55501716-55507741 374 | L1PA3.1.chr12:88846991-88853008 375 | L1PA2.1.chr12:23070636-23076652 376 | L1PA2.1.chrX:19142668-19148700 377 | L1HS.1.chr9:20655632-20658802 378 | L1PA2.1.chr7:16216428-16222457 379 | L1HS.1.chr3:82337499-82339442 380 | L1PA7.1.chr3:27815983-27821983 381 | L1PA2.1.chr2:196521488-196527500 382 | L1PA2.1.chr16:60522745-60528760 383 | L1PA2.1.chr11:75748616-75754649 384 | L1PA2.1.chrX:104026871-104032902 385 | L1PA2.1.chr9:22348829-22354859 386 | L1PA2.1.chr8:1607577-1613555 387 | L1PA2.1.chr6:113531117-113537147 388 | L1PA2.1.chr6:39454557-39460534 389 | L1PA2.1.chr2:133910480-133916504 390 | L1PA2.1.chr17:12449903-12455932 391 | 
L1PA2.1.chr8:119159398-119165410 392 | L1PA2.1.chr6:113226487-113232515 393 | L1PA2.1.chr3:65509292-65515316 394 | L1PA2.1.chr18:60029678-60035707 395 | L1PA3.1.chr12:47426904-47432928 396 | L1PA2.1.chr10:106844583-106850610 397 | L1PA2.1.chrX:74788952-74795000 398 | L1PA2.1.chrX:65614680-65620735 399 | L1PA2.1.chr7:15547309-15553333 400 | L1PA2.1.chr3:111556203-111562234 401 | L1PA2.1.chr12:55484008-55490018 402 | L1PA2.1.chr8:74942646-74948678 403 | L1PA3.1.chr6:9810750-9816777 404 | L1PA2.1.chr4:186110393-186116420 405 | L1PA2.1.chr2:211219320-211225344 406 | L1PA2.1.chr2:182025225-182031239 407 | L1PA2.1.chr2:62835478-62841498 408 | L1PA2.1.chr12:55344249-55350274 409 | L1PA2.1.chr11:55685638-55691665 410 | L1PA2.1.chr10:111583927-111589948 411 | L1PA2.1.chr8:25730343-25736354 412 | L1PA2.1.chr2:30904198-30910223 413 | L1PA3.1.chr1:49006012-49012044 414 | L1PA2.1.chr13:40356290-40362321 415 | L1PA2.1.chr11:100475720-100481744 416 | L1PA2.1.chr6:100196461-100202490 417 | L1PA2.1.chr20:53503851-53509874 418 | L1PA2.1.chrX:100550262-100555448 419 | L1PA2.1.chrX:79765605-79771625 420 | L1PA2.1.chrX:47783671-47789697 421 | L1PA3.1.chrX:14103920-14109975 422 | L1PA2.1.chr9:119621304-119627350 423 | L1PA2.1.chr7:34169569-34175587 424 | L1HS.1.chr5:123933969-123935867 425 | L1PA2.1.chr5:78866805-78872828 426 | L1PA2.1.chr21:17376038-17381930 427 | L1PA2.1.chr8:35171472-35177508 428 | L1PA2.1.chr2:195067521-195073543 429 | L1PA2.1.chr2:191761849-191767876 430 | L1PA2.1.chr1:58269013-58275032 431 | L1PA2.1.chr15:83383651-83389616 432 | L1PA2.1.chr8:15555981-15562028 433 | L1PA2.1.chr6:156361254-156367276 434 | L1PA2.1.chr4:11235217-11241254 435 | L1PA2.1.chr8:83447142-83453169 436 | L1PA3.1.chr3:70285056-70291083 437 | L1PA2.1.chr22:32294665-32300684 438 | L1PA2.1.chr18:28416544-28422561 439 | L1PA2.1.chrX:117727933-117733746 440 | L1PA2.1.chr7:122278915-122284945 441 | L1PA2.1.chr7:111445694-111451725 442 | L1PA3.1.chr1:159452953-159458976 443 | 
L1PA2.1.chr13:42424880-42430912 444 | L1PA2.1.chrY:7249559-7255517 445 | L1PA2.1.chr9:70199056-70205081 446 | L1PA2.1.chr8:120644161-120650187 447 | L1PA4.1.chr5:90816413-90822442 448 | L1PA3.1.chr4:167343516-167349561 449 | L1PA3.1.chr4:11143617-11149645 450 | L1PA2.1.chr2:83768302-83774332 451 | L1PA2.1.chr1:115147821-115153959 452 | L1PA3.1.chr1:84228930-84234940 453 | L1HS.1.chr7:12497211-12500000 454 | L1PA3.1.chr6:102752889-102758907 455 | L1PA2.1.chr17:32678257-32684272 456 | L1PA2.1.chr15:93675399-93681428 457 | L1PA2.1.chr12:85312419-85318459 458 | L1PA3.1.chr10:120234299-120240325 459 | L1PA2.1.chr8:84419545-84425573 460 | L1PA3.1.chr6:48737348-48743377 461 | L1PA2.1.chr4:141877346-141883376 462 | L1PA3.1.chr3:40902374-40908432 463 | L1PA2.1.chr12:102827177-102833208 464 | L1PA2.1.chr12:88708857-88714885 465 | L1PA2.1.chr10:84756878-84762878 466 | L1PA2.1.chrX:150417926-150421451 467 | L1PA2.1.chr18:68784834-68790853 468 | L1PA2.1.chr16:36071922-36077950 469 | L1PA4.1.chr8:75278367-75284402 470 | L1PA2.1.chr7:85249854-85255879 471 | L1PA2.1.chr2:209746180-209752208 472 | L1PA3.1.chr2:72015949-72021960 473 | L1PA2.1.chr1:163639993-163646040 474 | L1PA2.1.chr15:81870667-81876699 475 | L1PA2.1.chr14:40326207-40332209 476 | L1PA2.1.chr9:14663995-14670015 477 | L1PA2.1.chr21:33925606-33931606 478 | L1PA3.1.chr11:4170252-4176276 479 | L1PA2.1.chrX:86969631-86975651 480 | L1PA2.1.chr3:122041911-122047938 481 | L1PA2.1.chr3:34413126-34419172 482 | L1PA2.1.chr3:55046104-55052129 483 | L1PA2.1.chr1:223395534-223401557 484 | L1PA2.1.chr15:49778509-49784518 485 | L1HS.1.chrX:92254241-92256469 486 | L1PA3.1.chr8:59168655-59174679 487 | L1PA2.1.chr7:96552195-96558214 488 | L1PA3.1.chr7:22528296-22534331 489 | L1PA2.1.chr6:82389419-82395425 490 | L1PA2.1.chr5:64709034-64715065 491 | L1PA3.1.chr5:26476048-26482063 492 | L1PA2.1.chr19:55822401-55828429 493 | L1PA2.1.chr6:141814814-141820842 494 | L1PA3.1.chr3:97904737-97910773 495 | L1PA3.1.chr11:32941250-32947256 496 | 
L1PA2.1.chr8:101678069-101684093 497 | L1PA3.1.chr8:87223230-87229276 498 | L1PA2.1.chr4:64859153-64865171 499 | L1PA2.1.chr20:53472644-53478653 500 | L1PA2.1.chr10:117079038-117085063 501 | L1PA3.1.chrX:80405241-80411257 502 | L1PA2.1.chr9:28348134-28354162 503 | L1PA3.1.chr6:44754479-44760501 504 | L1PA2.1.chr5:30817969-30823996 505 | L1PA2.1.chr2:158351231-158357242 506 | L1PA2.1.chr1:177633927-177639946 507 | L1PA2.1.chr18:34552378-34558395 508 | L1PA2.1.chr15:20311030-20317051 509 | L1PA3.1.chr10:35477-41492 510 | L1PA2.1.chrX:113337822-113343853 511 | L1PA3.1.chr8:129324372-129329792 512 | L1PA2.1.chr6:86806663-86812682 513 | L1PA2.1.chr6:22174840-22180874 514 | L1PA2.1.chr5:139005423-139011486 515 | L1PA2.1.chr4:75401194-75407226 516 | L1PA2.1.chr18:34331115-34337159 517 | L1PA2.1.chr15:81668513-81674504 518 | L1PA2.1.chr11:26965118-26971134 519 | L1PA2.1.chr8:40647061-40653096 520 | L1PA2.1.chr7:91903682-91909706 521 | L1PA3.1.chr7:34906239-34912252 522 | L1PA3.1.chr6:133142073-133148104 523 | L1PA2.1.chr6:112813021-112819047 524 | L1PA2.1.chr6:91047151-91053161 525 | L1PA2.1.chr5:78452185-78458210 526 | L1PA3.1.chr4:53564637-53570664 527 | L1PA3.1.chr3:164702605-164708638 528 | L1PA2.1.chr3:58816278-58822304 529 | L1PA3.1.chr2:188123561-188129537 530 | L1PA3.1.chr18:35807-41823 531 | L1PA2.1.chr14:57112210-57118214 532 | L1PA2.1.chr8:126313241-126319040 533 | L1PA3.1.chr4:93589053-93595080 534 | L1PA2.1.chr2:153719447-153725466 535 | L1PA2.1.chrX:88207885-88213919 536 | L1PA3.1.chr5:133180711-133186159 537 | L1PA3.1.chr3:176827330-176833078 538 | L1PA2.1.chr17:61110229-61116238 539 | L1PA3.1.chr9:9931213-9937220 540 | L1PA2.1.chr7:113742900-113748900 541 | L1PA2.1.chr3:53921747-53927805 542 | L1PA2.1.chr5:123236611-123242616 543 | L1PA2.1.chr1:30438491-30444125 544 | L1PA2.1.chr8:137558584-137564612 545 | L1PA2.1.chr5:58552269-58558283 546 | L1PA2.1.chr3:195087672-195093677 547 | L1PA2.1.chr16:48015465-48021489 548 | L1PA3.1.chr14:88443481-88448908 549 | 
L1PA3.1.chrX:125608955-125614969 550 | L1PA3.1.chr6:49791502-49797511 551 | L1PA2.1.chr5:43717953-43723974 552 | L1PA2.1.chr16:21042672-21048703 553 | L1PA2.1.chr14:53266406-53271579 554 | L1HS.1.chr13:76612823-76618851 555 | L1PA2.1.chr10:30948036-30953410 556 | L1HS.1.chr7:10168424-10171419 557 | L1PA2.1.chr5:103046752-103052769 558 | L1PA2.1.chr11:42429263-42435286 559 | L1PA3.1.chr9:79177278-79183283 560 | L1PA2.1.chr20:21910853-21916881 561 | L1PA2.1.chr15:100417071-100423096 562 | L1PA2.1.chr9:97580993-97587031 563 | L1PA2.1.chr6:82558403-82564423 564 | L1PA2.1.chr4:167276135-167282161 565 | L1PA2.1.chr11:37240712-37246716 566 | L1PA3.1.chr10:19581231-19587252 567 | L1PA2.1.chr8:120348977-120354404 568 | L1PA3.1.chr4:175019740-175025751 569 | L1PA2.1.chr3:156278973-156284990 570 | L1PA3.1.chr10:31469210-31475223 571 | L1PA2.1.chrX:110816764-110822773 572 | L1PA2.1.chr7:84062409-84068436 573 | L1PA2.1.chr4:104687024-104692901 574 | L1PA2.1.chr2:22984153-22990168 575 | L1PA2.1.chr1:247888322-247894338 576 | L1HS.1.chr1:216485645-216487614 577 | L1PA2.1.chrX:152235997-152242028 578 | L1PA2.1.chrX:87116306-87122337 579 | L1PA3.1.chr20:40619322-40625356 580 | L1PA3.1.chr15:51031621-51037649 581 | L1PA3.1.chr16:86233295-86239310 582 | L1PA3.1.chr10:126676471-126682458 583 | L1PA3.1.chr9:87041882-87047900 584 | L1PA3.1.chr8:86586338-86592366 585 | L1PA2.1.chr8:74866519-74872545 586 | L1PA2.1.chr5:16335410-16341440 587 | L1PA2.1.chr1:185251798-185257805 588 | L1PA2.1.chr5:67736897-67742931 589 | L1PA2.1.chr5:42734714-42740780 590 | L1PA2.1.chr4:104786988-104793014 591 | L1PA3.1.chrX:126104124-126110137 592 | L1PA2.1.chr16:63388077-63394106 593 | L1PA3.1.chr14:43698653-43704668 594 | L1PA2.1.chr9:31294065-31300087 595 | L1PA2.1.chr8:72147447-72153464 596 | L1PA3.1.chr5:79095899-79101910 597 | L1PA3.1.chr3:168764449-168770163 598 | L1PA5.1.chr6:30247968-30253370 599 | L1PA3.1.chr5:152762660-152768698 600 | L1PA3.1.chr8:31851611-31857633 601 | 
L1PA2.1.chr7:93586902-93591767 602 | L1PA3.1.chr6:6185423-6191251 603 | L1PA2.1.chr14:40348058-40354070 604 | L1PA3.1.chr9:4605799-4611811 605 | L1PA3.1.chr2:117649184-117655215 606 | L1PA2.1.chr1:98588661-98592036 607 | L1PA2.1.chr4:14480751-14486827 608 | L1PA2.1.chr7:50898842-50904874 609 | L1PA3.1.chr3:50973137-50979172 610 | L1PA3.1.chr15:87463738-87469749 611 | L1PA2.1.chr4:75270579-75276607 612 | L1PA2.1.chr11:108260626-108266643 613 | L1PA2.1.chr18:7398117-7403452 614 | L1PA3.1.chr6:160679719-160685751 615 | L1PA3.1.chr5:117338253-117344286 616 | L1PA2.1.chr4:132631286-132637304 617 | L1PA4.1.chr11:88986995-88993027 618 | L1PA2.1.chr8:87624830-87630833 619 | L1PA3.1.chr4:138881278-138887439 620 | L1PA3.1.chr3:159392310-159398352 621 | L1PA2.1.chr2:208811621-208817648 622 | L1HS.1.chr10:37995443-38000000 623 | L1PA2.1.chr2:124407870-124413893 624 | L1PA2.1.chr14:43597900-43602088 625 | L1PA3.1.chrX:69676829-69682848 626 | L1PA3.1.chrX:29981730-29987885 627 | L1PA2.1.chr9:32729016-32735047 628 | L1PA3.1.chr3:189017229-189025537 629 | L1PA3.1.chr15:49259578-49265724 630 | L1PA2.1.chrX:63597195-63603223 631 | L1PA3.1.chr9:79494547-79500000 632 | L1PA2.1.chr6:86021488-86027515 633 | L1PA3.1.chr3:22737735-22743754 634 | L1PA2.1.chr15:56311143-56317177 635 | L1P1.1.chr12:21083799-21088613 636 | L1PA3.1.chr9:139648-145712 637 | L1PA3.1.chr3:135025209-135031249 638 | L1PA3.1.chr5:122509616-122515639 639 | L1PA3.1.chr4:184208227-184214253 640 | L1HS.1.chr16:65690011-65696020 641 | L1PA3.1.chr9:25070956-25076968 642 | L1PA3.1.chrX:148123449-148129475 643 | L1PA2.1.chr8:15576322-15582323 644 | L1PA2.1.chr2:40008598-40014619 645 | L1PA3.1.chr12:87969763-87975178 646 | L1PA3.1.chr6:116638696-116644847 647 | L1PA2.1.chr1:68923486-68926929 648 | L1PA3.1.chr14:43262062-43268079 649 | L1PA3.1.chr11:13793800-13799246 650 | L1HS.1.chr17:69000000-69005148 651 | L1PA2.1.chr14:26673315-26679374 652 | L1PA3.1.chrX:91752800-91758828 653 | L1PA3.1.chr7:8726340-8732368 654 | 
L1PA3.1.chr10:98541327-98544428 655 | L1PA2.1.chr8:91869518-91875534 656 | L1PA2.1.chrY:22339612-22345645 657 | L1PA2.1.chrX:6933211-6938641 658 | L1PA3.1.chr9:28989909-28996028 659 | L1PA3.1.chr6:71826715-71832734 660 | L1PA3.1.chrX:130567667-130573661 661 | L1PA2.1.chr9:86606053-86612054 662 | L1PA2.1.chr6:102722105-102725773 663 | L1HS.1.chr3:135002247-135005976 664 | L1PA3.1.chrX:13132317-13138466 665 | L1PA3.1.chr8:120190619-120194194 666 | L1PA3.1.chr1:75677750-75683758 667 | L1PA3.1.chr19:42714368-42720306 668 | L1PA2.1.chr15:49354233-49360252 669 | L1PA3.1.chr12:87098406-87104423 670 | L1PA3.1.chr5:79902294-79908448 671 | L1HS.1.chr4:110690112-110693684 672 | L1PA3.1.chr11:4000000-4005742 673 | L1PA2.1.chr8:131770949-131776926 674 | L1PA3.1.chr7:45377515-45383528 675 | L1PA2.1.chr5:28798790-28804814 676 | L1PA3.1.chr4:117921983-117927986 677 | L1PA2.1.chr8:66949103-66955119 678 | L1PA3.1.chr6:54961429-54967437 679 | L1PA3.1.chr5:148610279-148615691 680 | L1PA3.1.chr7:96579968-96585972 681 | L1P1.1.chr6:68912761-68914683 682 | L1PA2.1.chr3:85289930-85295940 683 | L1PA3.1.chr1:94506993-94512223 684 | L1PA3.1.chr10:35858668-35864695 685 | L1PA3.1.chr6:75899741-75905745 686 | L1PA3.1.chr10:105735717-105741873 687 | L1PA3.1.chr9:65709351-65715424 688 | L1PA3.1.chr2:166702725-166708745 689 | L1PA3.1.chr10:9519878-9526029 690 | L1PA3.1.chr4:147434630-147440182 691 | L1PA3.1.chr4:11557447-11563586 692 | L1PA3.1.chr3:133551337-133557486 693 | L1PA4.1.chr3:29993917-29999918 694 | L1PA4.1.chr4:122939633-122945796 695 | L1PA3.1.chr2:113471481-113477545 696 | L1PA3.1.chr8:107889880-107895907 697 | L1P1.1.chr16:59363201-59365889 698 | L1PA2.1.chr13:95733699-95740837 699 | L1PA3.1.chr7:36552965-36559126 700 | L1PA3.1.chr7:87238750-87244161 701 | L1PA2.1.chr5:85657593-85663641 702 | L1HS.1.chr7:145561496-145564595 703 | L1PA3.1.chr14:93413032-93419170 704 | L1PA3.1.chr4:102697355-102702634 705 | L1PA3.1.chr5:75769521-75774874 706 | L1PA3.1.chr2:137309506-137315534 707 | 
L1PA3.1.chrX:51512321-51518344 708 | L1PA3.1.chrX:32730898-32736929 709 | L1PA3.1.chr6:115009066-115014475 710 | L1PA3.1.chr13:102522020-102528176 711 | L1HS.1.chr9:63913382-63916426 712 | L1PA3.1.chr4:127676608-127682756 713 | L1PA2.1.chr6:141916863-141922880 714 | L1PA3.1.chr3:67147643-67153797 715 | L1PA3.1.chr2:227064885-227070875 716 | L1PA3.1.chr2:154147919-154154081 717 | L1PA2.1.chr4:156284540-156290599 718 | L1PA3.1.chr4:119061901-119068062 719 | L1PA3.1.chr16:47182757-47188783 720 | L1PA3.1.chr4:187050521-187056650 721 | L1PA3.1.chr4:90152884-90159024 722 | L1PA3.1.chr11:113656606-113662757 723 | L1PA3.1.chr10:130316193-130319757 724 | L1PA2.1.chr10:37896970-37902988 725 | L1PA3.1.chr4:167055311-167058094 726 | L1PA3.1.chr9:92900099-92906066 727 | L1PA3.1.chr6:1428614-1434750 728 | L1PA3.1.chr3:809376-815406 729 | L1PA3.1.chr2:240544494-240550522 730 | L1PA3.1.chr20:31332715-31338865 731 | L1PA3.1.chr5:98342992-98349146 732 | L1PA3.1.chrX:56075957-56081346 733 | L1PA3.1.chr2:96378981-96385131 734 | L1PA3.1.chr8:138042576-138048582 735 | L1PA3.1.chrX:42202163-42208183 736 | L1PA3.1.chr8:118548537-118554688 737 | L1PA2.1.chr7:113533338-113538585 738 | L1PA3.1.chr6:94757768-94763915 739 | L1PA3.1.chr3:180448471-180452905 740 | L1HS.1.chr1:103922065-103925398 741 | L1PA2.1.chr7:39007527-39013552 742 | L1PA3.1.chr6:4831172-4836230 743 | L1PA3.1.chr2:172187264-172193414 744 | L1PA3.1.chr1:113633560-113639380 745 | L1PA3.1.chrX:75561270-75567303 746 | L1HS.1.chr6:121162716-121168725 747 | L1P1.1.chr2:31496361-31500000 748 | L1PA3.1.chr13:104340349-104346508 749 | L1PA3.1.chr5:44578944-44584955 750 | L1PA3.1.chr5:15291930-15298100 751 | L1PA3.1.chr4:127860152-127865687 752 | L1PA3.1.chr15:38834949-38841092 753 | L1PA3.1.chrX:116159201-116165340 754 | L1PA3.1.chr8:74594235-74600252 755 | L1PA3.1.chrX:16230649-16236856 756 | L1PA3.1.chr16:65356743-65362738 757 | L1PA3.1.chr7:90634134-90640272 758 | L1PA3.1.chr12:66578267-66584409 759 | 
L1PA4.1.chr6:122537169-122543352 760 | L1PA3.1.chr6:107854715-107860748 761 | L1PA3.1.chr3:23349243-23355385 762 | L1PA3.1.chrX:108277015-108283183 763 | L1PA3.1.chr5:126903810-126909817 764 | L1PA3.1.chr5:91762133-91768161 765 | L1PA3.1.chr3:63733312-63739364 766 | L1PA2.1.chr16:80086062-80091657 767 | L1PA3.1.chr3:171625080-171632749 768 | L1PA4.1.chr6:70782891-70788469 769 | L1PA4.1.chr2:149501242-149507406 770 | L1PA3.1.chr14:47171654-47177809 771 | L1PA3.1.chr12:105776064-105782085 772 | L1PA3.1.chr15:97372487-97377657 773 | L1HS.1.chrX:21330646-21331772 774 | L1PA3.1.chr8:126341585-126347597 775 | L1PA2.1.chr11:60435417-60441441 776 | L1PA2.1.chr8:48793905-48799930 777 | L1PA2.1.chr3:108257680-108263696 778 | L1PA4.1.chr7:111500001-111505980 779 | L1PA2.1.chr21:40022874-40028842 780 | L1PA2.1.chr8:95552232-95558265 781 | L1PA3.1.chrX:51644758-51650770 782 | L1PA2.1.chr20:39368228-39373484 783 | L1PA3.1.chrX:85146837-85153009 784 | L1PA3.1.chr7:23321058-23327074 785 | L1PA3.1.chr5:99339615-99345764 786 | L1HS.1.chr2:131815264-131816385 787 | L1PA2.1.chr9:74424274-74430304 788 | L1PA3.1.chr7:32708015-32714180 789 | L1PA3.1.chr6:23804047-23810075 790 | L1PA4.1.chr4:63948074-63953638 791 | L1PA2.1.chrY:20367992-20374018 792 | L1PA3.1.chr5:120176046-120182173 793 | L1PA3.1.chr10:45009173-45015196 794 | L1PA3.1.chr4:163207006-163212968 795 | L1PA2.1.chr2:236383666-236389689 796 | L1PA2.1.chr6:71138558-71146364 797 | L1PA3.1.chr3:94846027-94852063 798 | L1PA2.1.chrX:114720516-114726531 799 | L1PA3.1.chr9:12333994-12340033 800 | L1PA2.1.chr5:19161336-19167360 801 | L1PA2.1.chr21:40033161-40039180 802 | L1PA2.1.chr4:106935103-106941147 803 | L1PA3.1.chr12:55081365-55087407 804 | L1PA2.1.chr12:58109257-58115306 805 | L1PA3.1.chr14:39521539-39527458 806 | L1PA2.1.chr8:4854406-4860419 807 | L1PA3.1.chr13:60709752-60715784 808 | L1PA2.1.chr7:32682678-32688715 809 | L1PA3.1.chrX:36170297-36176324 810 | L1PA3.1.chr3:18328056-18331287 811 | L1PA3.1.chr12:59751112-59757266 
812 | L1PA3.1.chr3:26239082-26245212 813 | L1PA2.1.chr1:186637331-186643356 814 | L1PA3.1.chr3:61211021-61217154 815 | -------------------------------------------------------------------------------- /CGC/ORF2_list.txt: -------------------------------------------------------------------------------- 1 | L1HS.1.chr20:7116194-7122199 2 | L1HS.1.chr5:152886441-152892473 3 | L1HS.1.chr15:70729744-70735160 4 | L1HS.1.chr8:125582886-125588889 5 | L1HS.1.chr4:136293494-136299546 6 | L1HS.1.chrX:141421202-141427246 7 | L1HS.1.chr15:54926081-54932099 8 | L1HS.1.chr4:74717539-74723587 9 | L1HS.1.chr8:128453002-128459020 10 | L1HS.1.chr2:4733729-4739760 11 | L1HS.1.chr16:16840517-16846556 12 | L1HS.1.chr9:95697585-95703604 13 | L1HS.1.chr7:30439242-30445274 14 | L1HS.1.chr4:138547723-138552054 15 | L1HS.1.chr11:78677772-78683802 16 | L1HS.1.chr8:134070756-134076773 17 | L1HS.1.chr5:109259387-109265418 18 | L1HS.1.chr4:21159390-21165421 19 | L1HS.1.chr6:2417774-2423803 20 | L1HS.1.chrX:11935296-11941314 21 | L1HS.1.chrX:11707248-11713279 22 | L1HS.1.chr16:18821266-18827058 23 | L1HS.1.chr13:29641706-29647706 24 | L1HS.1.chr8:72875538-72881588 25 | L1HS.1.chr12:126299023-126305038 26 | L1HS.1.chr5:104518587-104524616 27 | L1HS.1.chr3:130628808-130634065 28 | L1HS.1.chr10:105377346-105383377 29 | L1HS.1.chr6:129000000-129004416 30 | L1HS.1.chr4:79937715-79943746 31 | L1HS.1.chr22:28663283-28669315 32 | L1HS.1.chr2:16593725-16599758 33 | L1HS.1.chr18:70746549-70752581 34 | L1HS.1.chr16:33952564-33958612 35 | L1HS.1.chr10:109812437-109818457 36 | L1HS.1.chr10:6369617-6375667 37 | L1HS.1.chr6:156034135-156040165 38 | L1HS.1.chr1:84052389-84058406 39 | L1HS.1.chr18:75846851-75852883 40 | L1HS.1.chr11:93420986-93427031 41 | L1HS.1.chr1:71513698-71519742 42 | L1HS.1.chrX:147653734-147659767 43 | L1HS.1.chr1:247687173-247693204 44 | L1HS.1.chr7:113776122-113782152 45 | L1HS.1.chr4:78347980-78354013 46 | L1HS.1.chr11:93136638-93142673 47 | L1HS.1.chr5:177772245-177778274 48 | 
L1HS.1.chr4:90675739-90681757 49 | L1HS.1.chr2:196905587-196911636 50 | L1HS.1.chr16:83637252-83643296 51 | L1HS.1.chr16:9584490-9590522 52 | L1HS.1.chr7:141920659-141926712 53 | L1HS.1.chr3:109199872-109205903 54 | L1HS.1.chr1:174590323-174596379 55 | L1HS.1.chr11:95436216-95442246 56 | L1HS.1.chr11:24327951-24334001 57 | L1HS.1.chr9:90149604-90155634 58 | L1HS.1.chr6:19764892-19770918 59 | L1HS.1.chr7:110707004-110713024 60 | L1HS.1.chr6:83333952-83339981 61 | L1HS.1.chr2:86655238-86661268 62 | L1HS.1.chr7:49680245-49686300 63 | L1HS.1.chr6:133020691-133026746 64 | L1HS.1.chr1:86679080-86685111 65 | L1HS.1.chr10:85355506-85361538 66 | L1HS.1.chr8:27113618-27119645 67 | L1HS.1.chr3:103556537-103562569 68 | L1HS.1.chr6:24811657-24817706 69 | L1PA2.1.chr5:132513964-132519996 70 | L1HS.1.chr5:79778884-79784938 71 | L1HS.1.chr3:120573021-120579186 72 | L1HS.1.chr2:175481951-175487994 73 | L1HS.1.chr1:239623498-239629523 74 | L1HS.1.chr14:70547290-70553322 75 | L1HS.1.chrX:54118685-54124744 76 | L1HS.1.chr13:92685561-92691592 77 | L1HS.1.chr1:237019467-237025494 78 | L1HS.1.chr1:80939203-80945257 79 | L1HS.1.chr5:58384174-58390206 80 | L1HS.1.chr5:173402796-173408828 81 | L1HS.1.chr4:16944926-16949113 82 | L1HS.1.chr4:93638307-93644337 83 | L1HS.1.chr3:77763677-77769678 84 | L1HS.1.chr17:9615985-9622015 85 | L1HS.1.chr6:121162716-121168725 86 | L1HS.1.chr22:48985761-48991792 87 | L1HS.1.chrX:23238516-23244575 88 | L1HS.1.chr2:166988454-166994509 89 | L1HS.1.chrX:81841153-81847184 90 | L1PA2.1.chr11:60532161-60538190 91 | L1HS.1.chr4:111894801-111900831 92 | L1HS.1.chr1:180866811-180872843 93 | L1HS.1.chr17:66596579-66602595 94 | L1HS.1.chr6:117102131-117108163 95 | L1PA2.1.chr5:39787652-39793671 96 | L1HS.1.chr4:59078847-59084877 97 | L1HS.1.chr9:28111895-28117865 98 | L1HS.1.chr7:111963193-111969223 99 | L1HS.1.chr5:146609485-146615534 100 | L1HS.1.chr3:159095379-159101394 101 | L1HS.1.chr2:180833661-180839689 102 | L1HS.1.chr7:111243515-111249546 103 | 
L1HS.1.chr15:87509891-87515920 104 | L1HS.1.chr11:85324758-85330821 105 | L1HS.1.chr10:98782941-98788971 106 | L1HS.1.chr1:187597671-187603699 107 | L1HS.1.chr14:63116706-63122735 108 | L1HS.1.chr1:187343764-187349794 109 | L1HS.1.chr18:13975860-13981891 110 | L1PA2.1.chr1:71888203-71894235 111 | L1HS.1.chr20:11632779-11638837 112 | L1HS.1.chrX:96057824-96063842 113 | L1HS.1.chr4:122652658-122656850 114 | L1HS.1.chr1:195925003-195929320 115 | L1HS.1.chr1:85927067-85933100 116 | L1HS.1.chr18:50343959-50349987 117 | L1HS.1.chr6:72988654-72994686 118 | L1HS.1.chr11:109177494-109183526 119 | L1HS.1.chr8:88685705-88691760 120 | L1HS.1.chr5:111302238-111308262 121 | L1HS.1.chr2:102566355-102572385 122 | L1HS.1.chr5:86510690-86516743 123 | L1HS.1.chr3:132946006-132952034 124 | L1HS.1.chr1:118852351-118858380 125 | L1HS.1.chr10:76586841-76591752 126 | L1HS.1.chrX:151330320-151336351 127 | L1HS.1.chr10:5245354-5251383 128 | L1PA2.1.chr6:115960032-115966060 129 | L1PA2.1.chr12:92313998-92320023 130 | L1HS.1.chrX:155516016-155522048 131 | L1HS.1.chr4:169515501-169521532 132 | L1HS.1.chr7:93787624-93793679 133 | L1HS.1.chr10:19088601-19094618 134 | L1HS.1.chrX:76322775-76328806 135 | L1PA2.1.chrX:28206791-28212789 136 | L1HS.1.chr5:102131356-102137385 137 | L1PA2.1.chr12:90536603-90542635 138 | L1HS.1.chr7:46820756-46825657 139 | L1PA2.1.chr19:37837502-37843533 140 | L1PA2.1.chr10:15915731-15921753 141 | L1HS.1.chr20:12801017-12807044 142 | L1HS.1.chr11:49793154-49797728 143 | L1HS.1.chr18:37819737-37825798 144 | L1HS.1.chrY:5606144-5612199 145 | L1HS.1.chr3:4916534-4922591 146 | L1PA2.1.chr18:59403939-59409970 147 | L1PA2.1.chr15:71174139-71180152 148 | L1HS.1.chrX:142477849-142483853 149 | L1HS.1.chr10:33510845-33516876 150 | L1HS.1.chr11:90400067-90406098 151 | L1HS.1.chr7:63148831-63154859 152 | L1PA2.1.chr5:83316287-83320401 153 | L1HS.1.chr1:209913771-209919823 154 | L1HS.1.chr11:36551606-36557636 155 | L1PA2.1.chr3:187412123-187418152 156 | 
L1HS.1.chr3:136479056-136485103 157 | L1PA2.1.chr3:81051389-81057413 158 | L1PA2.1.chr18:7966442-7972474 159 | L1PA2.1.chr8:91558668-91564687 160 | L1HS.1.chr3:89460825-89466856 161 | L1PA2.1.chr6:44870634-44876665 162 | L1PA2.1.chr5:45658440-45664470 163 | L1HS.1.chr3:54394322-54400323 164 | L1PA2.1.chr6:72570139-72576167 165 | L1HS.1.chr18:72966526-72972556 166 | L1HS.1.chr3:3963076-3969110 167 | L1PA2.1.chr2:128858984-128865016 168 | L1PA2.1.chr3:177388770-177394751 169 | L1PA2.1.chr10:11731436-11737465 170 | L1PA2.1.chr10:39466259-39470575 171 | L1PA2.1.chr9:19536200-19542230 172 | L1PA2.1.chr6:104489393-104495424 173 | L1HS.1.chrX:83059584-83065637 174 | L1HS.1.chr7:70197328-70203357 175 | L1PA2.1.chr2:173699375-173705410 176 | L1HS.1.chrX:64013267-64019286 177 | L1PA2.1.chrX:103891506-103897537 178 | L1PA2.1.chr4:164553492-164559523 179 | L1PA2.1.chr8:63797384-63803439 180 | L1HS.1.chr12:54788573-54794627 181 | L1PA2.1.chr10:106844583-106850610 182 | L1PA2.1.chr15:51173565-51179009 183 | L1PA2.1.chr8:75444000-75448442 184 | L1PA2.1.chr6:104452399-104457460 185 | L1PA3.1.chr3:137454947-137460983 186 | L1HS.1.chr5:122240435-122244924 187 | L1PA2.1.chr4:102204930-102210958 188 | L1HS.1.chr7:7465092-7471120 189 | L1PA2.1.chr3:155119416-155125444 190 | L1PA2.1.chr16:21042672-21048703 191 | L1PA3.1.chr3:187424407-187428816 192 | L1HS.1.chr16:35608475-35614501 193 | L1PA2.1.chr5:139005423-139011486 194 | L1PA2.1.chr15:93675399-93681428 195 | L1PA2.1.chr2:165485934-165491963 196 | L1PA2.1.chr18:24619042-24625072 197 | L1PA3.1.chr6:48363090-48369117 198 | L1PA2.1.chr3:65509292-65515316 199 | L1PA3.1.chr19:29225779-29231807 200 | L1PA2.1.chr8:120348977-120354404 201 | L1PA2.1.chr12:77173381-77179424 202 | L1PA2.1.chr13:100698082-100704117 203 | L1PA2.1.chr12:64195587-64201638 204 | L1PA2.1.chr2:174269465-174274464 205 | L1PA2.1.chr8:72479440-72485463 206 | L1PA2.1.chr4:14009454-14015486 207 | L1PA2.1.chr13:40356290-40362321 208 | L1PA2.1.chr6:156361254-156367276 209 | 
L1PA2.1.chr1:174377791-174383815 210 | L1PA2.1.chr4:145369388-145375369 211 | L1HS.1.chr1:104770247-104776278 212 | L1PA2.1.chr13:42424880-42430912 213 | L1PA2.1.chr14:101266199-101272227 214 | L1PA2.1.chr4:158084240-158090272 215 | L1PA2.1.chr5:21107412-21113430 216 | L1PA2.1.chr3:141757129-141763153 217 | L1PA2.1.chr1:49875162-49881175 218 | L1PA2.1.chr18:22529636-22535670 219 | L1PA2.1.chr1:25506585-25512707 220 | L1PA3.1.chr6:107854715-107860748 221 | L1HS.1.chr13:31302314-31308370 222 | L1PA2.1.chr14:26629268-26635299 223 | L1PA2.1.chrX:127116697-127122729 224 | L1PA2.1.chr5:57471563-57475609 225 | L1PA2.1.chr8:131770949-131776926 226 | L1PA2.1.chr1:178314791-178320818 227 | L1PA2.1.chr16:63388077-63394106 228 | L1HS.1.chr4:79704552-79710581 229 | L1PA2.1.chr3:178859948-178865979 230 | L1PA2.1.chr18:40187639-40193657 231 | L1PA3.1.chr18:27279551-27285578 232 | L1PA2.1.chr2:158351231-158357242 233 | L1PA3.1.chr4:120741413-120747472 234 | L1PA2.1.chr12:57646479-57652498 235 | L1PA2.1.chr7:29579936-29585963 236 | L1PA2.1.chr8:72147447-72153464 237 | L1HS.1.chr11:90966271-90972302 238 | L1PA2.1.chrX:47783671-47789697 239 | L1PA2.1.chrX:18105518-18110908 240 | L1PA2.1.chr4:4953897-4959919 241 | L1PA2.1.chr11:107361810-107367839 242 | L1PA2.1.chr1:75383477-75388208 243 | L1PA2.1.chr12:70013065-70019067 244 | L1PA2.1.chr2:76775758-76781758 245 | L1PA3.1.chr6:105716122-105722275 246 | L1PA2.1.chr18:41631742-41637769 247 | L1PA3.1.chr2:4157808-4163833 248 | L1PA3.1.chr2:57587069-57592429 249 | L1PA2.1.chrX:36465194-36471217 250 | L1PA2.1.chr2:192268435-192274450 251 | L1PA3.1.chrX:68736250-68742398 252 | L1PA3.1.chrX:137672117-137678261 253 | L1PA3.1.chr2:48739839-48745890 254 | L1PA2.1.chr2:195067521-195073543 255 | L1PA3.1.chr7:37612053-37618072 256 | L1PA3.1.chr5:3439025-3445063 257 | L1PA2.1.chr3:116203398-116209426 258 | L1PA3.1.chrX:125510365-125515187 259 | L1PA3.1.chr4:97633479-97639503 260 | L1PA2.1.chr20:53472644-53478653 261 | L1PA2.1.chr7:16216428-16222457 
262 | L1PA2.1.chr1:177633927-177639946 263 | L1HS.1.chrX:56695884-56701916 264 | L1PA2.1.chr18:35205779-35211809 265 | L1PA2.1.chr15:56311143-56317177 266 | L1PA2.1.chr20:24900605-24906618 267 | L1PA3.1.chr8:2413733-2419762 268 | L1PA3.1.chr8:2301964-2307993 269 | L1PA2.1.chr3:158634523-158640540 270 | L1HS.1.chr1:67078891-67084915 271 | L1PA2.1.chr2:124593139-124599168 272 | L1PA2.1.chr7:43059096-43065116 273 | L1PA3.1.chr5:35568225-35574246 274 | L1PA2.1.chr17:3176530-3182557 275 | L1PA3.1.chr4:65052166-65058198 276 | L1PA3.1.chr2:228229041-228234515 277 | L1PA2.1.chr20:18601523-18606939 278 | L1PA3.1.chr9:133310021-133316045 279 | L1PA2.1.chr2:151698868-151704889 280 | L1PA2.1.chr6:141566105-141572136 281 | L1HS.1.chr1:56365452-56369282 282 | L1HS.1.chr14:30684809-30690837 283 | L1PA2.1.chr16:61801455-61807489 284 | L1PA2.1.chr22:16021017-16027044 285 | L1PA3.1.chr2:188123561-188129537 286 | L1HS.1.chr4:15841546-15847572 287 | L1PA3.1.chr11:89744187-89750239 288 | L1HS.1.chr4:107206672-107210557 289 | L1PA2.1.chr8:58914690-58920717 290 | L1HS.1.chr1:237075264-237081293 291 | L1PA3.1.chr3:135025209-135031249 292 | L1PA2.1.chr5:75642235-75648286 293 | L1PA2.1.chr19:55822401-55828429 294 | L1PA2.1.chr6:103709031-103715056 295 | L1PA2.1.chr10:7137522-7142956 296 | L1PA2.1.chr12:106471865-106477891 297 | L1HS.1.chr20:55859566-55865521 298 | L1PA2.1.chr9:14663995-14670015 299 | L1HS.1.chr5:152076868-152082891 300 | L1PA2.1.chr14:55988182-55993244 301 | L1PA2.1.chr10:18030651-18036675 302 | L1PA2.1.chr2:204991072-204997106 303 | L1PA2.1.chr1:174233266-174239293 304 | L1PA2.1.chr13:82045349-82051380 305 | L1PA2.1.chr15:81797930-81803963 306 | L1PA3.1.chr7:14313260-14319290 307 | L1HS.1.chr18:62906292-62912314 308 | L1PA2.1.chr6:162989737-162995762 309 | L1PA2.1.chr9:1223881-1229900 310 | L1PA2.1.chrX:5480456-5486466 311 | L1PA2.1.chrX:98424325-98430357 312 | L1HS.1.chr2:193212420-193218448 313 | L1PA3.1.chr13:105383251-105388345 314 | L1PA2.1.chr12:80244169-80250184 315 
| L1PA2.1.chr1:91211587-91216947 316 | L1PA2.1.chr4:64859153-64865171 317 | L1PA2.1.chr9:21536697-21541948 318 | L1PA3.1.chrX:64252345-64258375 319 | L1PA3.1.chr11:127868667-127872497 320 | L1PA2.1.chr1:82250044-82256069 321 | L1PA2.1.chr3:111556203-111562234 322 | L1PA3.1.chr4:53564637-53570664 323 | L1PA3.1.chr6:136000726-136006368 324 | L1HS.1.chrX:127362223-127368248 325 | L1PA2.1.chr4:65288237-65294261 326 | L1PA3.1.chr10:126881867-126887893 327 | L1PA3.1.chr6:133142073-133148104 328 | L1PA3.1.chr15:97372487-97377657 329 | L1PA3.1.chr11:79552653-79558680 330 | L1PA3.1.chr10:60692960-60698897 331 | L1PA2.1.chr5:51250746-51256770 332 | L1PA2.1.chr11:40585472-40591501 333 | L1PA3.1.chr4:157174892-157180916 334 | L1PA4.1.chr16:72580798-72586932 335 | L1PA2.1.chr7:86340208-86346233 336 | L1HS.1.chr4:135178140-135183747 337 | L1PA3.1.chr8:118548537-118554688 338 | L1PA3.1.chrX:124582570-124588702 339 | L1PA2.1.chr2:137393160-137399190 340 | L1PA4.1.chr10:20291416-20297572 341 | -------------------------------------------------------------------------------- /CGC/make_ORF1_and_intact_table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | allowed_runthrough_fraction = float(sys.argv[5]) 13 | 14 | output_orf1_name = sys.argv[6] 15 | output_intact_name = sys.argv[7] 16 | 17 | orf1_intact = set() 18 | for line in open(orf1_list): 19 | orf1_intact.add(line.strip()) 20 | orf2_intact = set() 21 | for line in open(orf2_list): 22 | orf2_intact.add(line.strip()) 23 | 24 | exp_probs = dict() 25 | seqs = set([]) 26 | 27 | for line in open(exp_prob_pkls_list): 28 | names_file, X_file = line.strip().split('\t') 29 | name = names_file.split('/')[-1][:-16] 30 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 31 | seqs = seqs | set(exp_probs[name].keys()) 32 | 33 | l1pa_pairs = dict() 34 | mapped_pairs = dict() 35 | 36 | for line in open(bam_info_list): 37 | name = line.strip().split('/')[-1][:-4] 38 | baminfo = open(line.strip()).readlines() 39 | mapped_pairs[name] = int(baminfo[1]) 40 | l1pa_pairs[name] = int(baminfo[2]) 41 | 42 | output_orf1 = open(output_orf1_name,'w') 43 | output_intact = open(output_intact_name,'w') 44 | 45 | print_string = "locus" 46 | for name in exp_probs: 47 | print_string += "\t"+name 48 | 49 | output_orf1.write (print_string+'\n') 50 | output_intact.write (print_string+'\n') 51 | 52 | completed = set() 53 | 54 | for name in seqs: 55 | seq_name = '_'.join(name.split('_')[:-1]) 56 | if seq_name in completed: 57 | continue 58 | else: 59 | completed.add(seq_name) 60 | print_string = seq_name.split('(')[0] 61 | only_name = seq_name+'_only' 62 | runon_name = seq_name+'_3prunon' 63 | runthrough_name = seq_name+'_runthrough' 64 | for name in exp_probs: 65 | FPM = 0.0 66 | runthrough_FPM = 0.0 67 | if only_name in exp_probs[name]: 68 | FPM += exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 69 | if runon_name in 
exp_probs[name]: 70 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 71 | if runthrough_name in exp_probs[name]: 72 | runthrough_FPM += exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 73 | if FPM>0 and FPM/(FPM+runthrough_FPM) > allowed_runthrough_fraction: 74 | print_string += '\t'+str(FPM) 75 | else: 76 | print_string += '\t0.0' 77 | if seq_name.split('(')[0][:-2] in orf1_intact: 78 | output_orf1.write(print_string+'\n') 79 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 80 | output_intact.write(print_string+'\n') 81 | 82 | output_orf1.close() 83 | output_intact.close() 84 | -------------------------------------------------------------------------------- /CGC/make_ORF1_and_intact_table_stranded.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | allowed_runthrough_fraction = float(sys.argv[5]) 13 | 14 | output_orf1_name = sys.argv[6] 15 | output_intact_name = sys.argv[7] 16 | 17 | orf1_intact = set() 18 | for line in open(orf1_list): 19 | orf1_intact.add(line.strip()) 20 | orf2_intact = set() 21 | for line in open(orf2_list): 22 | orf2_intact.add(line.strip()) 23 | 24 | exp_probs = dict() 25 | seqs = set([]) 26 | 27 | for line in open(exp_prob_pkls_list): 28 | names_file, X_file = line.strip().split('\t') 29 | name = names_file.split('/')[-1][:-16] 30 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 31 | seqs = seqs | set(exp_probs[name].keys()) 32 | 33 | l1pa_pairs = dict() 34 | mapped_pairs = dict() 35 | 36 | for line in open(bam_info_list): 37 | name = line.strip().split('/')[-1][:-4] 38 | baminfo = open(line.strip()).readlines() 39 | mapped_pairs[name] = int(baminfo[1]) 40 | l1pa_pairs[name] = int(baminfo[2]) 41 | 42 | output_orf1 = open(output_orf1_name,'w') 43 | output_intact = open(output_intact_name,'w') 44 | 45 | print_string = "locus" 46 | for name in exp_probs: 47 | print_string += "\t"+name 48 | 49 | output_orf1.write (print_string+'\n') 50 | output_intact.write (print_string+'\n') 51 | 52 | completed = set() 53 | 54 | for name in seqs: 55 | seq_name = '_'.join(name.split('_')[:-1]) 56 | if seq_name in completed: 57 | continue 58 | else: 59 | completed.add(seq_name) 60 | print_string = seq_name.split('(')[0] 61 | only_name = seq_name+'_only' 62 | runon_name = seq_name+'_3prunon' 63 | senserunthrough_name = seq_name+'_senserunthrough' 64 | antisenserunthrough_name = seq_name+'_antisenserunthrough' 65 | for name in exp_probs: 66 | FPM = 0.0 67 | runthrough_FPM = 0.0 68 | if only_name in exp_probs[name]: 69 | FPM += 
exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 70 | if runon_name in exp_probs[name]: 71 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 72 | if senserunthrough_name in exp_probs[name]: 73 | runthrough_FPM += exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 74 | if FPM>0 and FPM/(FPM+runthrough_FPM) > allowed_runthrough_fraction: 75 | print_string += '\t'+str(FPM) 76 | else: 77 | print_string += '\t0.0' 78 | if seq_name.split('(')[0][:-2] in orf1_intact: 79 | output_orf1.write(print_string+'\n') 80 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 81 | output_intact.write(print_string+'\n') 82 | 83 | output_orf1.close() 84 | output_intact.close() 85 | -------------------------------------------------------------------------------- /CGC/make_l1pa1to4table.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | 11 | exp_probs = dict() 12 | seqs = set([]) 13 | 14 | for line in open(exp_prob_pkls_list): 15 | names_file, X_file = line.strip().split('\t') 16 | name = names_file.split('/')[-1][:-16] 17 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 18 | seqs = seqs | set(exp_probs[name].keys()) 19 | 20 | l1pa_pairs = dict() 21 | mapped_pairs = dict() 22 | 23 | for line in open(bam_info_list): 24 | name = line.strip().split('/')[-1][:-4] 25 | baminfo = open(line.strip()).readlines() 26 | mapped_pairs[name] = int(baminfo[1]) 27 | l1pa_pairs[name] = int(baminfo[2]) 28 | 29 | print_string = "locus" 30 | for name in exp_probs: 31 | print_string += "\t"+name+'-active'+"\t"+name+'-passive' 32 | 33 | print(print_string) 34 | 35 | completed = set() 36 | 37 | for name in seqs: 38 | if name.split('.')[0] not in ['L1HS','L1PA2','L1PA3','L1PA4']: 39 | continue 40 | seq_name = '_'.join(name.split('_')[:-1]) 41 | if seq_name in completed: 42 | continue 43 | else: 44 | completed.add(seq_name) 45 | print_string = seq_name.split('(')[0] 46 | only_name = seq_name+'_only' 47 | runon_name = seq_name+'_3prunon' 48 | runthrough_name = seq_name+'_runthrough' 49 | for name in exp_probs: 50 | FPM = 0.0 51 | runthrough_FPM = 0.0 52 | if only_name in exp_probs[name]: 53 | FPM += exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 54 | if runon_name in exp_probs[name]: 55 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 56 | if runthrough_name in exp_probs[name]: 57 | runthrough_FPM += exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 58 | print_string += '\t'+str(FPM)+'\t'+str(runthrough_FPM) 59 | print(print_string) 60 | -------------------------------------------------------------------------------- 
/CGC/make_l1pa1to4table_stranded.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | allowed_rt_fraction = float(sys.argv[3]) 11 | 12 | exp_probs = dict() 13 | seqs = set([]) 14 | 15 | for line in open(exp_prob_pkls_list): 16 | names_file, X_file = line.strip().split('\t') 17 | name = names_file.split('/')[-1][:-16] 18 | exp_probs[name] = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 19 | seqs = seqs | set(exp_probs[name].keys()) 20 | 21 | l1pa_pairs = dict() 22 | mapped_pairs = dict() 23 | 24 | for line in open(bam_info_list): 25 | name = line.strip().split('/')[-1][:-4] 26 | baminfo = open(line.strip()).readlines() 27 | mapped_pairs[name] = int(baminfo[1]) 28 | l1pa_pairs[name] = int(baminfo[2]) 29 | 30 | print_string = "locus" 31 | for name in exp_probs: 32 | print_string += "\t"+name 33 | 34 | print(print_string) 35 | 36 | completed = set() 37 | 38 | for name in seqs: 39 | if name.split('.')[0] not in ['L1HS','L1PA2','L1PA3','L1PA4']: 40 | continue 41 | seq_name = '_'.join(name.split('_')[:-1]) 42 | if seq_name in completed: 43 | continue 44 | else: 45 | completed.add(seq_name) 46 | print_string = seq_name.split('(')[0] 47 | only_name = seq_name+'_only' 48 | runon_name = seq_name+'_3prunon' 49 | runthrough_name = seq_name+'_senserunthrough' 50 | for name in exp_probs: 51 | FPM = 0.0 52 | runthrough_FPM = 0.0 53 | if only_name in exp_probs[name]: 54 | FPM += exp_probs[name][only_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 55 | if runon_name in exp_probs[name]: 56 | FPM += exp_probs[name][runon_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 57 | if runthrough_name in exp_probs[name]: 58 | runthrough_FPM += 
exp_probs[name][runthrough_name]*l1pa_pairs[name]/mapped_pairs[name]*10**6 59 | if runthrough_FPM < allowed_rt_fraction*FPM: 60 | print_string += '\t'+str(FPM) 61 | else: 62 | print_string += '\t0.0' 63 | print(print_string) 64 | -------------------------------------------------------------------------------- /CGC/median_template_and_pairs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pysam 3 | import random 4 | import numpy 5 | 6 | """ 7 | Estimate median template length of a bam file. 8 | 9 | Part of the L1-EM package. 10 | 11 | Copyright (C) 2019 Wilson McKerrow 12 | 13 | This program is free software: you can redistribute it and/or modify 14 | it under the terms of the GNU General Public License as published by 15 | the Free Software Foundation, either version 3 of the License, or 16 | (at your option) any later version. 17 | 18 | This program is distributed in the hope that it will be useful, 19 | but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 | GNU General Public License for more details. 22 | 23 | You should have received a copy of the GNU General Public License 24 | along with this program. If not, see . 25 | 26 | """ 27 | 28 | bamfile = sys.argv[1] 29 | fraction = float(sys.argv[2]) 30 | 31 | tlens = list() 32 | n_proper_reads = 0 33 | 34 | for read in pysam.AlignmentFile(bamfile): 35 | if read.is_proper_pair: 36 | n_proper_reads += 1 37 | if random.random() < fraction: 38 | tlens.append(read.template_length) 39 | 40 | print(numpy.median(numpy.abs(tlens))) 41 | print(n_proper_reads/2) 42 | -------------------------------------------------------------------------------- /CGC/read_or_pair_overlap_bed_and_unmapped.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | 4 | """ 5 | Extract reads or pairs of reads that overlap a bed file. 
6 | 7 | Part of the L1-EM package. 8 | 9 | Copyright (C) 2019 Wilson McKerrow 10 | 11 | This program is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with this program. If not, see . 23 | 24 | """ 25 | 26 | def main(): 27 | bedfile = sys.argv[1] 28 | bamfile = sys.argv[2] 29 | outbamfile = sys.argv[3] 30 | outunmappedbamfile = sys.argv[4] 31 | if len(sys.argv) > 5: 32 | flanking = int(sys.argv[5]) 33 | else: 34 | flanking = 400 35 | if len(sys.argv) > 6: 36 | maxNM = int(sys.argv[6]) 37 | else: 38 | maxNM = 4 39 | 40 | inbam = pysam.AlignmentFile(bamfile,'rb') 41 | outbam = pysam.AlignmentFile(outbamfile,'wb',template=inbam) 42 | outunmappedbam = pysam.AlignmentFile(outunmappedbamfile,'wb',template=inbam) 43 | 44 | read_ids = set() 45 | for line in open(bedfile): 46 | chrom,start,stop = line.strip().split('\t')[:3] 47 | start = int(start)+flanking 48 | stop = int(stop)-flanking 49 | if chrom in inbam.references: 50 | for read in inbam.fetch(chrom,start,stop): 51 | if not read.is_unmapped: 52 | if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and (not read.has_tag('NM') or read.get_tag('NM')<=maxNM): 53 | read_ids.add(read.query_name) 54 | # if chrom[3:] in inbam.references: 55 | # for read in inbam.fetch(chrom[3:],start,stop): 56 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and 
read.get_tag('NM')<=3: 57 | # read_ids.add(read.query_name) 58 | # if '_' in chrom and chrom.split('_')[1].upper()+'.1' in inbam.references: 59 | # for read in inbam.fetch(chrom.split('_')[1].upper()+'.1',start,stop): 60 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and read.get_tag('NM')<=3: 61 | # read_ids.add(read.query_name) 62 | 63 | inbam.close() 64 | inbam = pysam.AlignmentFile(bamfile,'rb') 65 | 66 | for read in inbam: 67 | if read.query_name in read_ids: 68 | if not read.is_secondary and not read.is_supplementary: 69 | outbam.write(read) 70 | elif read.is_unmapped or read.mate_is_unmapped: 71 | if not read.is_secondary and not read.is_supplementary: 72 | outunmappedbam.write(read) 73 | 74 | inbam.close() 75 | outbam.close() 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /CGC/report_l1_exp_counts.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 
23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon\tpassive_sense\tpassive_antisense\tantisense") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | print_string += '\t'+str(total*X_est[only_name]/proper_pairs_in_original_bam*10**6) 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | print_string += '\t'+str(total*X_est[runon_name]/proper_pairs_in_original_bam*10**6) 56 | runthroughS_name = seq_name+'_senserunthrough' 57 | if runthroughS_name not in X_est: 58 | X_est[runthroughS_name]=0.0 59 | print_string += '\t'+str(total*X_est[runthroughS_name]/proper_pairs_in_original_bam*10**6) 60 | runthroughA_name = seq_name+'_antisenserunthrough' 61 | if runthroughA_name not in X_est: 62 | X_est[runthroughA_name]=0.0 63 | print_string += '\t'+str(total*X_est[runthroughA_name]/proper_pairs_in_original_bam*10**6) 64 | antisense_name = seq_name+'_antisense' 65 | if antisense_name not in X_est: 66 | X_est[antisense_name]=0.0 67 | print_string += '\t'+str(total*X_est[antisense_name]/proper_pairs_in_original_bam*10**6) 68 | print(print_string) 69 | -------------------------------------------------------------------------------- /CGC/report_l1_exp_counts_unstranded.py: -------------------------------------------------------------------------------- 1 | # On 
Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon\tpassive") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | print_string += '\t'+str(total*X_est[only_name]/proper_pairs_in_original_bam*10**6) 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | print_string += '\t'+str(total*X_est[runon_name]/proper_pairs_in_original_bam*10**6) 56 | runthrough_name = seq_name+'_runthrough' 57 | if runthrough_name not in X_est: 58 | X_est[runthrough_name]=0.0 59 | print_string += '\t'+str(total*X_est[runthrough_name]/proper_pairs_in_original_bam*10**6) 60 | print(print_string) 61 | -------------------------------------------------------------------------------- /CGC/total_orf1_and_orf2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | min_FPM = float(sys.argv[5]) 13 | allowed_runthrough_fraction = float(sys.argv[6]) 14 | 15 | l1pa_pairs = dict() 16 | mapped_pairs = dict() 17 | 18 | orf1_intact = set() 19 | for line in open(orf1_list): 20 | orf1_intact.add(line.strip()) 21 | orf2_intact = set() 22 | for line in open(orf2_list): 23 | orf2_intact.add(line.strip()) 24 | 25 | for line in open(bam_info_list): 26 | name = line.strip().split('/')[-1][:-4] 27 | baminfo = open(line.strip()).readlines() 28 | mapped_pairs[name] = int(baminfo[1]) 29 | l1pa_pairs[name] = int(baminfo[2]) 30 | 31 | print('name\torf1_FPM\tORF2_FPM\tboth_FPM\tL1HS_expression_FPM\tL1HS_all_FPM') 32 | 33 | for line in open(exp_prob_pkls_list): 34 | names_file, X_file = line.strip().split('\t') 35 | sample_name = names_file.split('/')[-1][:-16] 36 | exp_prob = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 37 | orf1 = 0.0 38 | orf2 = 0.0 39 | both = 0.0 40 | L1HS_exp = 0.0 41 | L1HS_all = 0.0 42 | for transcript in exp_prob: 43 | if 'L1HS' in transcript: 44 | L1HS_all += exp_prob[transcript]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 45 | if 'only' not in transcript: 46 | continue 47 | seq_name = '_'.join(transcript.split('_')[:-1]) 48 | only_name = seq_name+'_only' 49 | runon_name = seq_name+'_3prunon' 50 | runthrough_name = seq_name+'_runthrough' 51 | FPM = 0.0 52 | FPM += exp_prob[only_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 53 | if runon_name in exp_prob: 54 | FPM += exp_prob[runon_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 55 | if runthrough_name in exp_prob: 56 | runthrough_FPM = exp_prob[runthrough_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 57 | else: 58 | runthrough_FPM = 0.0 59 | FPM *= FPM >= min_FPM and 
runthrough_FPM/(runthrough_FPM+FPM) <= allowed_runthrough_fraction 60 | if seq_name.split('(')[0][:-2] in orf1_intact: 61 | orf1 += FPM 62 | if seq_name.split('(')[0][:-2] in orf2_intact: 63 | orf2 += FPM 64 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 65 | both += FPM 66 | if 'L1HS' in seq_name: 67 | L1HS_exp += FPM 68 | print(sample_name +'\t'+ str(orf1) +'\t'+ str(orf2) +'\t'+ str(both) +'\t'+ str(L1HS_exp) +'\t'+ str(L1HS_all)) 69 | -------------------------------------------------------------------------------- /CGC/total_orf1_and_orf2_stranded.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 3 | try: 4 | import cPickle as pickle 5 | except ImportError: 6 | import pickle 7 | 8 | exp_prob_pkls_list = sys.argv[1] 9 | bam_info_list = sys.argv[2] 10 | orf1_list = sys.argv[3] 11 | orf2_list = sys.argv[4] 12 | min_FPM = float(sys.argv[5]) 13 | allowed_runthrough_fraction = float(sys.argv[6]) 14 | 15 | l1pa_pairs = dict() 16 | mapped_pairs = dict() 17 | 18 | orf1_intact = set() 19 | for line in open(orf1_list): 20 | orf1_intact.add(line.strip()) 21 | orf2_intact = set() 22 | for line in open(orf2_list): 23 | orf2_intact.add(line.strip()) 24 | 25 | for line in open(bam_info_list): 26 | name = line.strip().split('/')[-1][:-4] 27 | baminfo = open(line.strip()).readlines() 28 | mapped_pairs[name] = int(baminfo[1]) 29 | l1pa_pairs[name] = int(baminfo[2]) 30 | 31 | print('name\torf1_FPM\tORF2_FPM\tboth_FPM\tL1HS_expression_FPM\tL1HS_all_FPM') 32 | 33 | for line in open(exp_prob_pkls_list): 34 | names_file, X_file = line.strip().split('\t') 35 | sample_name = names_file.split('/')[-1][:-16] 36 | exp_prob = dict(zip(pickle.load(open(names_file,'rb')),pickle.load(open(X_file,'rb')))) 37 | orf1 = 0.0 38 | orf2 = 0.0 39 | both = 0.0 40 | L1HS_exp = 0.0 41 | L1HS_all 
= 0.0 42 | for transcript in exp_prob: 43 | if 'L1HS' in transcript: 44 | L1HS_all += exp_prob[transcript]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 45 | if 'only' not in transcript: 46 | continue 47 | seq_name = '_'.join(transcript.split('_')[:-1]) 48 | only_name = seq_name+'_only' 49 | runon_name = seq_name+'_3prunon' 50 | runthrough_name = seq_name+'_senserunthrough' 51 | FPM = 0.0 52 | FPM += exp_prob[only_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 53 | if runon_name in exp_prob: 54 | FPM += exp_prob[runon_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 55 | if runthrough_name in exp_prob: 56 | runthrough_FPM = exp_prob[runthrough_name]*l1pa_pairs[sample_name]/mapped_pairs[sample_name]*10**6 57 | else: 58 | runthrough_FPM = 0.0 59 | FPM *= FPM >= min_FPM and runthrough_FPM/(runthrough_FPM+FPM) <= allowed_runthrough_fraction 60 | if seq_name.split('(')[0][:-2] in orf1_intact: 61 | orf1 += FPM 62 | if seq_name.split('(')[0][:-2] in orf2_intact: 63 | orf2 += FPM 64 | if seq_name.split('(')[0][:-2] in orf1_intact and seq_name.split('(')[0][:-2] in orf2_intact: 65 | both += FPM 66 | if 'L1HS' in seq_name: 67 | L1HS_exp += FPM 68 | print(sample_name +'\t'+ str(orf1) +'\t'+ str(orf2) +'\t'+ str(both) +'\t'+ str(L1HS_exp) +'\t'+ str(L1HS_all)) 69 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.5.12 2 | 3 | WORKDIR / 4 | 5 | RUN conda install -y --override-channels -c bioconda -c conda-forge -c defaults python=2.7.15 bwa=0.7.17 samtools=1.9 numpy=1.14.3 scipy=1.1.0 pysam=0.15.0 bedtools=2.27.1 6 | RUN git clone https://github.com/FenyoLab/L1EM/ 7 | 8 | -------------------------------------------------------------------------------- /L1EM.yml: -------------------------------------------------------------------------------- 1 | name: L1EM 2 | channels: 3 | - 
bioconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - python=2.7.15 8 | - bwa=0.7.17 9 | - samtools=1.9 10 | - numpy=1.14.3 11 | - scipy=1.1.0 12 | - pysam=0.15.0 13 | - bedtools=2.27.1 14 | 15 | -------------------------------------------------------------------------------- /L1EM/G_of_R.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | import numpy 4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | from scipy import sparse 10 | import datetime 11 | import argparse 12 | 13 | """ 14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference. 15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts 16 | and the entries are the likelihood of that read arising from that transcript. 17 | The matrix is pickled and saved. The column names are writted to a text file. 18 | 19 | Part of the L1-EM package. 20 | 21 | Copyright (C) 2019 Wilson McKerrow 22 | 23 | This program is free software: you can redistribute it and/or modify 24 | it under the terms of the GNU General Public License as published by 25 | the Free Software Foundation, either version 3 of the License, or 26 | (at your option) any later version. 27 | 28 | This program is distributed in the hope that it will be useful, 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 31 | GNU General Public License for more details. 32 | 33 | You should have received a copy of the GNU General Public License 34 | along with this program. If not, see . 
35 | 36 | """ 37 | 38 | """ 39 | This class stores relevant information about a read's potential alignment as a dictionary 40 | with references names as keys and as list of potential alignments to that reference name 41 | as values. 42 | """ 43 | class read_alignments(object): 44 | def __init__(self, alignment,rnames,P): 45 | self.alignments = dict() 46 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 47 | # Add a new alignment, passing a pysam aligned_segnment object. 48 | def add(self, alignment,rnames,P): 49 | if rnames[alignment.rname] not in self.alignments: 50 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 51 | else: 52 | self.alignments[rnames[alignment.rname]].append(alignment_at_name(alignment.reference_start,alignment.is_reverse,P)) 53 | # Add a new alignment, passing the output of parseXA. 54 | def addXA(self,refname,start,is_reverse,P): 55 | if refname not in self.alignments: 56 | self.alignments[refname] = [alignment_at_name(start,is_reverse,P)] 57 | else: 58 | self.alignments[refname].append(alignment_at_name(start,is_reverse,P)) 59 | 60 | # Stores position, strand and likelihood for an alignment. 61 | class alignment_at_name(object): 62 | def __init__(self,start,is_reverse,P): 63 | self.start = start 64 | self.is_reverse = is_reverse 65 | self.P = P 66 | 67 | # Read command line arguments 68 | def GetArgs(): 69 | 70 | def ParseArgs(parser): 71 | class Parser(argparse.ArgumentParser): 72 | def error(self, message): 73 | sys.stderr.write('error: %s\n' % message) 74 | self.print_help() 75 | sys.exit(2) 76 | 77 | parser.add_argument('-b', '--bamfile', 78 | type=str, 79 | required=True, 80 | help='Bam to generate alignments from. Required.') 81 | parser.add_argument('-e', '--error_prob', 82 | required=False, 83 | default=0.01, 84 | type=float, 85 | help='Probability of an alignment mismatch. 
[0.01]') 86 | parser.add_argument('-m', '--max_start2start_len', 87 | required=False, 88 | default=500, 89 | type=int, 90 | help='Maximium distance between read starts to be considered concordant. [500]') 91 | parser.add_argument('-r', '--reads_per_pickle', 92 | required=False, 93 | default=12500, 94 | type=int, 95 | help='Split output into chunks of this many reads. [12500]') 96 | parser.add_argument('-p', '--prefix', 97 | required=False, 98 | default='G_of_R', 99 | type=str, 100 | help='Prefix for output file(s) [G_of_R]') 101 | parser.add_argument('-n', '--NMdiff', 102 | required=False, 103 | default=2, 104 | type=int, 105 | help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]') 106 | parser.add_argument('-i', '--insert_mean', 107 | required=True, 108 | type=float, 109 | help='Median template length. Required.') 110 | parser.add_argument('--flanking', 111 | required=False, 112 | default=400, 113 | type=int, 114 | help='Number of flanking bases included on each end of repeats in reference fasta. [400]') 115 | parser.add_argument('--as_start', 116 | required=False, 117 | default=500, 118 | type=int, 119 | help='Position of the antisense TSS in L1. [500]') 120 | parser.add_argument('-w', '--wiggle', 121 | required=False, 122 | default=20, 123 | type=int, 124 | help='Extend L1 annotation this many bases in both directions. [20]') 125 | parser.add_argument('--min_len', 126 | required=False, 127 | default=500, 128 | type=int, 129 | help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]') 130 | parser.add_argument('--min_exon_len', 131 | required=False, 132 | default=100, 133 | type=int, 134 | help='When alignments probabilities are normalized for exon length take max of elements length and this value. 
[100]') 135 | return parser.parse_args() 136 | 137 | parser = argparse.ArgumentParser() 138 | args = ParseArgs(parser) 139 | 140 | return args.bamfile, args.error_prob, args.max_start2start_len, args.reads_per_pickle, args.prefix, args.NMdiff, args.insert_mean, args.flanking, args.as_start,args.wiggle, args.min_len, args.min_exon_len 141 | 142 | """ 143 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse 144 | row matrix with the likelihoods of all properly paired alignments. 145 | """ 146 | def get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len): 147 | this_G_of_R = numpy.zeros(5*nreps) 148 | for refname in alignments1.alignments: 149 | if refname not in alignments2.alignments: 150 | continue 151 | for aln1 in alignments1.alignments[refname]: 152 | for aln2 in alignments2.alignments[refname]: 153 | if aln1.is_reverse == aln2.is_reverse: 154 | continue 155 | if max(aln1.start,aln2.start)-min(aln1.start,aln2.start) <= max_start2start_len: 156 | has_5pUTR = refname.split('.')[1]=='1' 157 | if refname.split('.')[1]=='2': 158 | this_G_of_R[2*nreps+rnames_index[refname]] += aln1.P*aln2.P/(max(rlens[rnames_index[refname]]-insert_mean,min_exon_len)) 159 | continue 160 | is_sense = not aln2.is_reverse 161 | within_5p = min(aln1.start,aln2.start) > flanking -wiggle 162 | within_3p = max(aln1.start,aln2.start)+read_length < rlens[rnames_index[refname]]-flanking +wiggle 163 | overlap_element = max(aln1.start,aln2.start)+read_length > flanking and min(aln1.start,aln2.start) < rlens[rnames_index[refname]]-flanking 164 | if not overlap_element: 165 | continue 166 | if is_sense: 167 | this_G_of_R[rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+insert_mean+2*wiggle) 168 | if not is_sense: 169 | this_G_of_R[nreps+rnames_index[refname]] += 
aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+insert_mean+2*wiggle) 170 | if within_5p and within_3p and is_sense and has_5pUTR: 171 | this_G_of_R[2*nreps+rnames_index[refname]] += aln1.P*aln2.P/max(rlens[rnames_index[refname]]-2*flanking-insert_mean+2*wiggle,min_len) 172 | if within_5p and is_sense and has_5pUTR: 173 | this_G_of_R[3*nreps+rnames_index[refname]] += aln1.P*aln2.P/(rlens[rnames_index[refname]]-2*flanking+2*wiggle) 174 | if has_5pUTR and rlens[rnames_index[refname]] > flanking+as_start and max(aln1.start,aln2.start)+read_length < flanking+as_start and (not is_sense) and rlens[rnames_index[refname]] > flanking+as_start: 175 | this_G_of_R[4*nreps+rnames_index[refname]] += aln1.P*aln2.P/(as_start+insert_mean+wiggle) 176 | return sparse.csr_matrix(this_G_of_R) 177 | 178 | # Parse secondary alignments in the XA tag from bwa aln. 179 | def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed): 180 | for aln in [x.split(',') for x in XAtagdict.split(';')[:-1]]: 181 | refname = aln[0] 182 | #if not reversed: 183 | # is_reverse = aln[1][0] == '-' 184 | #else: 185 | # is_reverse = aln[1][0] == '+' 186 | is_reverse = aln[1][0] == '-' 187 | start = int(aln[1][1:]) 188 | cigarstring = aln[2] 189 | NM = int(aln[3]) 190 | if NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring: 191 | P = error_prob**NM 192 | alignments.addXA(refname,start,is_reverse,P) 193 | return alignments 194 | 195 | def main(): 196 | bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len = GetArgs() 197 | 198 | pickle_num = 0 199 | 200 | bam = pysam.Samfile(bamfile, "rb") 201 | rnames = bam.references 202 | rlens = bam.lengths 203 | nreps = len(rnames) 204 | rnames_index = dict() 205 | for i in range(nreps): 206 | rnames_index[rnames[i]] = i 207 | 208 | # Write transcript (column) names 209 | TEnamefile = open(prefix+'_TE_list.txt','w') 210 | for i in range(nreps): 211 | 
TEnamefile.write(rnames[i]+'_senserunthrough'+'\t'+str(rlens[i]+2*flanking)+'\n') 212 | for i in range(nreps): 213 | TEnamefile.write(rnames[i]+'_antisenserunthrough'+'\t'+str(rlens[i]+2*flanking)+'\n') 214 | for i in range(nreps): 215 | TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n') 216 | for i in range(nreps): 217 | TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n') 218 | for i in range(nreps): 219 | TEnamefile.write(rnames[i]+'_antisense'+'\t'+str(flanking+as_start)+'\n') 220 | TEnamefile.close() 221 | 222 | read_id = None 223 | 224 | G_of_R = None 225 | G_of_R_list_file = open(prefix+'_list.txt','w') 226 | G_of_R_row = 0 227 | 228 | starttime = datetime.datetime.now() 229 | 230 | # Read through the name sorted bam file 231 | for alignment in bam: 232 | read_length = alignment.query_length 233 | # Throw out alignments that are unmapped, clipped or low quality 234 | if alignment.is_unmapped: 235 | continue 236 | if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring: 237 | continue 238 | if numpy.mean(alignment.query_qualities) < 30: 239 | continue 240 | 241 | if not read_id: 242 | read_id = alignment.qname 243 | new_read_id1 = True 244 | new_read_id2 = True 245 | 246 | # Once we have read all entries for a given query name, create a row for that fragment 247 | if read_id != alignment.qname: 248 | if not (new_read_id1 or new_read_id2): 249 | this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len) 250 | if this_G_of_R.nnz > 0: 251 | if G_of_R_row > 0: 252 | G_of_R = sparse.vstack([G_of_R,this_G_of_R]) 253 | else: 254 | G_of_R = this_G_of_R 255 | G_of_R_row += 1 256 | # If necessary, break up matrix into multiple pickle files. 
257 | if G_of_R_row >= reads_per_pickle: 258 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 259 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 260 | pickle_num += 1 261 | G_of_R_row = 0 262 | G_of_R = None 263 | print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime)) 264 | starttime = datetime.datetime.now() 265 | 266 | read_id = alignment.qname 267 | new_read_id1 = True 268 | new_read_id2 = True 269 | 270 | # Parse primary alignment 271 | # There's a bug in bwa samse (0.7.17) when writing NM tag for overlapping read pairs 272 | NMtag = dict(alignment.tags)['XM'] 273 | for pair in alignment.cigartuples: 274 | NMtag += (pair[0]>0)*pair[1] 275 | P = error_prob**NMtag 276 | 277 | if alignment.is_read1: 278 | if new_read_id1: 279 | alignments1 = read_alignments(alignment,rnames,P) 280 | new_read_id1 = False 281 | else: 282 | alignments1.add(alignment,rnames,P) 283 | else: 284 | if new_read_id2: 285 | alignments2 = read_alignments(alignment,rnames,P) 286 | new_read_id2 = False 287 | else: 288 | alignments2.add(alignment,rnames,P) 289 | 290 | # Parse secondary alignments 291 | if 'XA' in dict(alignment.tags): 292 | if alignment.is_read1: 293 | alignments1 = parseXA(alignments1,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse) 294 | else: 295 | alignments2 = parseXA(alignments2,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse) 296 | 297 | # Make row for last read 298 | if read_id and not (new_read_id1 or new_read_id2): 299 | this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len) 300 | if this_G_of_R.nnz > 0: 301 | if G_of_R_row > 0: 302 | G_of_R = sparse.vstack([G_of_R,this_G_of_R]) 303 | else: 304 | G_of_R = this_G_of_R 305 | 306 | # Write matrix to disk. 
307 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 308 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 309 | print(G_of_R_row+reads_per_pickle*pickle_num) 310 | 311 | if __name__ == '__main__': 312 | main() 313 | -------------------------------------------------------------------------------- /L1EM/G_of_R_single_unstranded.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | import numpy 4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | from scipy import sparse 10 | import datetime 11 | import argparse 12 | 13 | """ 14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference. 15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts 16 | and the entries are the likelihood of that read arising from that transcript. 17 | The matrix is pickled and saved. The column names are writted to a text file. 18 | 19 | Copyright (C) 2019 Wilson McKerrow 20 | 21 | This program is free software: you can redistribute it and/or modify 22 | it under the terms of the GNU General Public License as published by 23 | the Free Software Foundation, either version 3 of the License, or 24 | (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, 27 | but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | GNU General Public License for more details. 30 | 31 | You should have received a copy of the GNU General Public License 32 | along with this program. If not, see . 
33 | 34 | """ 35 | 36 | """ 37 | This class stores relevant information about a read's potential alignment as a dictionary 38 | with references names as keys and as list of potential alignments to that reference name 39 | as values. 40 | """ 41 | class read_alignments(object): 42 | def __init__(self, alignment,rnames,P): 43 | self.alignments = dict() 44 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 45 | # Add a new alignment, passing a pysam aligned_segnment object. 46 | def add(self, alignment,rnames,P): 47 | if rnames[alignment.rname] not in self.alignments: 48 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 49 | else: 50 | self.alignments[rnames[alignment.rname]].append(alignment_at_name(alignment.reference_start,alignment.is_reverse,P)) 51 | # Add a new alignment, passing the output of parseXA. 52 | def addXA(self,refname,start,is_reverse,P): 53 | if refname not in self.alignments: 54 | self.alignments[refname] = [alignment_at_name(start,is_reverse,P)] 55 | else: 56 | self.alignments[refname].append(alignment_at_name(start,is_reverse,P)) 57 | 58 | # Stores position, strand and likelihood for an alignment. 59 | class alignment_at_name(object): 60 | def __init__(self,start,is_reverse,P): 61 | self.start = start 62 | self.is_reverse = is_reverse 63 | self.P = P 64 | 65 | # Read command line arguments 66 | def GetArgs(): 67 | 68 | def ParseArgs(parser): 69 | class Parser(argparse.ArgumentParser): 70 | def error(self, message): 71 | sys.stderr.write('error: %s\n' % message) 72 | self.print_help() 73 | sys.exit(2) 74 | 75 | parser.add_argument('-b', '--bamfile', 76 | type=str, 77 | required=True, 78 | help='Bam to generate alignments from. Required.') 79 | parser.add_argument('-e', '--error_prob', 80 | required=False, 81 | default=0.01, 82 | type=float, 83 | help='Probability of an alignment mismatch. 
[0.01]') 84 | parser.add_argument('-r', '--reads_per_pickle', 85 | required=False, 86 | default=12500, 87 | type=int, 88 | help='Split output into chunks of this many reads. [12500]') 89 | parser.add_argument('-p', '--prefix', 90 | required=False, 91 | default='G_of_R', 92 | type=str, 93 | help='Prefix for output file(s) [G_of_R]') 94 | parser.add_argument('-n', '--NMdiff', 95 | required=False, 96 | default=2, 97 | type=int, 98 | help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]') 99 | parser.add_argument('--flanking', 100 | required=False, 101 | default=400, 102 | type=int, 103 | help='Number of flanking bases included on each end of repeats in reference fasta. [400]') 104 | parser.add_argument('-w', '--wiggle', 105 | required=False, 106 | default=20, 107 | type=int, 108 | help='Extend L1 annotation this many bases in both directions. [20]') 109 | parser.add_argument('--min_len', 110 | required=False, 111 | default=500, 112 | type=int, 113 | help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]') 114 | parser.add_argument('--min_exon_len', 115 | required=False, 116 | default=100, 117 | type=int, 118 | help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]') 119 | return parser.parse_args() 120 | 121 | parser = argparse.ArgumentParser() 122 | args = ParseArgs(parser) 123 | 124 | return args.bamfile, args.error_prob, args.reads_per_pickle, args.prefix, args.NMdiff, args.flanking, args.wiggle, args.min_len, args.min_exon_len 125 | 126 | """ 127 | Takes as input alignments (read_alignments class) of two paired reads and returns a sparse 128 | row matrix with the likelihoods of all properly paired alignments. 
# NOTE(review): the docstring fragment above was copy-pasted from the
# paired-end version ("two paired reads ... properly paired alignments");
# the accurate single-end contract is documented on the function itself.
def make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len):
    """Build one sparse likelihood row for a single (unpaired) read.

    Takes the alignments (read_alignments class) of ONE read and returns a
    1 x (3*nreps) scipy CSR row of alignment likelihoods, where columns
    [0, nreps) are the '_runthrough' transcripts, [nreps, 2*nreps) the
    '_only' transcripts and [2*nreps, 3*nreps) the '_3prunon' transcripts
    (matching the column order written to the *_TE_list.txt file).
    Each likelihood is the alignment probability aln.P divided by the
    effective length of the candidate transcript.
    """
    this_G_of_R = numpy.zeros(3*nreps)
    for refname in alignments.alignments:
        for aln in alignments.alignments[refname]:
            # A '.1' in the second dot-field of the reference name flags an
            # element with an intact 5' UTR (eligible for only/3prunon).
            has_5pUTR = refname.split('.')[1]=='1'
            # Read must start past the 5' flank and end before the 3' flank
            # (each relaxed by 'wiggle' bases) to count as within the element.
            within_5p = aln.start > flanking -wiggle
            within_3p = aln.start+read_length < rlens[rnames_index[refname]]-flanking +wiggle
            # Reads entirely inside the flanking sequence are discarded.
            overlap_element = aln.start+read_length > flanking and aln.start < rlens[rnames_index[refname]]-flanking
            if not overlap_element:
                continue
            # Runthrough: any overlapping read, normalized by full span.
            this_G_of_R[rnames_index[refname]] += aln.P/(rlens[rnames_index[refname]]-2*flanking+read_length+2*wiggle)
            # Only (5' and 3' intact): floor the denominator at min_len.
            if within_5p and within_3p and has_5pUTR:
                this_G_of_R[1*nreps+rnames_index[refname]] += aln.P/max(rlens[rnames_index[refname]]-2*flanking-read_length+2*wiggle,min_len)
            # 3' runon: 5' end intact, may extend past the 3' boundary.
            if within_5p and has_5pUTR:
                this_G_of_R[2*nreps+rnames_index[refname]] += aln.P/(rlens[rnames_index[refname]]-2*flanking+2*wiggle)
    return sparse.csr_matrix(this_G_of_R)

# Parse secondary alignments in the XA tag from bwa aln.
148 | def parseXA(alignments,XAtagdict,error_prob,maxNM,reversed): 149 | for aln in [x.split(',') for x in XAtagdict.split(';')[:-1]]: 150 | refname = aln[0] 151 | #if not reversed: 152 | # is_reverse = aln[1][0] == '-' 153 | #else: 154 | # is_reverse = aln[1][0] == '+' 155 | is_reverse = aln[1][0] == '-' 156 | start = int(aln[1][1:]) 157 | cigarstring = aln[2] 158 | NM = int(aln[3]) 159 | if NM <= maxNM and 'S' not in cigarstring and 'H' not in cigarstring: 160 | P = error_prob**NM 161 | alignments.addXA(refname,start,is_reverse,P) 162 | return alignments 163 | 164 | def main(): 165 | bamfile, error_prob, reads_per_pickle, prefix, NMdiff, flanking, wiggle, min_len, min_exon_len = GetArgs() 166 | 167 | pickle_num = 0 168 | 169 | bam = pysam.Samfile(bamfile, "rb") 170 | rnames = bam.references 171 | rlens = bam.lengths 172 | nreps = len(rnames) 173 | rnames_index = dict() 174 | for i in range(nreps): 175 | rnames_index[rnames[i]] = i 176 | 177 | # Write transcript (column) names 178 | TEnamefile = open(prefix+'_TE_list.txt','w') 179 | for i in range(nreps): 180 | TEnamefile.write(rnames[i]+'_runthrough'+'\t'+str(rlens[i]+2*flanking)+'\n') 181 | for i in range(nreps): 182 | TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n') 183 | for i in range(nreps): 184 | TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n') 185 | TEnamefile.close() 186 | 187 | read_id = None 188 | 189 | G_of_R = None 190 | G_of_R_list_file = open(prefix+'_list.txt','w') 191 | G_of_R_row = 0 192 | 193 | starttime = datetime.datetime.now() 194 | 195 | # Read through the name sorted bam file 196 | for alignment in bam: 197 | read_length = alignment.query_length 198 | # Throw out alignments that are unmapped, clipped or low quality 199 | if alignment.is_unmapped: 200 | continue 201 | if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring: 
202 | continue 203 | if numpy.mean(alignment.query_qualities) < 30: 204 | continue 205 | 206 | if not read_id: 207 | read_id = alignment.qname 208 | new_read_id = True 209 | 210 | # Once we have read all entries for a given query name, create a row for that fragment 211 | if read_id != alignment.qname: 212 | if not new_read_id: 213 | this_G_of_R = make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len) 214 | # Don't add row if its empty 215 | if this_G_of_R.nnz > 0: 216 | if G_of_R_row > 0: 217 | G_of_R = sparse.vstack([G_of_R,this_G_of_R]) 218 | else: 219 | G_of_R = this_G_of_R 220 | G_of_R_row += 1 221 | # If necessary, break up matrix into multiple pickle files. 222 | if G_of_R_row >= reads_per_pickle: 223 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 224 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 225 | pickle_num += 1 226 | G_of_R_row = 0 227 | G_of_R = None 228 | print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime)) 229 | starttime = datetime.datetime.now() 230 | 231 | read_id = alignment.qname 232 | new_read_id = True 233 | 234 | # Parse primary alignment 235 | # There's a bug in bwa samse (0.7.17) when writing NM tag for overlapping read pairs 236 | NMtag = dict(alignment.tags)['NM'] 237 | P = error_prob**NMtag 238 | 239 | if new_read_id: 240 | alignments = read_alignments(alignment,rnames,P) 241 | new_read_id = False 242 | else: 243 | alignments.add(alignment,rnames,P) 244 | 245 | # Parse secondary alignments 246 | if 'XA' in dict(alignment.tags): 247 | alignments = parseXA(alignments,dict(alignment.tags)['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse) 248 | 249 | # Make row for last read 250 | if not new_read_id: 251 | this_G_of_R = make_G_of_R_row(alignments,rnames_index,rlens,nreps,read_length,flanking,wiggle,min_len,min_exon_len) 252 | if this_G_of_R.nnz > 0: 253 | if G_of_R_row > 0: 254 | G_of_R 
= sparse.vstack([G_of_R,this_G_of_R]) 255 | else: 256 | G_of_R = this_G_of_R 257 | 258 | # Write matrix to disk. 259 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 260 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 261 | print(G_of_R_row+reads_per_pickle*pickle_num) 262 | 263 | if __name__ == '__main__': 264 | main() 265 | -------------------------------------------------------------------------------- /L1EM/G_of_R_unstranded.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | import numpy 4 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 5 | try: 6 | import cPickle as pickle 7 | except ImportError: 8 | import pickle 9 | from scipy import sparse 10 | import datetime 11 | import argparse 12 | 13 | """ 14 | This script reads through a bam file resulting from a bwa aln alignment to the L1EM reference. 15 | The output is a sparse matrix in which the rows are reads, the columns are transcripts 16 | and the entries are the likelihood of that read arising from that transcript. 17 | The matrix is pickled and saved. The column names are writted to a text file. 18 | 19 | Copyright (C) 2019 Wilson McKerrow 20 | 21 | This program is free software: you can redistribute it and/or modify 22 | it under the terms of the GNU General Public License as published by 23 | the Free Software Foundation, either version 3 of the License, or 24 | (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, 27 | but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | GNU General Public License for more details. 30 | 31 | You should have received a copy of the GNU General Public License 32 | along with this program. If not, see . 
33 | 34 | """ 35 | 36 | """ 37 | This class stores relevant information about a read's potential alignment as a dictionary 38 | with references names as keys and as list of potential alignments to that reference name 39 | as values. 40 | """ 41 | class read_alignments(object): 42 | def __init__(self, alignment,rnames,P): 43 | self.alignments = dict() 44 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 45 | # Add a new alignment, passing a pysam aligned_segnment object. 46 | def add(self, alignment,rnames,P): 47 | if rnames[alignment.rname] not in self.alignments: 48 | self.alignments[rnames[alignment.rname]] = [alignment_at_name(alignment.reference_start,alignment.is_reverse,P)] 49 | else: 50 | self.alignments[rnames[alignment.rname]].append(alignment_at_name(alignment.reference_start,alignment.is_reverse,P)) 51 | # Add a new alignment, passing the output of parseXA. 52 | def addXA(self,refname,start,is_reverse,P): 53 | if refname not in self.alignments: 54 | self.alignments[refname] = [alignment_at_name(start,is_reverse,P)] 55 | else: 56 | self.alignments[refname].append(alignment_at_name(start,is_reverse,P)) 57 | 58 | # Stores position, strand and likelihood for an alignment. 59 | class alignment_at_name(object): 60 | def __init__(self,start,is_reverse,P): 61 | self.start = start 62 | self.is_reverse = is_reverse 63 | self.P = P 64 | 65 | # Read command line arguments 66 | def GetArgs(): 67 | 68 | def ParseArgs(parser): 69 | class Parser(argparse.ArgumentParser): 70 | def error(self, message): 71 | sys.stderr.write('error: %s\n' % message) 72 | self.print_help() 73 | sys.exit(2) 74 | 75 | parser.add_argument('-b', '--bamfile', 76 | type=str, 77 | required=True, 78 | help='Bam to generate alignments from. Required.') 79 | parser.add_argument('-e', '--error_prob', 80 | required=False, 81 | default=0.01, 82 | type=float, 83 | help='Probability of an alignment mismatch. 
# Read command line arguments
def GetArgs():
    """Parse command line options and return them as a flat tuple.

    NOTE(review): the original defined an unused argparse.ArgumentParser
    subclass with a custom error() that was never instantiated, so its
    print-help-on-error behavior never ran; the dead class is removed.
    """

    def ParseArgs(parser):
        parser.add_argument('-b', '--bamfile',
                            type=str,
                            required=True,
                            help='Bam to generate alignments from. Required.')
        parser.add_argument('-e', '--error_prob',
                            required=False,
                            default=0.01,
                            type=float,
                            help='Probability of an alignment mismatch. [0.01]')
        parser.add_argument('-m', '--max_start2start_len',
                            required=False,
                            default=500,
                            type=int,
                            help='Maximium distance between read starts to be considered concordant. [500]')
        parser.add_argument('-r', '--reads_per_pickle',
                            required=False,
                            default=12500,
                            type=int,
                            help='Split output into chunks of this many reads. [12500]')
        parser.add_argument('-p', '--prefix',
                            required=False,
                            default='G_of_R',
                            type=str,
                            help='Prefix for output file(s) [G_of_R]')
        parser.add_argument('-n', '--NMdiff',
                            required=False,
                            default=2,
                            type=int,
                            help='Ignore alignments with edit distance that exceed the best alignment by more than this number. [2]')
        parser.add_argument('-i', '--insert_mean',
                            required=True,
                            type=float,
                            help='Median template length. Required.')
        parser.add_argument('--flanking',
                            required=False,
                            default=400,
                            type=int,
                            help='Number of flanking bases included on each end of repeats in reference fasta. [400]')
        parser.add_argument('--as_start',
                            required=False,
                            default=500,
                            type=int,
                            help='Position of the antisense TSS in L1. [500]')
        parser.add_argument('-w', '--wiggle',
                            required=False,
                            default=20,
                            type=int,
                            help='Extend L1 annotation this many bases in both directions. [20]')
        parser.add_argument('--min_len',
                            required=False,
                            default=500,
                            type=int,
                            help='When alignments probabilities are normalized for element length take max of elements length and this value. [500]')
        parser.add_argument('--min_exon_len',
                            required=False,
                            default=100,
                            type=int,
                            help='When alignments probabilities are normalized for exon length take max of elements length and this value. [100]')
        return parser.parse_args()

    parser = argparse.ArgumentParser()
    args = ParseArgs(parser)

    return args.bamfile, args.error_prob, args.max_start2start_len, args.reads_per_pickle, args.prefix, args.NMdiff, args.insert_mean, args.flanking, args.as_start, args.wiggle, args.min_len, args.min_exon_len


"""
Takes as input alignments (read_alignments class) of two paired reads and returns a sparse
row matrix with the likelihoods of all properly paired alignments.
"""
def get_concardant_alignments(alignments1, alignments2, max_start2start_len, rnames_index,
                              rlens, insert_mean, nreps, read_length, flanking, as_start,
                              wiggle, min_len, min_exon_len):
    """Build one row of G(R) for a concordant read pair.

    The row has 3*nreps entries: columns [0, nreps) are the 'runthrough'
    transcripts, [nreps, 2*nreps) the 'only' transcripts and
    [2*nreps, 3*nreps) the "3' runon" transcripts, matching the order the
    TE list file is written in.  Each entry is the pair likelihood divided
    by the effective length of the corresponding transcript.

    NOTE(review): as_start is accepted for interface compatibility but is
    not used by this (unstranded) variant.
    """
    this_G_of_R = numpy.zeros(3 * nreps)
    for refname in alignments1.alignments:
        # Both mates must hit the same reference.
        if refname not in alignments2.alignments:
            continue
        for aln1 in alignments1.alignments[refname]:
            for aln2 in alignments2.alignments[refname]:
                # Proper pairs have mates on opposite strands.
                if aln1.is_reverse == aln2.is_reverse:
                    continue
                if max(aln1.start, aln2.start) - min(aln1.start, aln2.start) <= max_start2start_len:
                    # Reference names look like FAMILY.<flag>.locus; flag '1'
                    # marks an element with an intact 5' UTR, flag '2' an exon.
                    has_5pUTR = refname.split('.')[1] == '1'
                    if refname.split('.')[1] == '2':
                        this_G_of_R[2 * nreps + rnames_index[refname]] += aln1.P * aln2.P / (max(rlens[rnames_index[refname]] - insert_mean, min_exon_len))
                        continue
                    within_5p = min(aln1.start, aln2.start) > flanking - wiggle
                    within_3p = max(aln1.start, aln2.start) + read_length < rlens[rnames_index[refname]] - flanking + wiggle
                    overlap_element = max(aln1.start, aln2.start) + read_length > flanking and min(aln1.start, aln2.start) < rlens[rnames_index[refname]] - flanking
                    if not overlap_element:
                        continue
                    # Runthrough transcription: anywhere over the element.
                    this_G_of_R[rnames_index[refname]] += aln1.P * aln2.P / (rlens[rnames_index[refname]] - 2 * flanking + insert_mean + 2 * wiggle)
                    # Proper (element-only) transcription: fully inside.
                    if within_5p and within_3p and has_5pUTR:
                        this_G_of_R[1 * nreps + rnames_index[refname]] += aln1.P * aln2.P / max(rlens[rnames_index[refname]] - 2 * flanking - insert_mean + 2 * wiggle, min_len)
                    # 3' runon: starts inside the element, may run past the 3' end.
                    if within_5p and has_5pUTR:
                        this_G_of_R[2 * nreps + rnames_index[refname]] += aln1.P * aln2.P / (rlens[rnames_index[refname]] - 2 * flanking + 2 * wiggle)
    return sparse.csr_matrix(this_G_of_R)
# Parse secondary alignments in the XA tag from bwa aln.
def parseXA(alignments, XAtagdict, error_prob, maxNM, reversed):
    """Add each usable XA-tag secondary alignment to *alignments*.

    XA entries look like 'refname,[+-]pos,cigar,NM;...'.  Entries that are
    clipped ('S' or 'H' in the cigar) or whose edit distance exceeds maxNM
    are skipped.  The *reversed* flag of the primary alignment is accepted
    for interface compatibility but unused: the XA strand sign is taken at
    face value.  Returns *alignments*.
    """
    # The tag ends with a trailing ';', so the final split field is empty.
    for fields in (entry.split(',') for entry in XAtagdict.split(';')[:-1]):
        cigarstring = fields[2]
        edit_dist = int(fields[3])
        if edit_dist > maxNM or 'S' in cigarstring or 'H' in cigarstring:
            continue
        # fields[1] is the strand sign followed by the 1-based position.
        alignments.addXA(fields[0], int(fields[1][1:]), fields[1][0] == '-',
                         error_prob ** edit_dist)
    return alignments
def main():
    """Stream a name-sorted bwa bam and build the G(R) likelihood matrix.

    Rows are read pairs, columns are the 3*nreps candidate transcripts
    (runthrough / only / 3' runon for each reference).  The matrix is
    pickled to disk in chunks of reads_per_pickle rows; chunk paths are
    listed in <prefix>_list.txt and column names in <prefix>_TE_list.txt.
    """
    bamfile, error_prob, max_start2start_len, reads_per_pickle, prefix, NMdiff, insert_mean, flanking, as_start, wiggle, min_len, min_exon_len = GetArgs()

    pickle_num = 0

    bam = pysam.Samfile(bamfile, "rb")
    rnames = bam.references
    rlens = bam.lengths
    nreps = len(rnames)
    rnames_index = dict()
    for i in range(nreps):
        rnames_index[rnames[i]] = i

    # Write transcript (column) names: one runthrough, one element-only and
    # one 3' runon entry per reference, matching the G(R) column blocks.
    TEnamefile = open(prefix+'_TE_list.txt','w')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_runthrough'+'\t'+str(rlens[i]+2*flanking)+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_only'+'\t'+str(rlens[i])+'\n')
    for i in range(nreps):
        TEnamefile.write(rnames[i]+'_3prunon'+'\t'+str(rlens[i]+flanking)+'\n')
    TEnamefile.close()

    read_id = None

    G_of_R = None
    G_of_R_list_file = open(prefix+'_list.txt','w')
    G_of_R_row = 0

    starttime = datetime.datetime.now()

    # Read through the name sorted bam file
    for alignment in bam:
        read_length = alignment.query_length
        # Throw out alignments that are unmapped, clipped or low quality
        if alignment.is_unmapped:
            continue
        if 'N' in alignment.cigarstring or 'S' in alignment.cigarstring or 'H' in alignment.cigarstring or 'P' in alignment.cigarstring or '=' in alignment.cigarstring or 'X' in alignment.cigarstring:
            continue
        if numpy.mean(alignment.query_qualities) < 30:
            continue

        if not read_id:
            read_id = alignment.qname
            new_read_id1 = True
            new_read_id2 = True

        # Once we have read all entries for a given query name, create a row
        # for that fragment.
        if read_id != alignment.qname:
            if not (new_read_id1 or new_read_id2):
                this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
                if this_G_of_R.nnz > 0:
                    if G_of_R_row > 0:
                        G_of_R = sparse.vstack([G_of_R,this_G_of_R])
                    else:
                        G_of_R = this_G_of_R
                    G_of_R_row += 1
            # If necessary, break up matrix into multiple pickle files.
            if G_of_R_row >= reads_per_pickle:
                pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
                G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
                pickle_num += 1
                G_of_R_row = 0
                G_of_R = None
                print('wrote '+str(reads_per_pickle)+' reads in '+str(datetime.datetime.now()-starttime))
                starttime = datetime.datetime.now()

            read_id = alignment.qname
            new_read_id1 = True
            new_read_id2 = True

        # Parse primary alignment.
        # There's a bug in bwa samse (0.7.17) when writing the NM tag for
        # overlapping read pairs, so recompute edit distance as XM plus
        # indel lengths from the cigar.
        tags = dict(alignment.tags)  # hoisted: was rebuilt three times per record
        NMtag = tags['XM']
        for pair in alignment.cigartuples:
            NMtag += (pair[0]>0)*pair[1]
        P = error_prob**NMtag

        if alignment.is_read1:
            if new_read_id1:
                alignments1 = read_alignments(alignment,rnames,P)
                new_read_id1 = False
            else:
                alignments1.add(alignment,rnames,P)
        else:
            if new_read_id2:
                alignments2 = read_alignments(alignment,rnames,P)
                new_read_id2 = False
            else:
                alignments2.add(alignment,rnames,P)

        # Parse secondary alignments listed in bwa's XA tag.
        if 'XA' in tags:
            if alignment.is_read1:
                alignments1 = parseXA(alignments1,tags['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)
            else:
                alignments2 = parseXA(alignments2,tags['XA'],error_prob,NMtag+NMdiff,alignment.is_reverse)

    # Make row for last read
    if read_id is not None and not (new_read_id1 or new_read_id2):
        this_G_of_R = get_concardant_alignments(alignments1,alignments2,max_start2start_len,rnames_index,rlens,insert_mean,nreps,read_length,flanking,as_start,wiggle,min_len,min_exon_len)
        if this_G_of_R.nnz > 0:
            if G_of_R_row > 0:
                G_of_R = sparse.vstack([G_of_R,this_G_of_R])
            else:
                G_of_R = this_G_of_R

    # Write the final (partial) matrix to disk.  Guard on the matrix itself
    # rather than the running read count so that an empty tail chunk no
    # longer pickles None (downstream tolerated it, but it wrote junk files).
    if G_of_R is not None:
        pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL)
        G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n')
    G_of_R_list_file.close()
    print(G_of_R_row+reads_per_pickle*pickle_num)

if __name__ == '__main__':
    main()
295 | if G_of_R_row+reads_per_pickle*pickle_num >0: 296 | pickle.dump(G_of_R,open(prefix+'.'+str(pickle_num)+'.pk2','wb'),protocol=pickle.HIGHEST_PROTOCOL) 297 | G_of_R_list_file.write(prefix+'.'+str(pickle_num)+'.pk2\n') 298 | print(G_of_R_row+reads_per_pickle*pickle_num) 299 | 300 | if __name__ == '__main__': 301 | main() 302 | -------------------------------------------------------------------------------- /L1EM/L1EM.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import numpy 7 | import sys 8 | import datetime 9 | from scipy import sparse 10 | from multiprocessing import Pool 11 | import argparse 12 | 13 | """ 14 | This code takes as input the output of G_of_R.py and runs the EM algorithm to estimate 15 | transcript abundances. 16 | 17 | Part of the L1-EM package. 18 | 19 | Copyright (C) 2019 Wilson McKerrow 20 | 21 | This program is free software: you can redistribute it and/or modify 22 | it under the terms of the GNU General Public License as published by 23 | the Free Software Foundation, either version 3 of the License, or 24 | (at your option) any later version. 25 | 26 | This program is distributed in the hope that it will be useful, 27 | but WITHOUT ANY WARRANTY; without even the implied warranty of 28 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 29 | GNU General Public License for more details. 30 | 31 | You should have received a copy of the GNU General Public License 32 | along with this program. If not, see . 
33 | """ 34 | 35 | # Main calculation for the E step 36 | def calculate_expcounts(G_of_R_pkl,X): 37 | G_of_R_file = open(G_of_R_pkl,'rb') 38 | G_of_R = pickle.load(G_of_R_file) 39 | G_of_R_file.close() 40 | if G_of_R == None: 41 | return 0.0,0.0 42 | L_of_R_mat = G_of_R.multiply(X) 43 | L_of_R = numpy.array(L_of_R_mat.sum(1)) 44 | L_of_R_mat = L_of_R_mat[L_of_R[:,0]>=10**-200,:] 45 | L_of_R = L_of_R[L_of_R>=10**-200] 46 | L_of_R_inv = sparse.csr_matrix(1.0/L_of_R).transpose() 47 | exp_counts = L_of_R_mat.multiply(L_of_R_inv).sum(0) 48 | loglik = numpy.sum(numpy.log(L_of_R)) 49 | if numpy.isfinite(loglik): 50 | return exp_counts,loglik 51 | else: 52 | return numpy.zeros(G_of_R.shape[1]),0.0 53 | 54 | # Divide send each thread a chunk of the G_of_R pkl files. 55 | def calculate_expcounts_chunk(input): 56 | G_of_R_pkl_list,X_len = input 57 | exp_counts = numpy.zeros(X_len.shape,dtype=numpy.float64) 58 | loglik = 0.0 59 | for G_of_R_pkl in G_of_R_pkl_list: 60 | this_exp_counts,this_loglik = calculate_expcounts(G_of_R_pkl,X_len) 61 | exp_counts += this_exp_counts 62 | loglik += this_loglik 63 | return exp_counts,loglik 64 | 65 | # Parse commandline arguments 66 | def GetArgs(): 67 | 68 | def ParseArgs(parser): 69 | class Parser(argparse.ArgumentParser): 70 | def error(self, message): 71 | sys.stderr.write('error: %s\n' % message) 72 | self.print_help() 73 | sys.exit(2) 74 | 75 | parser.add_argument('-g', '--G_of_R_list', 76 | type=str, 77 | required=True, 78 | help='Text file listing paths to chunks of the G(R) matrix.') 79 | parser.add_argument('-l', '--TE_list', 80 | required=True, 81 | type=str, 82 | help='Text file listing the names of all transcripts. Output of G_of_R.py.') 83 | parser.add_argument('-s', '--stop_thresh', 84 | required=False, 85 | default=10**-7, 86 | type=float, 87 | help='Continue EM iterations until no transcription expression fraction (X_i) changes by more than this value. 
# Parse commandline arguments
def GetArgs():
    """Parse command line options for the EM driver and return a flat tuple.

    NOTE(review): the original defined an unused argparse.ArgumentParser
    subclass with a custom error() that was never instantiated; the dead
    class is removed.
    """

    def ParseArgs(parser):
        parser.add_argument('-g', '--G_of_R_list',
                            type=str,
                            required=True,
                            help='Text file listing paths to chunks of the G(R) matrix.')
        parser.add_argument('-l', '--TE_list',
                            required=True,
                            type=str,
                            help='Text file listing the names of all transcripts. Output of G_of_R.py.')
        parser.add_argument('-s', '--stop_thresh',
                            required=False,
                            default=10**-7,
                            type=float,
                            help='Continue EM iterations until no transcription expression fraction (X_i) changes by more than this value. [1e-7]')
        parser.add_argument('-r', '--report_every',
                            required=False,
                            default=100,
                            type=int,
                            help='Write X every 100 steps. [100]')
        parser.add_argument('-m', '--max_nEMsteps',
                            required=False,
                            default=10000,
                            type=int,
                            help='Terminate if threshold has not been reached after this many EM steps [10000]')
        parser.add_argument('-t', '--nThreads',
                            required=False,
                            default=16,
                            type=int,
                            help='Divide E step into this many threads. [16]')
        parser.add_argument('-p', '--prefix',
                            required=False,
                            type=str,
                            default='',
                            help='If specified, this prefix will be used for output files.')
        return parser.parse_args()

    parser = argparse.ArgumentParser()
    args = ParseArgs(parser)

    return args.G_of_R_list, args.TE_list, args.stop_thresh, args.report_every, args.max_nEMsteps, args.nThreads, args.prefix


def main():
    """Run EM until convergence and pickle the expression estimates."""
    G_of_R_list, TE_list, stop_thresh, report_every, max_nEMsteps, nThreads, prefix = GetArgs()

    # All the transcripts names in the same order as the G_of_R matrix columns
    TE_names = list()
    for name in open(TE_list):
        TE_names.append(name.strip().split('\t')[0])

    # Initial guess: uniform expression over all transcripts.
    X = sparse.csr_matrix(numpy.ones((1,len(TE_names)),dtype=numpy.float64)/len(TE_names))

    # Split up the pickle files into a (nearly equal) set for each thread:
    # the first nlistsp1 threads get one extra file.
    G_of_R_pkl_fulllist = list()
    for G_of_R_pkl in open(G_of_R_list):
        G_of_R_pkl_fulllist.append(G_of_R_pkl.strip())
    G_of_R_pkl_lists = list()
    listsize = len(G_of_R_pkl_fulllist)//nThreads
    nlistsp1 = len(G_of_R_pkl_fulllist)%nThreads
    k = 0
    for i in range(nlistsp1):
        G_of_R_pkl_lists.append(G_of_R_pkl_fulllist[k:k+listsize+1])
        k+=listsize+1
    for i in range(nlistsp1,nThreads):
        G_of_R_pkl_lists.append(G_of_R_pkl_fulllist[k:k+listsize])
        k+=listsize

    masterPool = Pool(processes = nThreads)

    # Run the EM steps
    for step in range(max_nEMsteps):
        starttime = datetime.datetime.now()
        exp_counts = numpy.zeros((1,len(TE_names)),dtype=numpy.float64)
        loglik = 0.0

        # E step: expected counts summed over all chunks, in parallel.
        outputs = masterPool.map(calculate_expcounts_chunk,zip(G_of_R_pkl_lists,[X]*nThreads))
        for output in outputs:
            this_exp_counts,this_loglik = output
            exp_counts += this_exp_counts
            loglik += this_loglik

        # M step: new expression fractions proportional to expected counts.
        last_X = X.copy()
        X = sparse.csr_matrix(exp_counts/numpy.sum(exp_counts))
        print(str(step)+" "+str(numpy.max(numpy.abs(X.toarray()-last_X.toarray())))+" "+str(loglik)+" "+str(datetime.datetime.now()-starttime))

        # Periodic checkpoint of the nonzero entries of X.
        if (step+1) % report_every == 0:
            pickle.dump(X.toarray()[X.toarray() > 10**-10],open(prefix+'X_step_'+str(step+1)+'.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(numpy.array(TE_names)[X.toarray()[0,:] > 10**-10],open(prefix+'names_step_'+str(step+1)+'.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)

        if numpy.max(numpy.abs(X.toarray()-last_X.toarray())) < stop_thresh:
            break

    # Release the worker processes (the original leaked the pool).
    masterPool.close()
    masterPool.join()

    # Output the final results (nonzero entries only)
    pickle.dump(X.toarray()[X.toarray() > 10**-10],open(prefix+'X_final.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(numpy.array(TE_names)[X.toarray()[0,:] > 10**-10],open(prefix+'names_final.pkl','wb'),protocol=pickle.HIGHEST_PROTOCOL)

if __name__ == '__main__':
    main()
'__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | TERMS AND CONDITIONS 2 | 3 | 0. Definitions. 4 | 5 | “This License” refers to version 3 of the GNU General Public License. 6 | 7 | “Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. 8 | 9 | “The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. 10 | 11 | To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. 12 | 13 | A “covered work” means either the unmodified Program or a work based on the Program. 14 | 15 | To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. 16 | 17 | To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 
18 | 19 | An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 20 | 21 | 1. Source Code. 22 | 23 | The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. 24 | 25 | A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 26 | 27 | The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 28 | 29 | The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. 
However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. 30 | 31 | The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. 32 | 33 | The Corresponding Source for a work in source code form is that same work. 34 | 35 | 2. Basic Permissions. 36 | 37 | All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. 38 | 39 | You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. 
Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 40 | 41 | Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 42 | 43 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 44 | 45 | No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 46 | 47 | When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 48 | 49 | 4. Conveying Verbatim Copies. 50 | 51 | You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. 52 | 53 | You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 54 | 55 | 5. Conveying Modified Source Versions. 
56 | 57 | You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: 58 | 59 | a) The work must carry prominent notices stating that you modified it, and giving a relevant date. 60 | b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to “keep intact all notices”. 61 | c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. 62 | d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 63 | A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an “aggregate” if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 64 | 65 | 6. Conveying Non-Source Forms. 
66 | 67 | You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: 68 | 69 | a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 70 | b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. 71 | c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 72 | d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. 
If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. 73 | e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 74 | A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. 75 | 76 | A “User Product” is either (1) a “consumer product”, which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, “normally used” refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. 77 | 78 | “Installation Information” for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. 
The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 79 | 80 | If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). 81 | 82 | The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 83 | 84 | Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 85 | 86 | 7. Additional Terms. 87 | 88 | “Additional permissions” are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. 
If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 89 | 90 | When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 91 | 92 | Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: 93 | 94 | a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or 95 | b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or 96 | c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or 97 | d) Limiting the use for publicity purposes of names of licensors or authors of the material; or 98 | e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or 99 | f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. 100 | All other non-permissive additional terms are considered “further restrictions” within the meaning of section 10. 
If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 101 | 102 | If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. 103 | 104 | Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 105 | 106 | 8. Termination. 107 | 108 | You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). 109 | 110 | However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. 111 | 112 | Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. 
113 | 114 | Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 115 | 116 | 9. Acceptance Not Required for Having Copies. 117 | 118 | You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 119 | 120 | 10. Automatic Licensing of Downstream Recipients. 121 | 122 | Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. 123 | 124 | An “entity transaction” is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. 
125 | 126 | You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 127 | 128 | 11. Patents. 129 | 130 | A “contributor” is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's “contributor version”. 131 | 132 | A contributor's “essential patent claims” are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, “control” includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. 133 | 134 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. 135 | 136 | In the following three paragraphs, a “patent license” is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To “grant” such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
137 | 138 | If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. “Knowingly relying” means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. 139 | 140 | If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. 141 | 142 | A patent license is “discriminatory” if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. 143 | 144 | Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 145 | 146 | 12. No Surrender of Others' Freedom. 147 | 148 | If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 149 | 150 | 13. Use with the GNU Affero General Public License. 151 | 152 | Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 153 | 154 | 14. Revised Versions of this License. 155 | 156 | The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. 157 | 158 | Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License “or any later version” applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. 159 | 160 | If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. 161 | 162 | Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 163 | 164 | 15. Disclaimer of Warranty. 165 | 166 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 167 | 168 | 16. Limitation of Liability. 169 | 170 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 171 | 172 | 17. Interpretation of Sections 15 and 16. 173 | 174 | If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | ### conda way 3 | You will need 4 | 1. git (https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) 5 | 2. 
anaconda (https://docs.anaconda.com/anaconda/install/) 6 | 7 | Download from github 8 | ``` 9 | git clone https://github.com/FenyoLab/L1EM 10 | ``` 11 | Create conda environment 12 | ``` 13 | cd L1EM 14 | conda env create -f L1EM.yml 15 | ``` 16 | 17 | Before running L1EM, activate the environment: 18 | ``` 19 | source activate L1EM 20 | ``` 21 | 22 | When finished, deactivate the environment: 23 | ``` 24 | source deactivate L1EM 25 | ``` 26 | 27 | ### old way 28 | Alternatively you can install the following dependencies yourself: 29 | * python version 2.7+ (version 2.7 tested) 30 | * bwa (version 0.7.17 tested) 31 | * samtools (version 1.9 tested) 32 | * numpy (version 1.14.3 tested) 33 | * scipy (version 1.1.0 tested) 34 | * pysam (version 0.15.0 tested) 35 | * bedtools (version 2.27.1 tested) 36 | 37 | No compiling of L1EM is necessary. Python scripts will be called from inside the L1EM 38 | directory. 39 | 40 | If necessary, you can specify the path for bwa and samtools in the run\_L1EM.sh script. 41 | You must use samtools >=1.0. Early version of pysam will not work. I highly recommend 42 | that you use bwa 0.7.17. Earlier versions may differ in how they write the XA tag. This 43 | will lead to inaccurate results without throwing an error. 44 | 45 | ## Quick guide 46 | ### First time: build L1EM reference 47 | You will need the hg38 reference genome in fasta format, with bwa index. 48 | Downloaded from UCSC genome browser: 49 | ``` 50 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz 51 | zcat hg38.fa.gz > hg38.fa 52 | bwa index hg38.fa 53 | ``` 54 | Note: this will take some time. 55 | 56 | Then you can build the L1EM reference using the provided shell script: 57 | ``` 58 | bash generate_L1EM_fasta_and_index.sh /fullpathto/hg38.fa 59 | ``` 60 | This should be done inside the L1EM directory 61 | 62 | ### Executing the L1-EM pipeline 63 | You will need a bam file with strand specific paired end read alignments to hg38. 
You can 64 | use any aligner, but make sure that all reads from the original fastq files are present. 65 | Trimming should be okay, but is not tested. Filtering reads will potentially break the pipeline. 66 | 67 | First move to an empty directory and then execute the shell script: 68 | ``` 69 | bash -e /fullpathto/run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 70 | ``` 71 | L1EM will write files with specific names, so do NOT run two instances of L1EM in the same 72 | directory. 73 | 74 | At the end of the run\_L1EM.sh script are a commented set of commands to delete all the 75 | intermediate files. If you wish to automatically delete intermediate files, you can delete 76 | these comments. 77 | 78 | ### Output 79 | At completion, three tab delimited tables will be written. 80 | 1. full\_counts.txt: raw count estimates for each L1HS/L1PA\* element with any aligned read pairs 81 | 2. l1hs\_transcript\_counts.txt: expression estimates for L1HS elements, reported as raw counts 82 | 3. filter\_L1HS\_FPM.txt: L1HS whose expression is supported by at least 100 read pairs, reported as FPM (read pairs per million properly aligned) 83 | 84 | The rows of all files are L1 loci. 85 | 86 | For full\_counts.txt each of the five transcript types: 87 | only, runon, passive (sense), passive (antisense), antisense 88 | are reported. 89 | 90 | For l1hs\_transcript\_counts.txt and filter\_L1HS\_FPM.txt only proper transcription from L1HS elements starting at the 91 | 5' UTR is reported. 92 | 93 | The results are also written as pickle files to facilitate further analysis in python. 
To 94 | generate a python dictionary with keys being the transcript names and values being the 95 | relative expression: 96 | ``` 97 | X_est = dict(zip(pickle.load(open('names_final.pkl')),pickle.load(open('X_final.pkl')))) 98 | ``` 99 | 100 | ## Additional details 101 | * Our Bioinformatics paper introducing L1EM: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btz724/5581349 102 | * More details can be found in manual.md 103 | 104 | ## Mouse Version 105 | Scripts and annotation to measure the expression of LINE-1 loci in mm39 has been added. The mouse version uses all the same methodology as the human version, but has not been as rigorously tested. 106 | 1. Download and index the mm39 reference genome (UCSC genome browser version) 107 | ``` 108 | wget http://hgdownload.cse.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz 109 | zcat mm39.fa.gz > mm39.fa 110 | bwa index mm39.fa 111 | ``` 112 | 2. Build the mm39 L1EM reference. 113 | ``` 114 | bash generate_mm39_L1EM_fasta_and_index.sh /fullpathto/mm39.fa 115 | ``` 116 | 3. Run L1EM. 117 | ``` 118 | bash /fullpathto/run_L1EM_mm39.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/mm39.fa 119 | ``` 120 | All L1Md loci are quantified in full\_counts.txt. Normalized expression of 5' UTR intact young (L1Md\_Tf I/II/II, L1Md\_Gf I/II, L1Md\_A I/II/III) LINE-1 loci supported by at least 100 reads can be found in filter\_active\_L1Md\_FPM.txt. 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /generate_L1EM_fasta_and_index.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # If you need to specify package directories 4 | bedtools=$(which bedtools) 5 | bwa=$(which bwa) 6 | 7 | # Command line 8 | hg38=$1 9 | 10 | $bedtools getfasta -s -name -fi $hg38 -bed annotation/L1EM.400.bed > annotation/L1EM.400.fa 11 | $bwa index annotation/L1EM.400.fa -------------------------------------------------------------------------------- /generate_mm39_L1EM_fasta_and_index.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # If you need to specify package directories 4 | bedtools=$(which bedtools) 5 | bwa=$(which bwa) 6 | 7 | # Command line 8 | mm39=$1 9 | 10 | $bedtools getfasta -s -name -fi $mm39 -bed annotation/mm39.L1EM.bed > annotation/mm39.L1EM.400.fa 11 | $bwa index annotation/mm39.L1EM.400.fa 12 | -------------------------------------------------------------------------------- /manual.md: -------------------------------------------------------------------------------- 1 | ## Pipeline Parameters 2 | 3 | The key parameters for L1EM are listed at the beginning of the run\_L1EM.sh file. Default parameters should work well in most cases, but advanced users may wish to tinker. 4 | 1. threads. Dictates the number of threads that L1EM will spawn. More threads will improve parallel performance, but memory usage scales linearly with number of threads. 5 | 2. realignNM. The number of mismatches to allow when trying to realign reads that do not align as proper pairs in the bam file provided. Default is 3, but you might want to increase for longer reads. 6 | 3. L1EM_NM. As above, but for the generation of candidate alignments to the L1EM reference. Including more candidate alignments will slow the computation, but too few candidate alignments could yield less accurate results. 7 | 4. NMdiff. Only consider alignments with at most this many more mismatches than the primary alignment. 
Because read likelihood diminishes exponentially with additional mismatches, increasing this parameter is unlikely to affect results but will slow the EM steps. 8 | 5. bwa\_i. By default bwa will create a large number of alignments with indels near the edge of the read. This parameter will prevent this behavior. You may wish to decrease this parameter for shorter reads. 9 | 6. error\_prob. Probability of an error. Error probability is chosen to be constant because computing the read likelihood from base quality scores is slow. 10 | 7. max\_start2start\_len=500. Maximum allowed fragment/template length. Increase if you are using data with very large fragments. 11 | 8. reads\_per\_pickle. The G(R) matrix is split into a number of pickle files, so the entire matrix doesn't need to sit in memory. Decreasing this parameter will free up memory at the G(R) construction and EM steps. 12 | 9. EM\_threshold. Run EM steps until no entry in X changes by more than this value. The parameter is chosen to be small by default to ensure convergence. Increasing the parameter modestly will improve run time. 13 | 10. template\_fraction. When computing median template length, subsample reads to this fraction. You only need about 10,000 proper pairs to get a good estimate. 14 | 15 | ## Generating new annotations 16 | If you wish to run L1-EM for another retrotransposon or for another model organism, you will need to generate a new annotation. 17 | 1. Create a bedfile with the following naming scheme: 18 | family.category.region.strand 19 | Where family is the name of the repeat family, 20 | category is 1 if the element has a promoter and 0 otherwise 21 | region is the genome region (chrom:start-stop) of the element 22 | strand is +/- depending on which strand the element falls on 23 | The bedfile must have the six required fields: chrom, start, stop, name, score, strand 24 | The start and stop coordinates should include 400 positions of flanking sequence on either end. 
25 | Exons overlapping the annotation can also be included. 26 | 2. Create a fasta file from your bed file and index it with bwa: 27 | ``` 28 | bedtools getfasta -s -name -fi reference.fa -bed annotation.bed > annotation.fa 29 | bwa index annotation.fa 30 | ``` 31 | 3. Update lines 27 and 28 to point toward your new annotation. 32 | 33 | ## Pipeline steps 34 | ### STEP 1: realign 35 | In this step reads that are not properly paired are extracted and realigned with bwa. Many aligners do not bother with highly redundant reads, so this step is included to ensure that LINE-1 aligning reads are identified. 36 | 37 | ### STEP 2: extract 38 | In this step, L1HS/L1PA reads are extracted. Any read pair for which either end overlaps an entry in the L1EM.400.bed annotation is considered. 39 | 40 | ### STEP 3: candidate alignments 41 | The extracted reads are aligned to L1EM.400.fa, all secondary alignments with up to L1EM_NM mismatches are found. The candidate alignments fastqs are split for parallelization. It is vitally important that all candidate alignments are identified. Missing some of these alignments will drastically hurt accuracy. For this reason bwa aln is used. Do not use bwa mem or STAR as these aligners do not provide a complete enumeration of secondary alignments for highly repetitive elements (like L1HS). 42 | 43 | ### STEP 4: G(R) matrix construction 44 | 45 | The bam files of candidate alignments are read by the script G\_of\_R.py. The likelihood of each candidate alignment is calculated and added to the G(R) matrix. 46 | 47 | The following options are additional parameters that can be accessed at this step: 48 | 1. -f/--flanking specifies the amount of flanking sequence in the annotation. If you created your own annotation with more or less than 400 bases of flanking sequence specify that here. 49 | 2. --as\_start. If you wish to change the TSS for antisense transcription do that here. 50 | 3. -w/--wiggle. 
Some proper LINE-1 transcripts start slightly before the annotation start of the 5'UTR. This parameter extends the annotated element this many bases in either direction (default is 20). 51 | 4. --min\_len. Puts a floor on transcript effective length to prevent cases where transcription of very short elements is over predicted. Default is 500. 52 | 5. --min\_exon\_len. Corresponding minimum effective length for exon annotations. Default is 100. 53 | 54 | ### STEP 5: Expectation maximization 55 | In this step, the expectation maximization algorithm is used to compute a maximum likelihood estimate of relative expression, using the G(R) matrix output in the previous step as input. 56 | The following options are additional parameters that can be accessed at this step: 57 | 1. -r/--report\_every. Write the estimate every n steps. 58 | 2. -m/--max\_nEMsteps. By default EM stops if convergence has not been achieved after 10000 steps. Change that value here. 59 | 60 | ### STEP 6: Writing results 61 | At completion, three tab delimited tables will be written. 62 | 1. full\_counts.txt: raw count estimates for each L1HS/L1PA\* element with any aligned read pairs 63 | 2. l1hs\_transcript\_counts.txt: expression estimates for L1HS elements, reported as raw counts 64 | 3. filter\_L1HS\_FPM.txt: L1HS whose expression is supported by at least 100 read pairs, reported as FPM (read pairs per million properly aligned) 65 | 66 | ### STEP 7: Clean up 67 | All the intermediate files are deleted at this step. Comment out these lines if you want to keep them. 68 | 69 | The rows of both files are L1 loci. 70 | 71 | For full\_counts.txt each of the five transcript types: 72 | only, runon, passive (sense), passive (antisense), antisense 73 | are reported. 74 | 75 | For l1hs\_transcript_counts.txt only proper transcription from L1HS elements starting at the 76 | 5' UTR is reported. 77 | 78 | The results are also written as pickle files to facilitate further analysis in python. 
To 79 | generate a python dictionary with keys being the transcript names and values being the 80 | relative expression: 81 | ``` 82 | X_est = dict(zip(pickle.load(open('names_final.pkl')),pickle.load(open('X_final.pkl')))) 83 | ``` 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /parameters.sh: -------------------------------------------------------------------------------- 1 | # Parameters 2 | export threads=16 #How many threads to use for samtools, bwa and L1EM 3 | export realignNM=3 #Number of mismatches allowed in bwa realignment 4 | export L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 5 | export NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 6 | export bwa_i=20 #bwa i parameter prevents indels near the edges of a read 7 | export error_prob=0.01 #Probability of a read error at a given position 8 | export max_start2start_len=500 #Max allowed template/fragment length 9 | export reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 10 | export EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 11 | export template_fraction=1 #Fraction of reads to consider when calculated median template length. 12 | -------------------------------------------------------------------------------- /run_L1EM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 
10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=3 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 
32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | # rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 
74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | done 87 | for name in *.fq1.* 88 | do reads1=$name 89 | reads2=$(echo $name|sed 's/fq1/fq2/g') 90 | ref=$L1EM_fa 91 | base=$(echo $name|sed 's/.fq1//g') 92 | $bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam & 93 | done 94 | wait 95 | 96 | # Make G_of_R matrix 97 | echo 'STEP 4: G(R) matrix construction' 98 | mkdir ../G_of_R 99 | cd ../G_of_R 100 | $python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 101 | medianinsert=$(head -1 ../baminfo.txt) 102 | for bam in ../split_fqs/*.bam 103 | do $python ${L1EM_code_dir}G_of_R.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 104 | done 105 | wait 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt 121 
| $python ${L1EM_utilities_dir}filtered_and_normalized_l1hs.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 122 | 123 | #Clean up 124 | echo 'STEP 7: Clean up' 125 | cp *final.pkl ../ 126 | cd .. 127 | 128 | # rm idL1reads/* 129 | # rmdir idL1reads 130 | # rm split_fqs/* 131 | # rmdir split_fqs 132 | # rm G_of_R/* 133 | # rmdir G_of_R 134 | # rm L1EM/* 135 | # rmdir L1EM 136 | -------------------------------------------------------------------------------- /run_L1EM_fortcga.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/L1HS.fa 4 | 5 | # Parameters 6 | threads=16 #How many threads to use for samtools, bwa and L1EM 7 | realignNM=3 #Number of mismatches allowed in bwa realignment 8 | L1EM_NM=2 # Number of mismatches allowed when enumerated candidate alignments 9 | NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment 10 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 11 | error_prob=0.01 #Probability of a read error at a given position 12 | max_start2start_len=500 #Max allowed template/fragment length 13 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 14 | EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 15 | template_fraction=0.0001 #Fraction of reads to consider when calculated median template length. 
16 | 17 | # If you need to specify paths to required packages 18 | bwa=$(which bwa) # version 0.7.17 tested 19 | samtools=$(which samtools) # version 1.9 tested 20 | python=$(which python) # use version 2.7 21 | 22 | # Command line arguments 23 | bamfile=$1 24 | L1EM_directory=$2 25 | L1HS=$3 26 | 27 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 28 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 29 | L1EM_code_dir=$L1EM_directory'/L1EM/' 30 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 31 | 32 | # Try to realign unaligned reads using bwa aln. 33 | echo 'STEP 1: realign' 34 | mkdir idL1reads 35 | cd idL1reads 36 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 37 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $L1HS unaligned.fq1 > 1.sai 38 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $L1HS unaligned.fq2 > 2.sai 39 | $bwa sampe $L1HS 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -f 2 -@ $threads - | $samtools sort -@ $threads - > realigned.bam 40 | 41 | # Extract L1HS/L1PA* aligning reads. 42 | echo 'STEP 2: extract' 43 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 44 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 45 | $samtools fastq realigned.bam -1 temp.fq1 -2 temp.fq2 46 | cat temp.fq1 >> L1.fq1 47 | cat temp.fq2 >> L1.fq2 48 | rm temp* 49 | 50 | # Split the L1 fastq files for parallel execution 51 | cd .. 52 | mkdir split_fqs 53 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 54 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 55 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 
56 | cd split_fqs 57 | 58 | # Generate candidate alignments 59 | echo 'STEP 3: candidate alignments' 60 | for name in *.fq1.* 61 | do reads1=$name 62 | reads2=$(echo $name|sed 's/fq1/fq2/g') 63 | ref=$L1EM_fa 64 | base=$(echo $name|sed 's/.fq1//g') 65 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 66 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 67 | done 68 | for name in *.fq1.* 69 | do reads1=$name 70 | reads2=$(echo $name|sed 's/fq1/fq2/g') 71 | ref=$L1EM_fa 72 | base=$(echo $name|sed 's/.fq1//g') 73 | $bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam & 74 | done 75 | wait 76 | 77 | # Make G_of_R matrix 78 | echo 'STEP 4: G(R) matrix construction' 79 | mkdir ../G_of_R 80 | cd ../G_of_R 81 | medianinsert=$($python ${L1EM_utilities_dir}median_template.py $bamfile $template_fraction) 82 | for bam in ../split_fqs/*.bam 83 | do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 84 | done 85 | wait 86 | 87 | # RUN EM 88 | echo 'STEP 5: Expectation maximization' 89 | mkdir ../L1EM/ 90 | cd ../L1EM/ 91 | ls ../G_of_R/*pk2 > G_of_R_list.txt 92 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 93 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 94 | 95 | #Write results as text file 96 | echo 'STEP 6: Writing results' 97 | 98 | $python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt 99 | $python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt 100 | 101 | #Clean up 102 | echo 'STEP 7: Clean up' 103 | cp *final.pkl ../ 104 | cd .. 
105 | 106 | #rm idL1reads/* 107 | #rmdir idL1reads 108 | #rm split_fqs/* 109 | #rmdir split_fqs 110 | #rm G_of_R/* 111 | #rmdir G_of_R 112 | #rm L1EM/* 113 | #rmdir L1EM 114 | 115 | -------------------------------------------------------------------------------- /run_L1EM_mm39.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=2 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=2 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. 
Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/mm39.L1EM.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/mm39.L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 
71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*10*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam 87 | samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam 88 | samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam 89 | rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai 90 | done 91 | 92 | # Make G_of_R matrix 93 | echo 'STEP 4: G(R) matrix construction' 94 | mkdir ../G_of_R 95 | cd ../G_of_R 96 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 97 | medianinsert=$(head -1 ../baminfo.txt) 98 | ls ../split_fqs/*.bam > list_of_bams.txt 99 | split -l $threads list_of_bams.txt list_of_bams.txt. 
100 | for bamlist in list_of_bams.txt.* 101 | do for bam in $(cat $bamlist) 102 | do $python ${L1EM_code_dir}G_of_R.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 103 | done 104 | wait 105 | done 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | $python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}filtered_and_normalized_active_l1md.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_active_L1Md_FPM.txt 121 | 122 | #Clean up 123 | echo 'STEP 7: Clean up' 124 | cp *final.pkl ../ 125 | cd .. 126 | 127 | # rm idL1reads/* 128 | # rmdir idL1reads 129 | # rm split_fqs/* 130 | # rmdir split_fqs 131 | # rm G_of_R/* 132 | # rmdir G_of_R 133 | # rm L1EM/* 134 | # rmdir L1EM 135 | -------------------------------------------------------------------------------- /run_L1EM_mm39_unstranded.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=2 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=2 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=1 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-6 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 
32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/mm39.L1EM.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/mm39.L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*10*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 
74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam 87 | samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam 88 | samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam 89 | rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai 90 | done 91 | 92 | # Make G_of_R matrix 93 | echo 'STEP 4: G(R) matrix construction' 94 | mkdir ../G_of_R 95 | cd ../G_of_R 96 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 97 | medianinsert=$(head -1 ../baminfo.txt) 98 | ls ../split_fqs/*.bam > list_of_bams.txt 99 | split -l $threads list_of_bams.txt list_of_bams.txt. 
100 | for bamlist in list_of_bams.txt.* 101 | do for bam in $(cat $bamlist) 102 | do $python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 103 | done 104 | wait 105 | done 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | $python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}filtered_and_normalized_active_l1md_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_active_L1Md_FPM.txt 121 | 122 | #Clean up 123 | echo 'STEP 7: Clean up' 124 | cp *final.pkl ../ 125 | cd .. 126 | 127 | # rm idL1reads/* 128 | # rmdir idL1reads 129 | # rm split_fqs/* 130 | # rmdir split_fqs 131 | # rm G_of_R/* 132 | # rmdir G_of_R 133 | # rm L1EM/* 134 | # rmdir L1EM 135 | -------------------------------------------------------------------------------- /run_L1EM_unstranded.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version.
10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=3 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 
32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | # rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 
74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2. 75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | $bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | done 87 | for name in *.fq1.* 88 | do reads1=$name 89 | reads2=$(echo $name|sed 's/fq1/fq2/g') 90 | ref=$L1EM_fa 91 | base=$(echo $name|sed 's/.fq1//g') 92 | $bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | $samtools view -bS - | $samtools sort -n - > $base.aln.bam & 93 | done 94 | wait 95 | 96 | # Make G_of_R matrix 97 | echo 'STEP 4: G(R) matrix construction' 98 | mkdir ../G_of_R 99 | cd ../G_of_R 100 | $python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 101 | medianinsert=$(head -1 ../baminfo.txt) 102 | for bam in ../split_fqs/*.bam 103 | do $python ${L1EM_code_dir}G_of_R_unstranded.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 104 | done 105 | wait 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > 
../l1hs_transcript_counts.txt 121 | $python ${L1EM_utilities_dir}filtered_and_normalized_l1hs_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 122 | 123 | #Clean up 124 | echo 'STEP 7: Clean up' 125 | cp *final.pkl ../ 126 | cd .. 127 | 128 | # rm idL1reads/* 129 | # rmdir idL1reads 130 | # rm split_fqs/* 131 | # rmdir split_fqs 132 | # rm G_of_R/* 133 | # rmdir G_of_R 134 | # rm L1EM/* 135 | # rmdir L1EM 136 | -------------------------------------------------------------------------------- /run_L1EM_unstranded_fromdocker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | # Usage: bash run_L1EM.sh parameters.sh /fullpathto/alignments.bam /fullpathto/hg38.fa 20 | 21 | # Command line arguments 22 | source $1 # was 'bash $1': running the parameters file in a child shell cannot set $threads, $realignNM, $bwa_i etc. for this script; it must be sourced 23 | bamfile=$2 24 | hg38=$3 25 | 26 | # Locations within L1EM directory (NOTE(review): bed/fa are at container root /annotation, unlike code dirs under /L1EM -- confirm against Dockerfile) 27 | L1EM_bed=/annotation/L1EM.400.bed 28 | L1EM_fa=/annotation/L1EM.400.fa 29 | L1EM_code_dir=/L1EM/L1EM/ 30 | L1EM_utilities_dir=/L1EM/utilities/ 31 | L1EM_CGC_dir=/L1EM/CGC/ 32 | 33 | # Try to realign unaligned reads using bwa aln.
34 | echo 'STEP 1: realign' 35 | mkdir idL1reads 36 | cd idL1reads 37 | samtools view -@ $threads -b -F 2 $bamfile | samtools sort -@ $threads -n - | samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 38 | bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 39 | bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 40 | bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | samtools view -b -@ $threads - | samtools sort -@ $threads - > realigned.bam 41 | samtools index realigned.bam 42 | 43 | # Extract L1HS/L1PA* aligning reads. 44 | echo 'STEP 2: extract' 45 | python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 46 | samtools sort -@ $threads -n temp.bam | samtools fastq - -1 L1.fq1 -2 L1.fq2 47 | python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 48 | samtools sort -@ $threads -n temp.bam | samtools fastq - -1 temp.fq1 -2 temp.fq2 49 | cat temp.fq1 >> L1.fq1 50 | cat temp.fq2 >> L1.fq2 51 | # rm temp* 52 | 53 | # Split the L1 fastq files for parallel execution 54 | cd .. 55 | mkdir split_fqs 56 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 57 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 58 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
59 | cd split_fqs 60 | 61 | # Generate candidate alignments 62 | echo 'STEP 3: candidate alignments' 63 | for name in *.fq1.* 64 | do reads1=$name 65 | reads2=$(echo $name|sed 's/fq1/fq2/g') 66 | ref=$L1EM_fa 67 | base=$(echo $name|sed 's/.fq1//g') 68 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 69 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 70 | done 71 | for name in *.fq1.* 72 | do reads1=$name 73 | reads2=$(echo $name|sed 's/fq1/fq2/g') 74 | ref=$L1EM_fa 75 | base=$(echo $name|sed 's/.fq1//g') 76 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 | samtools view -bS - | samtools sort -n - > $base.aln.bam & 77 | done 78 | wait 79 | 80 | # Make G_of_R matrix 81 | echo 'STEP 4: G(R) matrix construction' 82 | mkdir ../G_of_R 83 | cd ../G_of_R 84 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 85 | medianinsert=$(head -1 ../baminfo.txt) 86 | for bam in ../split_fqs/*.bam 87 | do python ${L1EM_code_dir}G_of_R_unstranded.py -b $bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 88 | done 89 | wait 90 | 91 | # RUN EM 92 | echo 'STEP 5: Expectation maximization' 93 | mkdir ../L1EM/ 94 | cd ../L1EM/ 95 | ls ../G_of_R/*pk2 > G_of_R_list.txt 96 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 97 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 98 | 99 | #Write results as text file 100 | echo 'STEP 6: Writing results' 101 | 102 | python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 103 | python ${L1EM_utilities_dir}report_l1_exp_counts_unstranded.py > ../full_counts.txt 104 | python ${L1EM_utilities_dir}report_l1hs_transcription_unstranded.py > ../l1hs_transcript_counts.txt 105 | python
${L1EM_utilities_dir}filtered_and_normalized_l1hs_unstranded.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 106 | 107 | #Clean up 108 | echo 'STEP 7: Clean up' 109 | cp *final.pkl ../ 110 | cd .. 111 | 112 | # rm idL1reads/* 113 | # rmdir idL1reads 114 | # rm split_fqs/* 115 | # rmdir split_fqs 116 | # rm G_of_R/* 117 | # rmdir G_of_R 118 | # rm L1EM/* 119 | # rmdir L1EM 120 | -------------------------------------------------------------------------------- /run_L1EM_withlessmemory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to execute L1-EM pipeline 4 | # Copyright (C) 2019 Wilson McKerrow 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 
18 | 19 | # Usage: bash run_L1EM.sh /fullpathto/alignments.bam /fullpathto/L1EM /fullpathto/hg38.fa 20 | 21 | # Parameters 22 | threads=16 #How many threads to use for samtools, bwa and L1EM 23 | realignNM=3 #Number of mismatches allowed in bwa realignment 24 | L1EM_NM=3 # Number of mismatches allowed when enumerated candidate alignments 25 | NMdiff=2 #Skip candidate alignments with greater than this many more mismatches than the best alignment 26 | bwa_i=20 #bwa i parameter prevents indels near the edges of a read 27 | error_prob=0.01 #Probability of a read error at a given position 28 | max_start2start_len=500 #Max allowed template/fragment length 29 | reads_per_pickle=10000 #Number of rows in each G(R) matrix chunk. Decrease if memory usage is too high. 30 | EM_threshold=1e-7 #Keep taking EM steps until no entry in X changes by more than this value. Increasing this parameter will shorten run time. 31 | template_fraction=1 #Fraction of reads to consider when calculated median template length. 32 | 33 | # If you need to specify paths to required packages 34 | bwa=$(which bwa) # version 0.7.17 tested 35 | samtools=$(which samtools) # version 1.9 tested 36 | python=$(which python) # use version 2.7 37 | 38 | # Command line arguments 39 | bamfile=$1 40 | L1EM_directory=$2 41 | hg38=$3 42 | 43 | L1EM_bed=$L1EM_directory'/annotation/L1EM.400.bed' 44 | L1EM_fa=$L1EM_directory'/annotation/L1EM.400.fa' 45 | L1EM_code_dir=$L1EM_directory'/L1EM/' 46 | L1EM_utilities_dir=$L1EM_directory'/utilities/' 47 | L1EM_CGC_dir=$L1EM_directory'/CGC/' 48 | 49 | # Try to realign unaligned reads using bwa aln. 
50 | echo 'STEP 1: realign' 51 | mkdir idL1reads 52 | cd idL1reads 53 | $samtools view -@ $threads -b -F 2 $bamfile | $samtools sort -@ $threads -n - | $samtools fastq - -1 unaligned.fq1 -2 unaligned.fq2 54 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq1 > 1.sai 55 | $bwa aln -k $realignNM -n $realignNM -t $threads -i $bwa_i $hg38 unaligned.fq2 > 2.sai 56 | $bwa sampe $hg38 1.sai 2.sai unaligned.fq1 unaligned.fq2 | $samtools view -b -@ $threads - | $samtools sort -@ $threads - > realigned.bam 57 | $samtools index realigned.bam 58 | 59 | # Extract L1HS/L1PA* aligning reads. 60 | echo 'STEP 2: extract' 61 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed $bamfile temp.bam 62 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 L1.fq1 -2 L1.fq2 63 | $python ${L1EM_utilities_dir}read_or_pair_overlap_bed.py $L1EM_bed realigned.bam temp.bam 64 | $samtools sort -@ $threads -n temp.bam | $samtools fastq - -1 temp.fq1 -2 temp.fq2 65 | cat temp.fq1 >> L1.fq1 66 | cat temp.fq2 >> L1.fq2 67 | # rm temp* 68 | 69 | # Split the L1 fastq files for parallel execution 70 | cd .. 71 | mkdir split_fqs 72 | split_fq_size=$(wc -l idL1reads/L1.fq1 | awk '{print $1/('$threads'*4)+1}' | cut -d '.' -f 1 | awk '{print $1*4}') 73 | split -l $split_fq_size idL1reads/L1.fq1 split_fqs/L1.fq1. 74 | split -l $split_fq_size idL1reads/L1.fq2 split_fqs/L1.fq2.
75 | cd split_fqs 76 | 77 | # Generate candidate alignments 78 | echo 'STEP 3: candidate alignments' 79 | for name in *.fq1.* 80 | do reads1=$name 81 | reads2=$(echo $name|sed 's/fq1/fq2/g') 82 | ref=$L1EM_fa 83 | base=$(echo $name|sed 's/.fq1//g') 84 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads1 > $base.R1.aln.sai 85 | bwa aln -t $threads -N -n $L1EM_NM -k $L1EM_NM -i $bwa_i -R 10000000 $ref $reads2 > $base.R2.aln.sai 86 | bwa sampe -n 10000000 -N 10000000 $ref $base.R1.aln.sai $base.R2.aln.sai $reads1 $reads2 > temp.$base.aln.sam 87 | samtools view -@ $threads -bS temp.$base.aln.sam > temp.$base.aln.bam 88 | samtools sort -@ $threads -n temp.$base.aln.bam > $base.aln.bam 89 | rm temp.$base.aln.sam temp.$base.aln.bam $base.R1.aln.sai $base.R2.aln.sai 90 | done 91 | 92 | # Make G_of_R matrix 93 | echo 'STEP 4: G(R) matrix construction' 94 | mkdir ../G_of_R 95 | cd ../G_of_R 96 | python ${L1EM_CGC_dir}median_template_and_pairs.py $bamfile 0.001 > ../baminfo.txt 97 | medianinsert=$(head -1 ../baminfo.txt) 98 | ls ../split_fqs/*.bam > list_of_bams.txt 99 | split -l $threads list_of_bams.txt list_of_bams.txt. 
100 | for bamlist in list_of_bams.txt.* 101 | do for bam in $(cat $bamlist) 102 | do python ${L1EM_code_dir}G_of_R.py -b ../split_fqs/$bam -i $medianinsert -p $(echo $bam| cut -d '/' -f 3) -e $error_prob -m $max_start2start_len -r $reads_per_pickle -n $NMdiff & 103 | done 104 | wait 105 | done 106 | 107 | # RUN EM 108 | echo 'STEP 5: Expectation maximization' 109 | mkdir ../L1EM/ 110 | cd ../L1EM/ 111 | ls ../G_of_R/*pk2 > G_of_R_list.txt 112 | cp $(ls ../G_of_R/*TE_list.txt | head -1) TE_list.txt 113 | python ${L1EM_code_dir}L1EM.py -g G_of_R_list.txt -l TE_list.txt -t $threads -s $EM_threshold 114 | 115 | #Write results as text file 116 | echo 'STEP 6: Writing results' 117 | 118 | $python ${L1EM_utilities_dir}L1EM_readpairs.py >> ../baminfo.txt 119 | $python ${L1EM_utilities_dir}report_l1_exp_counts.py > ../full_counts.txt 120 | $python ${L1EM_utilities_dir}report_l1hs_transcription.py > ../l1hs_transcript_counts.txt 121 | $python ${L1EM_utilities_dir}filtered_and_normalized_l1hs.py names_final.pkl X_final.pkl $(head -2 ../baminfo.txt | tail -1) $(head -3 ../baminfo.txt | tail -1)> ../filter_L1HS_FPM.txt 122 | 123 | #Clean up 124 | echo 'STEP 7: Clean up' 125 | cp *final.pkl ../ 126 | cd .. 127 | 128 | # rm idL1reads/* 129 | # rmdir idL1reads 130 | # rm split_fqs/* 131 | # rmdir split_fqs 132 | # rm G_of_R/* 133 | # rmdir G_of_R 134 | # rm L1EM/* 135 | # rmdir L1EM 136 | -------------------------------------------------------------------------------- /utilities/L1EM_readpairs.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 
2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Report the total number of read pairs passed to L1EM 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | print(total) 34 | -------------------------------------------------------------------------------- /utilities/filtered_and_normalized_active_l1md.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from mm39 version of L1EM. 10 | 11 | Copyright (C) 2021 Wilson McKerrow 12 | 13 | This program is free software: you can redistribute it and/or modify 14 | it under the terms of the GNU General Public License as published by 15 | the Free Software Foundation, either version 3 of the License, or 16 | (at your option) any later version.
17 | 18 | This program is distributed in the hope that it will be useful, 19 | but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 | GNU General Public License for more details. 22 | 23 | You should have received a copy of the GNU General Public License 24 | along with this program. If not, see . 25 | 26 | """ 27 | 28 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 29 | 30 | proper_pairs_in_original_bam = float(sys.argv[3]) 31 | 32 | total = float(sys.argv[4]) 33 | 34 | written_seqs = set([]) 35 | 36 | print("family.category.locus.strand\tonly\t3prunon") 37 | 38 | names = list(X_est.keys()) 39 | 40 | for name in names: 41 | if 'L1MdTf_' in name or 'L1MdGf_' in name or 'L1MdA_I' in name or 'L1MdA_II' in name or 'L1MdA_III' in name: 42 | seq_name = '_'.join(name.split('_')[:-1]) 43 | if seq_name in written_seqs: 44 | continue 45 | written_seqs.add(seq_name) 46 | print_string = seq_name.split('(')[0] 47 | only_name = seq_name+'_only' 48 | if only_name not in X_est: 49 | X_est[only_name]=0.0 50 | only_pairs = total*X_est[only_name] 51 | runon_name = seq_name+'_3prunon' 52 | if runon_name not in X_est: 53 | X_est[runon_name]=0.0 54 | runon_pairs = total*X_est[runon_name] 55 | runthroughS_name = seq_name+'_senserunthrough' 56 | if runthroughS_name not in X_est: 57 | X_est[runthroughS_name]=0.0 58 | runthrough_pairs = total*X_est[runthroughS_name] 59 | runthroughA_name = seq_name+'_antisenserunthrough' 60 | if runthroughA_name not in X_est: 61 | X_est[runthroughA_name]=0.0 62 | runthrough_pairs += total*X_est[runthroughA_name] 63 | if (only_pairs+runon_pairs > 10*runthrough_pairs) & (only_pairs+runon_pairs>100): 64 | print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 65 | -------------------------------------------------------------------------------- 
/utilities/filtered_and_normalized_active_l1md_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2021 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1MdTf_' in name or 'L1MdGf_' in name or 'L1MdA_I' in name or 'L1MdA_II' in name or 'L1MdA_III' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | only_pairs = total*X_est[only_name] 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | runon_pairs = total*X_est[runon_name] 56 | runthrough_name = seq_name+'_runthrough' 57 | if runthrough_name not in X_est: 58 | X_est[runthrough_name]=0.0 59 | runthrough_pairs = total*X_est[runthrough_name] 60 | if (only_pairs+runon_pairs > 10*runthrough_pairs) & (only_pairs+runon_pairs>100): 61 | print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 62 | -------------------------------------------------------------------------------- /utilities/filtered_and_normalized_l1hs.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data.
11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | only_pairs = total*X_est[only_name] 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | runon_pairs = total*X_est[runon_name] 56 | runthroughS_name = seq_name+'_senserunthrough' 57 | if runthroughS_name not in X_est: 58 | X_est[runthroughS_name]=0.0 59 | runthrough_pairs = total*X_est[runthroughS_name] 60 | runthroughA_name = seq_name+'_antisenserunthrough' 61 | if runthroughA_name not in X_est: 62 | X_est[runthroughA_name]=0.0 63 | runthrough_pairs += total*X_est[runthroughA_name] 64 | if (only_pairs+runon_pairs > 3*runthrough_pairs) & (only_pairs+runon_pairs>100): 65 | 
print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 66 | -------------------------------------------------------------------------------- /utilities/filtered_and_normalized_l1hs_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | X_est = dict(zip(pickle.load(open(sys.argv[1],'rb')),pickle.load(open(sys.argv[2],'rb')))) 30 | 31 | proper_pairs_in_original_bam = float(sys.argv[3]) 32 | 33 | total = float(sys.argv[4]) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | only_pairs = total*X_est[only_name] 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | runon_pairs = total*X_est[runon_name] 56 | runthrough_name = seq_name+'_runthrough' 57 | if runthrough_name not in X_est: 58 | X_est[runthrough_name]=0.0 59 | runthrough_pairs = total*X_est[runthrough_name] 60 | if (only_pairs+runon_pairs > 3*runthrough_pairs) & (only_pairs+runon_pairs>100): 61 | print(seq_name.split('(')[0]+'\t'+str(only_pairs/proper_pairs_in_original_bam*10**6)+'\t'+str(runon_pairs/proper_pairs_in_original_bam*10**6)) 62 | -------------------------------------------------------------------------------- /utilities/median_template.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pysam 3 | import random 4 | import numpy 5 | 6 | """ 7 | Estimate median template length of a bam file. 8 | 9 | Part of the L1-EM package. 10 | 11 | Copyright (C) 2019 Wilson McKerrow 12 | 13 | This program is free software: you can redistribute it and/or modify 14 | it under the terms of the GNU General Public License as published by 15 | the Free Software Foundation, either version 3 of the License, or 16 | (at your option) any later version. 
17 | 18 | This program is distributed in the hope that it will be useful, 19 | but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 | GNU General Public License for more details. 22 | 23 | You should have received a copy of the GNU General Public License 24 | along with this program. If not, see . 25 | 26 | """ 27 | 28 | bamfile = sys.argv[1] 29 | fraction = float(sys.argv[2]) 30 | 31 | tlens = list() 32 | 33 | for read in pysam.AlignmentFile(bamfile): 34 | if not read.is_unmapped and random.random() < fraction: 35 | tlens.append(read.template_length) 36 | 37 | print(numpy.median(numpy.abs(tlens))) 38 | -------------------------------------------------------------------------------- /utilities/read_or_pair_overlap_bed.py: -------------------------------------------------------------------------------- 1 | import pysam 2 | import sys 3 | 4 | """ 5 | Extract reads or pairs of reads that overlap a bed file. 6 | 7 | Part of the L1-EM package. 8 | 9 | Copyright (C) 2019 Wilson McKerrow 10 | 11 | This program is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 | This program is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU General Public License for more details. 20 | 21 | You should have received a copy of the GNU General Public License 22 | along with this program. If not, see . 
23 | 24 | """ 25 | 26 | def main(): 27 | bedfile = sys.argv[1] 28 | bamfile = sys.argv[2] 29 | outbamfile = sys.argv[3] 30 | if len(sys.argv) > 4: 31 | flanking = int(sys.argv[4]) 32 | else: 33 | flanking = 400 34 | if len(sys.argv) > 5: 35 | maxNM = int(sys.argv[5]) 36 | else: 37 | maxNM = 4 38 | 39 | inbam = pysam.AlignmentFile(bamfile,'rb') 40 | outbam = pysam.AlignmentFile(outbamfile,'wb',template=inbam) 41 | 42 | read_ids = set() 43 | for line in open(bedfile): 44 | chrom,start,stop = line.strip().split('\t')[:3] 45 | start = int(start)+flanking 46 | stop = int(stop)-flanking 47 | if chrom in inbam.references: 48 | for read in inbam.fetch(chrom,start,stop): 49 | if not read.is_unmapped: 50 | if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and (not read.has_tag('NM') or read.get_tag('NM')<=maxNM): 51 | read_ids.add(read.query_name) 52 | # if chrom[3:] in inbam.references: 53 | # for read in inbam.fetch(chrom[3:],start,stop): 54 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and read.get_tag('NM')<=3: 55 | # read_ids.add(read.query_name) 56 | # if '_' in chrom and chrom.split('_')[1].upper()+'.1' in inbam.references: 57 | # for read in inbam.fetch(chrom.split('_')[1].upper()+'.1',start,stop): 58 | # if not read.is_secondary and not read.is_supplementary and 'S' not in read.cigarstring and 'N' not in read.cigarstring and read.get_tag('NM')<=3: 59 | # read_ids.add(read.query_name) 60 | 61 | inbam.close() 62 | inbam = pysam.AlignmentFile(bamfile,'rb') 63 | 64 | for read in inbam: 65 | if read.query_name in read_ids: 66 | if not read.is_secondary and not read.is_supplementary: 67 | outbam.write(read) 68 | 69 | inbam.close() 70 | outbam.close() 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /utilities/report_l1_exp_counts.py: 
-------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 
24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon\tpassive_sense\tpassive_antisense\tantisense") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | only_name = seq_name+'_only' 49 | if only_name not in X_est: 50 | X_est[only_name]=0.0 51 | print_string += '\t'+str(total*X_est[only_name]) 52 | runon_name = seq_name+'_3prunon' 53 | if runon_name not in X_est: 54 | X_est[runon_name]=0.0 55 | print_string += '\t'+str(total*X_est[runon_name]) 56 | senserunthrough_name = seq_name+'_senserunthrough' 57 | if senserunthrough_name not in X_est: 58 | X_est[senserunthrough_name]=0.0 59 | print_string += '\t'+str(total*X_est[senserunthrough_name]) 60 | antisenserunthrough_name = seq_name+'_antisenserunthrough' 61 | if antisenserunthrough_name not in X_est: 62 | X_est[antisenserunthrough_name]=0.0 63 | print_string += '\t'+str(total*X_est[antisenserunthrough_name]) 64 | antisense_name = seq_name+'_antisense' 65 | if antisense_name not in X_est: 66 | X_est[antisense_name]=0.0 67 | print_string += '\t'+str(total*X_est[antisense_name]) 68 | print(print_string) 69 | -------------------------------------------------------------------------------- /utilities/report_l1_exp_counts_clip.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3).
2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tsense\tantisense") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'exon' not in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | sense_name = seq_name+'_sense' 49 | if sense_name not in X_est: 50 | X_est[sense_name]=0.0 51 | print_string += '\t'+str(total*X_est[sense_name]) 52 | antisense_name = seq_name+'_antisense' 53 | if antisense_name not in X_est: 54 | X_est[antisense_name]=0.0 55 | print_string += '\t'+str(total*X_est[antisense_name]) 56 | print(print_string) 57 | --------------------------------------------------------------------------------
/utilities/report_l1_exp_counts_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | import sys 7 | 8 | """ 9 | Extract the LINE-1 transcript estimates from L1EM. This version is intended for use on CGC 10 | to analyze TCGA data. 11 | 12 | Copyright (C) 2019 Wilson McKerrow 13 | 14 | This program is free software: you can redistribute it and/or modify 15 | it under the terms of the GNU General Public License as published by 16 | the Free Software Foundation, either version 3 of the License, or 17 | (at your option) any later version. 18 | 19 | This program is distributed in the hope that it will be useful, 20 | but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | GNU General Public License for more details. 23 | 24 | You should have received a copy of the GNU General Public License 25 | along with this program. If not, see . 
26 | 27 | """ 28 | 29 | total = 0 30 | for line in open('G_of_R_list.txt'): 31 | G_of_R = pickle.load(open(line.strip(),'rb')) 32 | if G_of_R is not None: 33 | total += G_of_R.shape[0] 34 | 35 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 36 | 37 | written_seqs = set([]) 38 | 39 | print("family.category.locus.strand\tonly\t3prunon\tpassive") 40 | 41 | names = list(X_est.keys()) 42 | 43 | for name in names: 44 | if 'exon' not in name: 45 | seq_name = '_'.join(name.split('_')[:-1]) 46 | if seq_name in written_seqs: 47 | continue 48 | written_seqs.add(seq_name) 49 | print_string = seq_name.split('(')[0] 50 | only_name = seq_name+'_only' 51 | if only_name not in X_est: 52 | X_est[only_name]=0.0 53 | print_string += '\t'+str(total*X_est[only_name]) 54 | runon_name = seq_name+'_3prunon' 55 | if runon_name not in X_est: 56 | X_est[runon_name]=0.0 57 | print_string += '\t'+str(total*X_est[runon_name]) 58 | runthrough_name = seq_name+'_runthrough' 59 | if runthrough_name not in X_est: 60 | X_est[runthrough_name]=0.0 61 | print_string += '\t'+str(total*X_est[runthrough_name]) 62 | print(print_string) 63 | -------------------------------------------------------------------------------- /utilities/report_l1hs_transcription.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version.
16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R is not None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | 49 | total_proper = 0.0 50 | total_passive = 0.0 51 | 52 | only_name = seq_name+'_only' 53 | if only_name not in X_est: 54 | X_est[only_name]=0.0 55 | print_string += '\t'+str(total*X_est[only_name]) 56 | total_proper += total*X_est[only_name] 57 | runon_name = seq_name+'_3prunon' 58 | if runon_name not in X_est: 59 | X_est[runon_name]=0.0 60 | print_string += '\t'+str(total*X_est[runon_name]) 61 | total_proper += total*X_est[runon_name] 62 | senserunthrough_name = seq_name+'_senserunthrough' 63 | if senserunthrough_name not in X_est: 64 | X_est[senserunthrough_name]=0.0 65 | total_passive += total*X_est[senserunthrough_name] 66 | antisenserunthrough_name = seq_name+'_antisenserunthrough' 67 | if antisenserunthrough_name not in X_est: 68 | X_est[antisenserunthrough_name]=0.0 69 | total_passive += total*X_est[antisenserunthrough_name] 70 | if total_proper > 3*total_passive: 71 | print(print_string) 72 |
-------------------------------------------------------------------------------- /utilities/report_l1hs_transcription_unstranded.py: -------------------------------------------------------------------------------- 1 | # On Python2 import cPickle for performance improvement, else import pickle (available to both Py2 and Py3). 2 | try: 3 | import cPickle as pickle 4 | except ImportError: 5 | import pickle 6 | 7 | """ 8 | Extract the estimate of proper transcription of L1HS elements. 9 | 10 | Copyright (C) 2019 Wilson McKerrow 11 | 12 | This program is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 | This program is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program. If not, see . 
24 | 25 | """ 26 | 27 | total = 0 28 | for line in open('G_of_R_list.txt'): 29 | G_of_R = pickle.load(open(line.strip(),'rb')) 30 | if G_of_R != None: 31 | total += G_of_R.shape[0] 32 | 33 | X_est = dict(zip(pickle.load(open('names_final.pkl','rb')),pickle.load(open('X_final.pkl','rb')))) 34 | 35 | written_seqs = set([]) 36 | 37 | print("family.category.locus.strand\tonly\t3prunon") 38 | 39 | names = list(X_est.keys()) 40 | 41 | for name in names: 42 | if 'L1HS' in name: 43 | seq_name = '_'.join(name.split('_')[:-1]) 44 | if seq_name in written_seqs: 45 | continue 46 | written_seqs.add(seq_name) 47 | print_string = seq_name.split('(')[0] 48 | 49 | total_proper = 0.0 50 | total_passive = 0.0 51 | 52 | only_name = seq_name+'_only' 53 | if only_name not in X_est: 54 | X_est[only_name]=0.0 55 | print_string += '\t'+str(total*X_est[only_name]) 56 | total_proper += total*X_est[only_name] 57 | runon_name = seq_name+'_3prunon' 58 | if runon_name not in X_est: 59 | X_est[runon_name]=0.0 60 | print_string += '\t'+str(total*X_est[runon_name]) 61 | total_proper += total*X_est[runon_name] 62 | senserunthrough_name = seq_name+'_runthrough' 63 | if senserunthrough_name not in X_est: 64 | X_est[senserunthrough_name]=0.0 65 | total_passive += total*X_est[senserunthrough_name] 66 | if total_proper > 3*total_passive: 67 | print(print_string) 68 | --------------------------------------------------------------------------------