├── .gitmodules
├── LICENSE
├── README.md
├── allele_level_expression
    ├── CAST.SNPs.validated.vcf.gz
    ├── README.md
    ├── get_variant_overlap_CAST.R
    └── mouse_cross.yaml
└── ss3iso
    ├── LICENSE
    ├── README.md
    ├── isoform_reconstruction.png
    ├── pyModule
        ├── informative_reads.py
        ├── isoform_reconstruct.py
        └── reference.py
    ├── ss3_isoform.conf
    └── ss3_isoform.py


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "stitcher.py"]
2 | 	path = stitcher.py
3 | 	url = https://github.com/AntonJMLarsson/stitcher.py.git
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Smart-seq3
 2 | 
 3 | This repository contains the scripts and pipelines used to process and analyse Smart-seq3 libraries, as described in Hagemann-Jensen et al. 2020. https://doi.org/10.1038/s41587-020-0497-0
 4 | 
 5 | Here we provide the code to perform the following steps, which are expanded upon in the dedicated sub-folders.
 6 | 
 7 | ### 1) Processing of Smart-seq3 data with zUMIs. 
 8 | We show how fastq files are efficiently processed to BAM files in a manner that simultaneously distinguishes 5' from internal reads, and error-corrects both cell barcodes and molecular barcodes using [zUMIs](https://github.com/sdparekh/zUMIs).
 9 | 
10 | First, you should obtain raw fastq files *without demultiplexing*, as the data will be processed in a pooled fashion. When running the [bcl2fastq](https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software.html) conversion, be sure to keep index read fastq files.
11 | 
12 | Example for a dual-index, 150 bp PE run: 
13 | `bcl2fastq --use-bases-mask Y150N,I8,I8,Y150N --no-lane-splitting --create-fastq-for-index-reads -R /mnt/storage1/NextSeqNAS/191011_NB502120_0154_AHVG7JBGXB`
14 | 
15 | Next, prepare your config file in [YAML format for zUMIs](https://github.com/sdparekh/zUMIs/wiki/Usage#setup-using-the-yaml-config-file). The UMI sequence needs to be correctly extracted from 5' reads in Smart-seq3. These will always be the first Illumina read and are recognized by our unique 11bp tag sequence. Thus, you need to set the following settings:
16 | 
17 | ```
18 | file1:
19 |     name: /mnt/storage2/temp_workdir/Undetermined_S0_L003_R1_001.fastq.gz
20 |     base_definition:
21 |       - cDNA(23-150)
22 |       - UMI(12-19)
23 |     find_pattern: ATTGCGCAATG
24 | ```
25 | 
26 | You can find an [example YAML file here](https://github.com/sandberg-lab/Smart-seq3/blob/master/allele_level_expression/mouse_cross.yaml).
27 | 
28 | Note that we advise caution when using STAR's 2-pass mapping mode, as we have observed some spurious novel splice junctions being used that may distort molecule reconstructions.
29 | 
30 | ### 2) Scripts to reconstruct RNA molecules based on the zUMIs prepared BAM files.
31 | Using our Python script [*stitcher.py*](https://github.com/AntonJMLarsson/stitcher.py/tree/57330b5af97a338d914b4504121a5d018eb2c3d5), we reconstruct RNA molecules in silico based on the read pair alignments in the zUMIs-generated BAM files. Note that for RNA reconstruction, paired-end sequencing data is required. This step results in a new BAM file where each entry is a reconstructed molecule.
32 | 
33 | https://github.com/AntonJMLarsson/stitcher.py/tree/57330b5af97a338d914b4504121a5d018eb2c3d5
34 | 
35 | ### 3) Scripts to assign reconstructed RNA molecules to allelic origins.
36 | We provide a stand-alone Rscript that assigns molecules to their allele of origin.
37 | 
38 | https://github.com/sandberg-lab/Smart-seq3/tree/master/allele_level_expression
39 | 
40 | ### 4) Scripts to assign reconstructed RNA molecules to transcript isoforms.
41 | Using a [couple of python scripts](https://github.com/sandberg-lab/Smart-seq3/tree/master/ss3iso), we assign each RNA molecule to a set of compatible isoforms (including unique assignments). The resulting assignments are reported in tab-delimited text files.
42 | 
43 | https://github.com/sandberg-lab/Smart-seq3/tree/master/ss3iso
44 | 
45 | ### 5) Notebooks.
46 | Here we post notebooks that show the analysis workflows for selected analyses from Hagemann-Jensen et al. as R or Python Jupyter notebooks.
47 | 
48 | 


--------------------------------------------------------------------------------
/allele_level_expression/CAST.SNPs.validated.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sandberg-lab/Smart-seq3/5d5938475039f5c98d0d94faf89db917f66fe8ac/allele_level_expression/CAST.SNPs.validated.vcf.gz


--------------------------------------------------------------------------------
/allele_level_expression/README.md:
--------------------------------------------------------------------------------
 1 | # CAST and C57/BL6 allele-specific expression
 2 | Here we provide tools to classify molecules to their allele of origin for the CAST X C57/BL6 F1 mouse cells.
 3 | 
 4 | First, sequencing data should be processed using zUMIs from fastq files to aligned bam files and UMI count tables.
 5 | Of note, the genome positions with strain-specific variation should be masked with N to avoid a mapping bias towards the reference allele.
 6 | [SNPsplit](https://github.com/FelixKrueger/SNPsplit) can be used to generate the N-masked genome fasta file.  
 7 | 
 8 | 
 9 |  `zUMIs-master.sh -y mouse_cross.yaml`
10 | 
11 | Based on the zUMIs output, you can run the allele-specific expression script.
12 | It requires only the config file used for zUMIs and a VCF file of CAST specific SNPs.
13 | In this repository, we provide the VCF file used for the publication analyses. This file contains CAST/EiJ strain specific SNPs, obtained from the
14 | mouse genome project dbSNP version 142 and filtered for variants clearly observed in existing CAST/EiJ x C57/Bl6J F1 data.
15 | 
16 |  `Rscript get_variant_overlap_CAST.R --help`
17 |  
18 |  `Rscript get_variant_overlap_CAST.R --yaml mouse_cross.yaml --vcf CAST.SNPs.validated.vcf.gz`
19 | 
20 | 
21 | For users with a working zUMIs installation, the script does not require additional dependencies.
22 | The output contains files for both directly assigned molecules and total UMI counts broken down by the observed gene-wise allele-fractions.
23 | 


--------------------------------------------------------------------------------
/allele_level_expression/get_variant_overlap_CAST.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # packages ----------------------------------------------------------------
  3 | suppressPackageStartupMessages(library(data.table))
  4 | #suppressPackageStartupMessages(library(vcfR))
  5 | suppressPackageStartupMessages(library(GenomicAlignments))
  6 | suppressPackageStartupMessages(library(GenomicRanges))
  7 | suppressPackageStartupMessages(library(optparse))
  8 | suppressPackageStartupMessages(library(yaml))
  9 | 
 10 | Sys.time()
 11 | print("allele level expression v1.1")
 12 | 
 13 | # number crunching function -----------------------------------------------
 14 | 
 15 | load_reads <- function(is_UMI, cellBCs, filename, return_map, read_layout){
 16 |   # Load one per-chromosome read table written by the extraction step.
 17 |   #   is_UMI      TRUE if the table carries a UMI column (UB) before the gene ID
 18 |   #   cellBCs     barcodes to keep; rows with other barcodes are dropped
 19 |   #   filename    path of the tab-separated ".var_overlap.readsout" file
 20 |   #   return_map  if FALSE the file is pigz-compressed again after loading
 21 |   #   read_layout accepted for interface compatibility; SE and PE are treated alike
 22 |   # Returns a data.table of the kept reads with an added unique "readID" column.
 23 |   cols_to_read <- if( is_UMI ) c(2,4,5,6,7,8) else c(2,4,5,6,7)
 24 |   colname_vec <- c("pos","cigar","seq","BC", if( is_UMI ) "UB", "GeneID")
 25 | 
 26 |   reads <- fread(file = filename,
 27 |                  sep = "\t",
 28 |                  header = F, fill = T,
 29 |                  select = cols_to_read, #read only necessary cols
 30 |                  col.names = colname_vec)[ BC %in% cellBCs ][ ! GeneID == "" ] #directly drop unnecessary rows
 31 | 
 32 |   if( !return_map ){
 33 |     system(paste("pigz -p ",ncores,filename)) #ncores is a script-level global
 34 |   }
 35 |   # PE mates are kept as individual reads, as proper pairs are not guaranteed to be
 36 |   # adjacent; seq_len() keeps this valid for an empty table, where 1:0 would not be.
 37 |   reads[, readID := paste0("r_",seq_len(nrow(reads)))]
 38 |   return(reads)
 39 | }
 40 | 
 41 | variant_parsing <- function(reads, variant_positions, is_UMI){
 42 |   # Look up the base each read shows at every variant position it covers.
 43 |   #   reads              data.table with pos, cigar, seq, BC, GeneID, readID (and UB if is_UMI)
 44 |   #   variant_positions  integer reference coordinates of the variants
 45 |   # Returns a data.table (obs_base, POS, BC, GeneID, readID[, UB]) restricted to A/C/G/T calls.
 46 |   aligned_ops <- c("M", "=", "X") #only these CIGAR operations are turned into ranges
 47 |   ref_ranges <- cigarRangesAlongReferenceSpace(reads$cigar, pos=reads$pos, ops=aligned_ops)
 48 |   query_ranges <- cigarRangesAlongQuerySpace(reads$cigar, ops=aligned_ops)
 49 |   gc(verbose = F)
 50 | 
 51 |   #flatten the per-read range lists, remembering which read each range came from
 52 |   read_of_range <- togroup(PartitioningByWidth(ref_ranges))
 53 |   ref_ranges <- unlist(ref_ranges, use.names=FALSE)
 54 |   query_ranges <- unlist(query_ranges, use.names=FALSE)
 55 |   ref_minus_query <- start(ref_ranges) - start(query_ranges)
 56 | 
 57 |   #translate every variant/range overlap into a coordinate within the read sequence
 58 |   overlaps <- findOverlaps(variant_positions, ref_ranges)
 59 |   variant_idx <- queryHits(overlaps)
 60 |   range_idx <- subjectHits(overlaps)
 61 |   read_idx <- read_of_range[range_idx]
 62 |   pos_in_read <- variant_positions[variant_idx] - ref_minus_query[range_idx]
 63 | 
 64 |   out_vars <- data.table(
 65 |     obs_base = subseq(reads$seq[read_idx], start=pos_in_read, width=1L),
 66 |     POS = variant_positions[variant_idx]
 67 |   )
 68 |   out_vars[, c("BC","GeneID","readID") := reads[read_idx, c("BC", "GeneID", "readID"), with = F] ]
 69 |   if( is_UMI ) out_vars[, UB := reads$UB[read_idx] ]
 70 | 
 71 |   return(out_vars[obs_base %in% c("A","C","G","T") ])
 72 | }
 73 | 
 74 | calc_coverage_new_return_map <- function(vcf_chunk, out, cellBCs, type, read_layout){
 75 |   # Assign each molecule (or read, without UMIs) of one chromosome to an allele.
 76 |   #   vcf_chunk  variants of a single chromosome (columns CHROM, POS, REF, ALT)
 77 |   #   out        prefix of the "<chr>.var_overlap.readsout" read files
 78 |   #   type       character vector; UMI mode is used if any element matches "UMIs"
 79 |   # Returns one row per BC/GeneID/UB with its UMIcall in UMI mode, otherwise one row
 80 |   # per BC/GeneID/readID with its readcall. The read file is left uncompressed.
 81 |   chr <- unique(vcf_chunk$CHROM)
 82 |   is_UMI <- any(grepl("UMIs", type))
 83 |   print(paste("Starting to read data for chr ", chr))
 84 |   print(Sys.time()) #a bare Sys.time() is not auto-printed inside a function
 85 | 
 86 |   reads <- load_reads(is_UMI = is_UMI, cellBCs = cellBCs, filename = paste0(out,chr,".var_overlap.readsout"), return_map = TRUE, read_layout = read_layout)
 87 | 
 88 |   print("Reading complete, processing reads & cigar values...")
 89 |   print(Sys.time())
 90 | 
 91 |   out_vars <- variant_parsing(reads, variant_positions = as.integer(vcf_chunk$POS), is_UMI = is_UMI)
 92 |   out_vars <- merge(out_vars,vcf_chunk,by = "POS" )
 93 | 
 94 |   #classify every observed base as reference (c57), alternative (cast) or neither
 95 |   out_vars[, basecall := "other"][obs_base == REF, basecall := "c57"][obs_base == ALT, basecall := "cast"]
 96 | 
 97 |   if( is_UMI ){ #only the table that is returned gets computed
 98 |     return(out_vars[! UB == "" , .(UMIcall = read_decision(basecall)), by = c("BC","GeneID","UB")])
 99 |   }
100 |   return(out_vars[, .(readcall = read_decision(basecall)), by = c("BC","GeneID","readID")])
101 | }
102 | 
103 | calc_coverage_new <- function(vcf_chunk, out, cellBCs, type, read_layout){
104 |   # Count allele-assigned reads (and UMIs) per cell barcode and gene for one chromosome.
105 |   #   vcf_chunk    variants of a single chromosome (columns CHROM, POS, REF, ALT)
106 |   #   out          prefix of the "<chr>.var_overlap.readsout" read files
107 |   #   cellBCs      cell barcodes to keep
108 |   #   type         character vector; UMI counts are added if any element matches "UMIs"
109 |   #   read_layout  passed on to load_reads()
110 |   # Returns a data.table (chr, BC, GeneID, c57, cast, other, total, CAST_fraction) of read
111 |   # counts; in UMI mode list(reads = <that table>, UMIs = <the same layout for UMIs>).
112 |   # The read file is compressed again by load_reads() (return_map = FALSE).
113 |   chr <- unique(vcf_chunk$CHROM)
114 |   is_UMI <- any(grepl("UMIs", type))
115 |   print(paste("Starting to read data for chr ", chr))
116 |   print(Sys.time()) #a bare Sys.time() is not auto-printed inside a function
117 |   reads <- load_reads(is_UMI = is_UMI, cellBCs = cellBCs, filename = paste0(out,chr,".var_overlap.readsout"), return_map = FALSE, read_layout = read_layout)
118 | 
119 |   print("Reading complete, processing reads & cigar values...")
120 |   print(Sys.time())
121 | 
122 |   out_vars <- variant_parsing(reads, variant_positions = as.integer(vcf_chunk$POS), is_UMI = is_UMI)
123 |   out_vars <- merge(out_vars,vcf_chunk,by = "POS" )
124 | 
125 |   #classify every observed base as reference (c57), alternative (cast) or neither
126 |   out_vars[, basecall := "other"][obs_base == REF, basecall := "c57"][obs_base == ALT, basecall := "cast"]
127 | 
128 |   #one call per read and, in UMI mode, per molecule; the per-base table is then freed
129 |   read_calls <- out_vars[, .(readcall = read_decision(basecall)), by = c("BC","GeneID","readID")]
130 |   UMI_calls <- if( is_UMI ) out_vars[! UB == "" , .(UMIcall = read_decision(basecall)), by = c("BC","GeneID","UB")]
131 |   rm(out_vars)
132 | 
133 |   #turns a table of calls into one row of allele counts per BC and gene
134 |   count_calls <- function(calls, call_col){
135 |     counts <- calls[, .N, by = c("BC","GeneID",call_col)][, chr := chr]
136 |     counts <- dcast(counts, formula = as.formula(paste("chr+BC+GeneID ~", call_col)), value.var = "N", fill = 0)
137 |     #a call class that never occurred on this chromosome has no column after dcast
138 |     for(cl in setdiff(c("c57","cast","other"), names(counts))) counts[, (cl) := 0]
139 |     setcolorder(counts, c("chr","BC","GeneID","c57","cast","other"))
140 |     counts[, total := c57+cast+other]
141 |     counts <- counts[other/total < 0.33] #keep pairs where less than 33% of the calls are "other"
142 |     counts[, CAST_fraction := cast/(cast+c57), by = c("BC","GeneID")]
143 |     return(counts)
144 |   }
145 | 
146 |   out_dat <- count_calls(read_calls, "readcall")
147 |   rm(read_calls)
148 |   print("Done!")
149 |   if( !is_UMI ){
150 |     return(out_dat)
151 |   }
152 |   return(list(reads = out_dat, UMIs = count_calls(UMI_calls, "UMIcall")))
153 | }
165 | 
166 | makeWide <- function(allele_dat, metric = c("cast","c57","CAST_fraction")){
167 |   # Reshape one metric column of the long allele table into a GeneID x BC table.
168 |   # Missing BC/gene pairs become 0 for the count metrics and NA for CAST_fraction.
169 |   metric <- match.arg(metric) #resolves the default vector to one valid metric name
170 |   dcast(allele_dat[, c("BC","GeneID",metric), with = F], formula = GeneID ~ BC, fill = if(metric == "CAST_fraction") NA else 0, value.var = metric)
171 | }
172 | 
173 | makeUMIs <- function(dge_path, CASTfracts){
174 |   # Split the total exonic UMI counts of a dgecounts object between the two alleles
175 |   # in proportion to the supplied CAST fractions.
176 |   #   dge_path    path to the .rds file; its $umicount$exon$all matrix is used
177 |   #   CASTfracts  wide table: row identifiers in the first column, fractions in the others
178 |   # Returns list(umis_CAST, umis_BL6): rounded count matrices over shared rows and columns.
179 |   umi_counts <- as.matrix(readRDS(dge_path)$umicount$exon$all)
180 | 
181 |   #move the identifiers from the first column into the row names, then make the rest numeric
182 |   cast_fract <- as.matrix(CASTfracts)
183 |   row.names(cast_fract) <- cast_fract[,1]
184 |   cast_fract <- cast_fract[,-1]
185 |   class(cast_fract) <- "numeric"
186 | 
187 |   #restrict both matrices to the genes and cells they have in common
188 |   genes <- intersect(row.names(cast_fract),row.names(umi_counts))
189 |   cells <- intersect(colnames(cast_fract),colnames(umi_counts))
190 |   cast_fract <- cast_fract[genes,cells]
191 |   umi_counts <- umi_counts[genes,cells]
192 | 
193 |   allele_counts <- list(
194 |     umis_CAST = round(cast_fract*umi_counts,0),
195 |     umis_BL6 = round((1-cast_fract)*umi_counts,0)
196 |   )
197 |   #entries without any UMI are 0 even where the CAST fraction is NA
198 |   allele_counts <- lapply(allele_counts, function(m) { m[umi_counts == 0] <- 0; m })
199 |   return(allele_counts)
200 | }
201 | 
202 | read_decision <- function(basecalls){
203 |   # Collapse the allele calls of one read or molecule into a single call: the most
204 |   # frequent call if it makes up at least 66% of all calls, otherwise "other".
205 |   # A single call is returned unchanged.
206 |   if(length(basecalls) == 1){
207 |     return(basecalls)
208 |   }
209 |   observed <- unique(basecalls)
210 |   n_per_call <- tabulate(match(basecalls, observed))
211 |   top <- which.max(n_per_call)
212 |   if(n_per_call[top]/sum(n_per_call) >= 0.66){
213 |     return(observed[top])
214 |   }
215 |   return("other")
216 | }
217 | 
218 | check_nonUMIcollapse <- function(seqfiles){
219 |   # Decide whether to run in UMI or no-UMI mode.
220 |   #   seqfiles  list of sequence file entries, each optionally with a base_definition
221 |   # Returns "UMI" if any base_definition element starts with "UMI", otherwise "nonUMI".
222 |   defines_UMI <- vapply(seqfiles,
223 |                         function(x) !is.null(x$base_definition) && any(grepl("^UMI",x$base_definition)),
224 |                         logical(1))
225 |   if(any(defines_UMI)){
226 |     return("UMI")
227 |   }
228 |   return("nonUMI")
229 | }
230 | 
231 | 
232 | # startup variables -------------------------------------------------------
233 | # Command line options; --yaml and --vcf are mandatory, the others have defaults.
234 | option_list <- list(
235 |   make_option(c("-y", "--yaml"), type="character",
236 |               help="zUMIs YAML config file of the processed run. Mandatory"),
237 |   make_option(c("-v", "--vcf"), type="character",
238 |               help="SNP position list (VCF file) with variant annotation. Mandatory"),
239 |   make_option(c("-t","--tagBC"), type="character", default="BC",
240 |               help="Bam tag containing cell barcodes. Default: BC"),
241 |   make_option(c("-m","--minCount"), type="integer", default=0,
242 |               help="Cutoff for minimum coverage in a Cell/Gene pair. Default: 0"),
243 |   make_option(c("-u", "--umi_map"), action="store_true", default=FALSE,
244 |               help="Print UMI-allele mapping table")
245 | )
246 | opt <- parse_args(OptionParser(option_list=option_list))
247 | 
248 | # both the config file and the variant list are needed by the steps below
249 | if (is.null(opt$yaml) || is.null(opt$vcf)) {
250 |   stop("All mandatory parameters must be provided. See script usage (--help)")
251 | }
252 | 
253 | 
254 | #####
255 | #testing
256 | #####
257 | #BCtag <- "BC"
258 | #path_snps <- "/home/chrisz/resources/genomes/Mouse/old_validated_cast_c57_snps.mm10.vcf"
259 | #path_snps <- "/home/chrisz/resources/genomes/Mouse/CAST.SNPs.superset.vcf.gz"
260 | #minC <- 0
261 | #opt   <- read_yaml("/home/perj/moved_data/mmu/per_fibroblasts_final/zUMIs_rerun/zUMIs_rerun.yaml")
262 | #outpath <- paste0(opt$out_dir,"/zUMIs_output/allelic/")
263 | #####
264 | #/testing
265 | #####
266 | 
267 | 
268 | # keep the command line values, since opt is replaced by the parsed YAML config below
269 | BCtag <- opt$tagBC
270 | path_snps <- opt$vcf
271 | minC <- opt$minCount
272 | map_flag <- opt$umi_map
273 | 
274 | opt <- read_yaml(opt$yaml)
275 | 
276 | # all results go to <out_dir>/zUMIs_output/allelic/, prefixed with "<project>."
277 | outpath <- paste0(opt$out_dir,"/zUMIs_output/allelic/")
278 | if(!dir.exists(outpath)){
279 |   # dir.create also builds missing parent directories and copes with spaces in the path
280 |   dir.create(outpath, recursive = TRUE)
281 | }
282 | outpath <- paste0(outpath,opt$project,".")
283 | 
284 | ncores <- opt$num_threads
285 | cellBCs <- paste0(opt$out_dir,"/zUMIs_output/",opt$project,"kept_barcodes.txt")
286 | setwd(opt$out_dir)
287 | setDTthreads(ncores)
288 | UMIdata_flag <- check_nonUMIcollapse(opt$sequence_files)
289 | 
290 | 
291 | # read stuff --------------------------------------------------------------
292 | cellBCs <- fread(cellBCs)$XC # barcodes are taken from column XC of the kept_barcodes file
293 | 
294 | print("Reading Variants...")
295 | # header lines are skipped and only VCF columns 1, 2, 4 and 5 (CHROM, POS, REF, ALT) are read
296 | vcf_file <- shQuote(path.expand(path_snps)) # quoted so that unusual characters in the path survive the shell
297 | if(grepl(path_snps, pattern = "\\.gz$")){
298 |   vcf <- fread(cmd = paste("zcat",vcf_file,"| grep -v '^#' | cut -f1,2,4,5"), col.names = c("CHROM","POS","REF","ALT"))
299 | }else{
300 |   vcf <- fread(cmd = paste("grep -v '^#'",vcf_file,"| cut -f1,2,4,5"), col.names = c("CHROM","POS","REF","ALT"))
301 | }
302 | print("Done!")
303 | Sys.time()
304 | 
305 | # the Y chromosome is left out of the analysis
306 | chroms_todo <- unique(vcf$CHROM)
307 | chroms_todo <- chroms_todo[! chroms_todo %in% c("Y","chrY")]
308 | 
309 | 
310 | # detect if zUMIs >= 2.6.0 is used ----------------------------------------
311 | # Pick the BAM file to read: zUMIs >= 2.6.0 writes "GeneTagged" BAMs with the gene in tag GE,
312 | # otherwise the featureCounts BAMs with the gene in tag XT are used. A "UBcorrected"/"UBfix"
313 | # BAM is preferred if present (hammingflag = TRUE).
314 | bam_prefix <- paste0(opt$out_dir,"/",opt$project)
315 | bam_new <- paste0(bam_prefix,".filtered.Aligned.GeneTagged.sorted.bam")
316 | bam_new_UBfix <- paste0(bam_prefix,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
317 | bam_old <- paste0(bam_prefix,".filtered.tagged.Aligned.out.bam.ex.featureCounts.bam")
318 | bam_old_UBfix <- paste0(bam_prefix,".filtered.tagged.Aligned.out.bam.ex.featureCounts.UBfix.bam")
319 | 
320 | if( file.exists(bam_new) || file.exists(bam_new_UBfix) ){
321 |   genetag <- "GE"
322 |   hammingflag <- file.exists(bam_new_UBfix)
323 |   path_bam <- if(hammingflag) bam_new_UBfix else bam_new
324 | }else{
325 |   genetag <- "XT"
326 |   hammingflag <- file.exists(bam_old_UBfix)
327 |   # the path itself is needed here, not the logical result of file.exists()
328 |   path_bam <- if(hammingflag) bam_old_UBfix else bam_old
329 | }
330 | 
331 | # extract unique maps per chromosome  -------------------------------------
332 | if( file.exists( paste0(outpath,chroms_todo[[1]],".var_overlap.readsout") ) | file.exists( paste0(outpath,chroms_todo[[1]],".var_overlap.readsout.gz") ) ){ # read files of an earlier run exist (only the first chromosome is checked): reuse them
333 |   zipped_files <- list.files(path=paste0(opt$out_dir,"/zUMIs_output/allelic/"), pattern=".var_overlap.readsout.gz", full.names=T)
334 |   print("Decompressing reads...")
335 |   for(f in zipped_files){ # unpack every compressed read file found in the output folder
336 |     system(paste("pigz -d -p",ncores,f))
337 |   }
338 | }else{ # nothing extracted yet: stream the BAM through samtools and split it by chromosome
339 |   print("Extracting reads...")
340 |   samtoolsexc <- opt$samtools_exec
341 |   if(UMIdata_flag == "UMI"){ # UMI data: cut keeps SAM fields 3,4,5,6,10 plus three tag fields (12-14)
342 |     if(hammingflag){ # same as the other UMI branch plus an awk step that swaps fields 7 and 8; NOTE(review): awk has no IFS variable, FS was presumably intended - confirm
343 |       samtools_cmd1 <- "view -@2 -x QB -x QU -x ES -x IS -x EN -x IN -x GI -x BX -x UX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x vA -x vG -x vW"
344 |       samtools_cmd2 <- paste0(" | cut -f3,4,5,6,10,12,13,14 | grep '",genetag,"' | sed 's/",genetag,":Z://' | sed 's/UB:Z://' | sed 's/",BCtag,":Z://' | awk 'BEGIN{IFS=\"\t\";OFS=\"\t\";}{print $1,$2,$3,$4,$5,$6,$8,$7;}' | awk '{if($3 == \"255\"){print > \"",outpath,"\"$1\".var_overlap.readsout\"}}'")
345 |     }else{
346 |       samtools_cmd1 <- "view -@2 -x QB -x QU -x ES -x IS -x EN -x IN -x GI -x BX -x UX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x vA -x vG -x vW"
347 |       samtools_cmd2 <- paste0(" | cut -f3,4,5,6,10,12,13,14 | grep '",genetag,"' | sed 's/",genetag,":Z://' | sed 's/UB:Z://' | sed 's/",BCtag,":Z://' | awk '{if($3 == \"255\"){print > \"",outpath,"\"$1\".var_overlap.readsout\"}}'")
348 |     }
349 |   }else{ # no UMIs: the UB tag is removed by samtools as well and only two tag fields (12-13) are kept
350 |     samtools_cmd1 <- "view -@2 -x QB -x QU -x ES -x IS -x EN -x IN -x GI -x BX -x UX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x vA -x vG -x vW -x UB"
351 |     samtools_cmd2 <- paste0(" | cut -f3,4,5,6,10,12,13 | grep '",genetag,"' | sed 's/",genetag,":Z://' | sed 's/",BCtag,":Z://' | awk '{if($3 == \"255\"){print > \"",outpath,"\"$1\".var_overlap.readsout\"}}'")
352 |   }
353 |   samtools_cmd <- paste(samtoolsexc,samtools_cmd1,path_bam,samtools_cmd2)
354 |   system(samtools_cmd) # the final awk writes lines whose third field is 255 to <outpath><chr>.var_overlap.readsout
355 | }
356 | 
357 | print("Done")
358 | Sys.time()
359 | 
360 | 
361 | # crunch data ---------------------------------------------------------------
362 | 
363 | vcf_list <- split(vcf[CHROM %in% chroms_todo], by = "CHROM") # one variant table per chromosome
364 | has_UMIs <- UMIdata_flag == "UMI"
365 | 
366 | if(has_UMIs && map_flag){
367 |   # optional table with the allele call of every single molecule
368 |   print("Producing molecule assignment map...")
369 |   map_out <- rbindlist(lapply(vcf_list, function(x) calc_coverage_new_return_map(vcf_chunk = x, out = outpath, cellBCs = cellBCs, type = "UMIs", read_layout = opt$read_layout )))
370 |   fwrite(map_out, file = paste0(outpath,"molecule_assignments.txt" ), sep= "\t", quote = F)
371 |   print("Continuing with allelic expression tables...")
372 | }
373 | 
374 | # allele counts per chromosome, combined into one table over all chromosomes
375 | count_types <- if(has_UMIs) c("reads","UMIs") else "reads"
376 | out_list <- lapply(vcf_list, function(x) calc_coverage_new(vcf_chunk = x, out = outpath, cellBCs = cellBCs, type = count_types, read_layout = opt$read_layout ))
377 | if(has_UMIs){
378 |   out_reads <- rbindlist(lapply(out_list, function(x) x$reads))
379 |   out_UMIs <- rbindlist(lapply(out_list, function(x) x$UMIs))
380 | }else{
381 |   out_reads <- rbindlist(out_list)
382 | }
384 | 
385 | 
386 | print("Finalizing converting & output ...")
387 | Sys.time()
388 | 
389 | # writes one table as a tab-separated file named <outpath><suffix>
390 | write_tab <- function(tab, suffix, na = "") fwrite(tab, file = paste0(outpath,suffix), sep = "\t", na = na, quote = F)
391 | 
392 | # reads: keep BC/gene pairs with at least minC allele-assigned reads, then one GeneID x BC table per metric
393 | out_reads <- out_reads[ (cast+c57) >= minC ]
394 | CAST_reads <- makeWide(allele_dat = out_reads, metric = "cast")
395 | BL6_reads <- makeWide(allele_dat = out_reads, metric = "c57")
396 | fract_CAST <- makeWide(allele_dat = out_reads, metric = "CAST_fraction")
397 | 
398 | print("Processing complete, writing output...")
399 | Sys.time()
400 | write_tab(CAST_reads, "CAST_reads.txt")
401 | write_tab(BL6_reads, "BL6_reads.txt")
402 | write_tab(fract_CAST, "fract_CAST_reads.txt", na = "NA")
403 | 
404 | if(UMIdata_flag == "UMI"){
405 |   # directly counted UMIs, filtered and reshaped like the reads
406 |   out_UMIs <- out_UMIs[ (cast+c57) >= minC ]
407 |   CAST_UMIs <- makeWide(allele_dat = out_UMIs, metric = "cast")
408 |   BL6_UMIs <- makeWide(allele_dat = out_UMIs, metric = "c57")
409 |   fract_CAST_UMIs <- makeWide(allele_dat = out_UMIs, metric = "CAST_fraction")
410 |   write_tab(CAST_UMIs, "CAST_direct_UMIs.txt")
411 |   write_tab(BL6_UMIs, "BL6_direct_UMIs.txt")
412 |   write_tab(fract_CAST_UMIs, "fract_CAST_direct_UMIs.txt", na = "NA")
413 | 
414 |   # total UMI counts split into allele counts using the read-derived CAST fractions
415 |   dge <- paste0(opt$out_dir,"/zUMIs_output/expression/",opt$project,".dgecounts.rds")
416 |   UMIs <- makeUMIs(dge_path = dge, fract_CAST)
417 |   write.table(UMIs$umis_CAST, file = paste0(outpath,"CAST_fractional_UMIs.txt" ), sep= "\t", quote = F)
418 |   write.table(UMIs$umis_BL6, file = paste0(outpath,"BL6_fractional_UMIs.txt" ), sep= "\t", quote = F)
419 | }
420 | 
421 | paste("DONE")
422 | Sys.time()
423 | 


--------------------------------------------------------------------------------
/allele_level_expression/mouse_cross.yaml:
--------------------------------------------------------------------------------
 1 | project: Smartseq3_Fibroblasts
 2 | sequence_files:
 3 |   file1:
 4 |     name: /mnt/storage2/temp_workdir/Undetermined_S0_L003_R1_001.fastq.gz
 5 |     base_definition:
 6 |       - cDNA(23-150)
 7 |       - UMI(12-19)
 8 |     find_pattern: ATTGCGCAATG
 9 |   file2:
10 |     name: /mnt/storage2/temp_workdir/Undetermined_S0_L003_R2_001.fastq.gz
11 |     base_definition:
12 |       - cDNA(1-150)
13 |   file3:
14 |     name: /mnt/storage2/temp_workdir/Undetermined_S0_L003_I1_001.fastq.gz
15 |     base_definition:
16 |       - BC(1-8)
17 |   file4:
18 |     name: /mnt/storage2/temp_workdir/Undetermined_S0_L003_I2_001.fastq.gz
19 |     base_definition:
20 |       - BC(1-8)
21 | reference:
22 |   STAR_index: /mnt/storage1/genomes/Mouse_CAST_Nmasked/STAR5idx_noGTF/
23 |   GTF_file: /mnt/storage1/genomes/Mouse/Mus_musculus.GRCm38.91.chr.clean.gtf
24 |   additional_STAR_params: '--limitSjdbInsertNsj 2000000 --clip3pAdapterSeq CTGTCTCTTATACACATCT'
25 |   additional_files:
26 | out_dir: /mnt/storage2/temp_workdir/zUMIs_nmask/
27 | num_threads: 50
28 | mem_limit: 100
29 | filter_cutoffs:
30 |   BC_filter:
31 |     num_bases: 3
32 |     phred: 20
33 |   UMI_filter:
34 |     num_bases: 3
35 |     phred: 20
36 | barcodes:
37 |   barcode_num: ~
38 |   barcode_file: /mnt/storage2/temp_workdir/expected_barcodes.txt
39 |   automatic: no
40 |   BarcodeBinning: 1
41 |   nReadsperCell: 100
42 |   demultiplex: yes
43 | counting_opts:
44 |   introns: yes
45 |   downsampling: '0'
46 |   strand: 0
47 |   Ham_Dist: 1
48 |   write_ham: yes
49 |   velocyto: no
50 |   primaryHit: yes
51 |   twoPass: no
52 | make_stats: yes
53 | which_Stage: Filtering
54 | 


--------------------------------------------------------------------------------
/ss3iso/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019-2020 Ping Chen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ss3iso/README.md:
--------------------------------------------------------------------------------
 1 | # Welcome to ss3iso 
 2 | 
 3 | <img src="https://github.com/sandberg-lab/Smart-seq3/blob/master/ss3iso/isoform_reconstruction.png" alt="UMI reads" width="600"/>
 4 | 
 5 | ss3iso is a Python pipeline developed for isoform reconstruction of UMI-linking fragments from Smart-seq3. For detailed information, please read our paper [Single-cell RNA counting at allele- and isoform-resolution using Smart-seq3](https://www.biorxiv.org/content/10.1101/817924v1).
 6 | 
 7 | ss3iso takes as input the BAM file produced by [zUMIs](https://github.com/sdparekh/zUMIs), tagged with corrected cell and UMI barcodes. The pipeline also requires GTF annotations (Ensembl, RefSeq or Gencode); the annotation source must be specified by **gtf_source** in the configuration file.
 8 | 
 9 | ## Dependencies
10 | 
11 | Make sure the following software and Python packages are installed before running ss3iso.
12 | 
13 | ```
14 | Python3
15 | tabix
16 | bedtools (v2.26.0)
17 | samtools
18 | 
19 | optparse (python module)
20 | glob (python module)
21 | configparser (python module)
22 | re (python module)
23 | pybedtools (python module)
24 | subprocess (python module)
25 | pysam (python module)
26 | pandas (python module)
27 | collections (python module)
28 | numpy (python module)
29 | multiprocessing (python module)
30 | functools (python module)
31 | ```
32 | 
33 | ## Installation
34 | 
35 | ss3iso lives in the `ss3iso` folder of the Smart-seq3 repository. Clone the repository to your preferred folder on a computing server using the following command. No further installation is needed.
36 | 
37 | ``` git clone https://github.com/sandberg-lab/Smart-seq3.git ```
38 | 
39 | ## Usage
40 | 
41 | Execute ss3iso pipeline using the following command line.
42 | ```
43 | python ss3_isoform.py -i [path/to/inputBAM] -c [path/to/configuration file] -e [experiment] -o [path/to/output directory] -p [number of processes] -s [species] -P -Q
44 | ```
45 | 
46 | Options:
47 | ```
48 | -i, --inputBAM: input zUMIs BAM path. Note: Use '*filtered.tagged.Aligned.out.bam.ex.featureCounts.UBfix.sort.bam' generated by zUMIs. Every read should have a UB:Z tag.
49 | -c, --config: the required pipeline configuration file
50 | -e, --experiment: the name of the experiment/study
51 | -o, --outputDir: the output directory
52 | -p, --process: the number of processes for parallel computing (default: 8)
53 | -s, --species: the species under study (default: hg38)
54 | -P, --Preprocess: run preprocessing on input BAM
55 | -Q, --Quantification: run isoform reconstruction and quantification
56 | ```
57 | 
58 | Example contents in the input BAM:
59 | ```
60 | NB502120:154:HVG7JBGXB:2:21104:11500:9869       163     1       14409   3       85M65S  =       14692   410     GCTCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCATGAAGTACAGGCAGACAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCACTCCTTG  AAAAAEEEEEEEEEEEEEEEEEEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEAEEAEEEEE/EEAEEEEEEEAEAAAAE<AEEEAEEA/EE<EAE/AEE/AEAAEEEE6//  NH:i:2  HI:i:1  AS:i:206        nM:i:2  BX:Z:AAGCCGTTTGAACGCT   BC:Z:AAGCCGTTTGAACGCT   UX:Z:TAATCTCT   XS:Z:Unassigned_Ambiguity       UB:Z:TAATCTCT
61 | NB502120:154:HVG7JBGXB:1:13112:25990:9712       163     1       14414   255     150M    =       14749   602     GTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACTGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAA  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAE/EEEEE<AAAAEEEEEE/EEAEEEEAEEEEEEEE<EEEEAA<EEEEEE/EAEEE/EAEEAAE<AA//6A/EA<<6/6<6  NH:i:1  HI:i:1  AS:i:275        nM:i:1  BX:Z:AAGCCGTTGAGGTTAG   BC:Z:AAGCCGTTGAGGTTAG   UX:Z:GCCAAGGG   XS:Z:Assigned3  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GCCAAGGG
62 | NB502120:154:HVG7JBGXB:4:23406:11214:8076       163     1       14414   3       98M2D51M1S      =       14692   405     GTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGAAGTACAGGCAGACAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGC  6AAAAEEEAEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEE<E/EEEEEEEE6E/EEAEEEEEEAEEAA<<AAA/AEE<AEAEAEEEEEE<E//E//EAAEEEE//66<AEAAEEEEE6AE<A/6A6/AA<A</<6A//  NH:i:2  HI:i:1  AS:i:262        nM:i:3  BX:Z:AAGCCGTTGAGGTTAG   BC:Z:AAGCCGTTGAGGTTAG   UX:Z:GTGACTCT   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GTGACTCT
63 | NB502120:154:HVG7JBGXB:1:12205:16478:4384       163     1       14414   3       80M70S  =       14692   405     GTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCATGAAGTACAGGCAGACAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCACTCCTTGAAGCT  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEAEEEEEAEEEEEEE/EEEEEE6AAEAEEA/EEEEEEEEEEEEEEAEEEAAEEEA<E<EEEA<AAEEEEA  NH:i:2  HI:i:1  AS:i:201        nM:i:2  BX:Z:AAGCCGTTTGAACGCT   BC:Z:AAGCCGTTTGAACGCT   UX:Z:TAATCTCT   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:TAATCTCT
64 | NB502120:154:HVG7JBGXB:1:21202:15138:15924      163     1       14414   255     42M5I102M1S     =       14683   405     GTTCTTTATTGATTGGTGTGCCATTTTCTCTGGAAGCCTCTTTAGAGAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCCCCCCAGCTGTGTGGCCTCAGGCCAGCCTTCCGCTCC  AAAAAAEEEEEEEEEAEEEEEEEEEEEAE//EEEEEEEEEE<EAEEEEEE/EEEEEEEEEEEEEEEEEEEE6EAAEEEEEEEEEEEEEAE<AEAE/EEEE/<EEEEEEEEAEE/<<<</EA6/6<////AAAEE/<EE/E</6AAAAAAA  NH:i:1  HI:i:1  AS:i:216        nM:i:9  BX:Z:GCATGTCTGAACCTGT   BC:Z:GCATGTCTGAACCTGT   UX:Z:GCATCTGG   XS:Z:Assigned3  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GCATCTGG
65 | NB502120:154:HVG7JBGXB:4:11511:5522:12140       163     1       14414   3       80M70S  =       14668   382     GTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCATGAAGTACAGGCAGACAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCACTCCTTGAAGCT  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE6E6AAAEA<EEAAE<E<EEEEEEEEEEAEEE<E<EAAA<A<AAEEEA<  NH:i:2  HI:i:1  AS:i:204        nM:i:1  BX:Z:TACCGTCTTAGCAAGC   BC:Z:TACCGTCTTAGCAAGC   UX:Z:AACGGTGT   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:AACGGTGT
66 | NB502120:154:HVG7JBGXB:4:21509:2326:6303        163     1       14414   255     150M    =       14692   405     GTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAA  AAAAAEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEAEEEAEEEEEEAEEEEEAEEEEEAEE<E<AAAEE/AEA/E<EEEEE<EEEEEEAEEEEAEEAEEEE<<6EE  NH:i:1  HI:i:1  AS:i:275        nM:i:0  BX:Z:TTCCGTTCCCTCTTCA   BC:Z:TTCCGTTCCCTCTTCA   UX:Z:TGCATCTC   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:TGCATCTC
67 | NB502120:154:HVG7JBGXB:4:23502:2511:18043       163     1       14414   255     150M    =       14692   405     GTTCTATATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGACCCCCCATGGAGCACAGGCAGACACAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTACCGCTCCTTGAA  AAAAA/EEEE/EE//6AE/EEEAEEEE/EAEAEAEEEEEE/EEE6EEE/AE//AAA/AEE/E6EAEE6EEEAEE/<<EEAEA/A<<///A/<<EAEEEA/EA//A/6<<A/<E/<AE<EEEA/EE//AA/E<//EA//<A<E<<A/6<AA  NH:i:1  HI:i:1  AS:i:265        nM:i:5  BX:Z:TTCCGTTCCCTCTTCA   BC:Z:TTCCGTTCCCTCTTCA   UX:Z:TGCATCTC   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:TGCATCTC
68 | NB502120:154:HVG7JBGXB:1:11308:5428:8170        163     1       14419   3       150M    =       14757   604     TTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTACCGCTCCATGAAGCAGG  A/AAAE6///AAE//E/E//E/EAEEE/E66EEEE<EEEE//EAEE6EE/EEEA/AEE/AE//E/EEEEA/EEEEE/AEE/EEEE//EEEE/E<EE/EEEEEEAEEEEE/E</AA<EAEE<EEA/E/AE<E/AE6<A/AE/////<<///  NH:i:2  HI:i:1  AS:i:268        nM:i:4  BX:Z:CGCAAGAACGGTTGTT   BC:Z:CGCAAGAACGGTTGTT   UX:Z:GCTGGGCG   XS:Z:Assigned3  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GCTGGGCG
69 | NB502120:154:HVG7JBGXB:1:21206:19630:11590      163     1       14433   3       79M2D70M1S      =       14692   386     GCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGAAGTACAGGCAGACAAGTCCCCGCCCCAGCAGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTA  AAAAAEEEEEEEEEEEEEEEEEEEEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEA<EEEEEEA<EEEEE/EEEAEAAEE/EEEEA/E/EEEEEAEA6A/<AEE/<AA/AEE<E<EE<6AAE/AE<<EEE<AAAEAA//<A/A  NH:i:2  HI:i:1  AS:i:260        nM:i:4  BX:Z:TTGGAACCCGGTTGTT   BC:Z:TTGGAACCCGGTTGTT   UX:Z:GCAGATTC   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GCAGATTC
70 | NB502120:154:HVG7JBGXB:1:21202:9431:15912       163     1       14453   255     149M1S  =       14692   365     CTTAAGAACACTGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCCTCCC  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEAEEEEEEEEEAEAEEEE</EEEEEEAAAEEEEE/EEAEA<EAEEEEAEEEEEE/A/EE<E/E<AEEE/A</AAEEE/A<<<6  NH:i:1  HI:i:1  AS:i:269        nM:i:2  BX:Z:AAGCCGTTCCACATAG   BC:Z:AAGCCGTTCCACATAG   UX:Z:GCATCCTG   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GCATCCTG
71 | NB502120:154:HVG7JBGXB:3:21405:1669:3957        163     1       14453   255     149M1S  =       14692   365     CTTAAGAACACTGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCCTCCC  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEE/EEEEEAEE6EEEEEEEEEEEAEEEEEEEEE/EE//EE/EEAE/A/<AEE/EEEA<EA/E/EE<<EAE/EEAEEE<A<AAEEE//<E<<A</<EEE6/A<EEE<AAA  NH:i:1  HI:i:1  AS:i:269        nM:i:2  BX:Z:AAGCCGTTCCACATAG   BC:Z:AAGCCGTTCCACATAG   UX:Z:GCATCCTG   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GCATCCTG
72 | NB502120:154:HVG7JBGXB:1:22107:18366:16253      163     1       14453   3       59M2D90M1S      =       14692   366     CTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGAAGTACAGGCAGACAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTAGTTCCATCACCCCCTCCCAG  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEE/EEEEAEEEEEEEEEAEEEEEEEEEEEEEAE6EE6EEE<<EEAEEEE/EAEEEEAEE<EEEA/E/A/<EAEEEEEAEEE<<66<<<<6<AAA/  NH:i:2  HI:i:1  AS:i:258        nM:i:5  BX:Z:TCCAAGTCGAACCTGT   BC:Z:TCCAAGTCGAACCTGT   UX:Z:GGCTCTCG   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GGCTCTCG
73 | NB502120:154:HVG7JBGXB:3:21601:21748:12734      163     1       14453   3       149M1S  =       14728   403     CTTAAGAACACTGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCCTCCC  AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEEEEEAEAEEEEEEEEEEEEEEEEE<E/EEEEEEEEEA<<EAEEEEAEEEEEEEEEE/AEE6EEEEEEEEEEEEE/E/AEEEAEEEEEAAEAEE<<E<A/AEAEEA<A<A/E<A  NH:i:2  HI:i:1  AS:i:273        nM:i:1  BX:Z:TCCAAGTCGAACCTGT   BC:Z:TCCAAGTCGAACCTGT   UX:Z:ACTTGGGT   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:ACTTGGGT
74 | NB502120:154:HVG7JBGXB:2:13107:5473:12860       163     1       14455   255     150M    =       14692   364     TAAGAACACTGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCCTCCCAA  A6AAAEEEEEEEEEAEEAEEEAEEEAEE6E6AEEEEEEEEEEEEEEEEEAEEEEE/EEEEEEEAA</6A<A/AEAAEEE6EEEEEEE6EEEAEEEA<AEEEEEE/<EAEE<AEAAEAAAAE<A/AA<AA/EEE/EEEE/<6<<<</<<//  NH:i:1  HI:i:1  AS:i:273        nM:i:1  BX:Z:CACCTAACCAGATTCG   BC:Z:CACCTAACCAGATTCG   UX:Z:GTATCGAC   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GTATCGAC
75 | NB502120:154:HVG7JBGXB:1:23309:15624:7432       163     1       14455   255     150M    =       14692   362     TAAGAACACTGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAGCACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCTTCCGCTCCATGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCCCCCCCTCCCAA  A<A66EEEEEEEE6A6/6E6EEAE/6EE/EEEEEA<EEE/E/EEE6EEE/EEEAEEAE/AEE/AAE//EE<E//<EE/A<AAEEEEAEEEA6/EEE6/<EA/E//A<EE/EEAEEE</<EE/EA/AEEAEE<EE//A//A6/6//AEA/E  NH:i:1  HI:i:1  AS:i:265        nM:i:4  BX:Z:TTAGGCCACCATCCAA   BC:Z:TTAGGCCACCATCCAA   UX:Z:GAGTTGAG   XS:Z:Assigned1  XN:i:1  XT:Z:ENSG00000227232    UB:Z:GAGTTGAG
76 | ```
77 | 
78 | ## Getting help
79 | If you have any questions or suggestions about our pipeline, feel free to contact us by email (ping.chen@ki.se, rickard.sandberg@ki.se).
80 | 
81 | 


--------------------------------------------------------------------------------
/ss3iso/isoform_reconstruction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sandberg-lab/Smart-seq3/5d5938475039f5c98d0d94faf89db917f66fe8ac/ss3iso/isoform_reconstruction.png


--------------------------------------------------------------------------------
/ss3iso/pyModule/informative_reads.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Developer: Ping Chen
  3 | # Contact: ping.chen@ki.se
  4 | # Date: 2020-01-10
  5 | # Version: 0.1.3
  6 | 
  7 | import re
  8 | import os
  9 | import subprocess
 10 | import pysam
 11 | import pandas as pd
 12 | from collections import defaultdict
 13 | import numpy as np
 14 | import multiprocessing as mp
 15 | from functools import partial
 16 | import pybedtools
 17 | import glob
 18 | import warnings
 19 | 
 20 | def get_exons(chrom, start, end, strand, gid, outdir):
 21 |     
 22 |     p = subprocess.Popen(['tabix', '%s/exon.sorted.gff.gz' %(outdir), '%s:%s-%s' %(chrom, start, end)], stdout=subprocess.PIPE)
 23 |     rcds = p.communicate()[0].decode("utf-8")
 24 |     
 25 |     rcds = [item for item in rcds.strip().split('\n') if re.search(gid, item)]
 26 |     
 27 |     tmpfile = '%s/.tempDir/_temp_%s.bed' %(outdir, gid)
 28 |     outF = open(tmpfile, "w")
 29 |     for rcd in rcds:
 30 |         items = rcd.split('\t')
 31 |         outF.write('\t'.join([items[0], str(int(items[3])-1), items[4], '.', '.', items[6]]))
 32 |         outF.write('\n')
 33 |     outF.close()
 34 |     
 35 |     os.system('sort -k1,1 -k2,2n %s | bedtools merge -s -d -1 -i - > %s/.tempDir/_temp_%s.merged.bed' %(tmpfile, outdir, gid))
 36 |     
 37 |     ordered_exons = pd.read_table('%s/.tempDir/_temp_%s.merged.bed' %(outdir, gid), header=None, index_col=None, sep="\t")
 38 |     
 39 |     return ordered_exons
 40 | 
 41 | class geneObj(object):
 42 |     
 43 |     def __init__(self, in_bam_uniq, in_bam_multi, outdir):
 44 |         
 45 |         self.strand_flags = {'+': [99, 147], '-': [83, 163]}
 46 |             
 47 |         self.gene = None
 48 |         self.exons = None
 49 |         self.ex_bed = None
 50 |         self.in_bam_uniq = in_bam_uniq
 51 |         self.in_bam_multi = in_bam_multi
 52 |         self.outdir = outdir
 53 |         self.chrom = None
 54 |         self.start = None
 55 |         self.end = None
 56 |         self.strand = None
 57 |         self.uniq_aligned_reads = None
 58 |         self.multi_aligned_reads = None
 59 |         self.uniq_r_bclist = None
 60 |         
 61 |     def get_exon_coordinates(self, gene):
 62 |         
 63 |         fds = gene.split('\t')
 64 |         gene_id = fds[-1].split(';')[0].split('=')[1]
 65 |         self.gene = gene_id
 66 |         
 67 |         exons = get_exons(fds[0], fds[3], fds[4], fds[6], gene_id, self.outdir)
 68 |         
 69 |         exon_idx = pd.DataFrame(list(range(1,exons.shape[0]+1)))
 70 |         exons = pd.concat([exons, exon_idx], axis=1)
 71 |         exons.to_csv('%s/.tempDir/_%s' %(self.outdir, self.gene), index=False, header=False, sep="\t")
 72 |         self.exons = exons
 73 |         self.ex_bed = pybedtools.BedTool('%s/.tempDir/_%s' %(self.outdir, self.gene))
 74 |         
 75 |         self.chrom = str(exons.iloc[0,0])
 76 |         self.start = np.min([exons.iloc[0,1], exons.iloc[-1,2]])
 77 |         self.end = np.max([exons.iloc[0,1], exons.iloc[-1,2]])
 78 |         self.strand = exons.iloc[0,3]
 79 |         
 80 |         return
 81 |     
 82 |     def get_aligned_reads(self, n_read_limit, passed_cells):
 83 |         
 84 |         samfile = pysam.AlignmentFile(self.in_bam_uniq, "rc")
 85 |         try:
 86 |             r_iterator = samfile.fetch(self.chrom, int(self.start), int(self.end))
 87 |         except:
 88 |             return None
 89 |         
 90 |         nreads = len([r_idx for r_idx, x in enumerate(r_iterator) if x.flag in self.strand_flags[self.strand]])
 91 |         if nreads > n_read_limit: return self.gene
 92 |               
 93 |         r_iterator = samfile.fetch(self.chrom, int(self.start), int(self.end))
 94 |         read_dict = {r_idx: _make_dict(x, self.chrom, self.strand, self.gene, r_idx) for r_idx, x in enumerate(r_iterator) if x.flag in self.strand_flags[self.strand] and list(filter(regx1.match, x.to_dict()['tags']))[0].replace('BC:Z:','') in passed_cells}
 95 |         samfile.close()
 96 |         
 97 |         df = [read_dict[r_idx]['r_blocks'] for r_idx in read_dict.keys()]
 98 |         
 99 |         if len(df) == 0: return None
100 |         pd.concat(df, axis=0).to_csv('%s/.tempDir/_%s_reads_blocks.bed' %(self.outdir, self.gene), index=False, sep="\t", header=False)
101 |         read_bed = pybedtools.BedTool('%s/.tempDir/_%s_reads_blocks.bed' %(self.outdir, self.gene))
102 |         
103 |         tmp = self.ex_bed.intersect(read_bed, wa=True, wb=True)
104 |         if os.stat(tmp.fn).st_size == 0:
105 |             return None
106 |         
107 |         intersect_all = tmp.to_dataframe()
108 |         read_idx_list = list(set(intersect_all.iloc[:,9].values))
109 |     
110 |         ex_coord = ','.join(self.exons.apply(lambda x: '%s-%s' %(x[1],x[2]), axis=1).values)
111 |         
112 |         aligned_reads = [_make_list_aligned_reads2(r_idx, read_dict, intersect_all, ex_coord) for r_idx in read_idx_list]
113 |         
114 |         colnames = ['name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar',
115 |                     'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags',
116 |                     'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB', 'exon_coordinates']
117 |         self.uniq_aligned_reads = pd.DataFrame(aligned_reads, columns=colnames).drop_duplicates()
118 |         self.uniq_r_bclist = list(set(self.uniq_aligned_reads.apply(lambda x: '%s+%s' %(x['BC'], x['UB']), axis=1).values))
119 |         self.uniq_aligned_reads.insert(19, 'MapFlag', 'unique')
120 |         
121 |         return None
122 |     
123 |     def get_aligned_reads_from_multi(self, passed_cells):
124 |         
125 |         samfile = pysam.AlignmentFile(self.in_bam_multi, "rc")
126 |         try:
127 |             r_iterator = samfile.fetch(self.chrom, int(self.start), int(self.end))
128 |         except:
129 |             return None
130 |         
131 |         read_dict = {r_idx: _make_dict2(x, self.chrom, self.strand, self.gene, self.uniq_r_bclist, r_idx) for r_idx, x in enumerate(r_iterator) if x.flag in self.strand_flags[self.strand] and list(filter(regx1.match, x.to_dict()['tags']))[0].replace('BC:Z:','') in passed_cells}
132 |         df = [read_dict[r_idx]['r_blocks'] for r_idx in read_dict.keys() if read_dict[r_idx] is not None]
133 |         
134 |         if len(df) == 0: return None
135 |         pd.concat(df, axis=0).to_csv('%s/.tempDir/_%s_reads_blocks.bed' %(self.outdir, self.gene), index=False, sep="\t", header=False)
136 |         read_bed = pybedtools.BedTool('%s/.tempDir/_%s_reads_blocks.bed' %(self.outdir, self.gene))
137 |        
138 |         tmp = self.ex_bed.intersect(read_bed, wa=True, wb=True)
139 |         if os.stat(tmp.fn).st_size == 0:
140 |             return None
141 |         
142 |         intersect_all = tmp.to_dataframe()
143 |         read_idx_list = list(set(intersect_all.iloc[:,9].values))
144 |         
145 |         ex_coord = ','.join(self.exons.apply(lambda x: '%s-%s' %(x[1],x[2]), axis=1).values)
146 |         aligned_reads = [_make_list_aligned_reads2(r_idx, read_dict, intersect_all, ex_coord) for r_idx in read_idx_list]
147 |         
148 |         colnames = ['name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar',
149 |                     'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags',
150 |                     'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB', 'exon_coordinates']
151 |         self.multi_aligned_reads = pd.DataFrame(aligned_reads, columns=colnames).drop_duplicates()
152 |         self.multi_aligned_reads.insert(19, 'MapFlag', 'multi')
153 |         
154 |         return None
155 | 
156 | def _initialize_make_list_aligned():
157 |     
158 |     global my_read_dict
159 |     global my_intersect_all
160 |     global my_ex_coord
161 |     
# Precompiled prefixes used with .match() to pick tag strings out of a read's
# 'tags' list: regx1 for tags starting with 'BC:Z:', regx2 for 'UB:Z:'.
regx1 = re.compile("BC:Z:")
regx2= re.compile("UB:Z:")
def get_aligned_reads_mp(obj, nproc, passed_cells):
    """Multiprocessing variant of collecting uniquely mapped reads for a gene.

    Fetches the reads of ``obj``'s locus from ``obj.in_bam_uniq``, converts
    them in a pool of ``nproc`` workers, intersects their blocks with the
    gene's exons and stores the result on ``obj`` (``uniq_aligned_reads``,
    ``uniq_r_bclist``).

    The module globals ``my_read_dict``, ``my_intersect_all`` and
    ``my_ex_coord`` are (re)assigned here before the second pool is created.
    NOTE(review): the second-stage worker function is defined elsewhere;
    presumably it reads these globals -- confirm.

    Returns ``obj`` in every case (unchanged when no reads are found or the
    locus cannot be fetched).
    """

    global my_read_dict
    global my_intersect_all
    global my_ex_coord

    my_intersect_all = None
    my_read_dict = None
    my_ex_coord = None

    samfile = pysam.AlignmentFile(obj.in_bam_uniq, "rc")
    # try/finally so the alignment file is closed on every exit path.
    try:
        try:
            r_iterator = samfile.fetch(obj.chrom, int(obj.start), int(obj.end))
        except Exception:
            # Best effort: a locus that cannot be fetched yields no reads.
            return obj

        # Plain list of [index, read-as-dict, blocks] records. Wrapping these
        # ragged rows in np.array() raises on recent NumPy releases, and the
        # pool only needs an iterable.
        rcds = [[r_idx, x.to_dict(), x.get_blocks()]
                for r_idx, x in enumerate(r_iterator)
                if x.flag in obj.strand_flags[obj.strand]
                and list(filter(regx1.match, x.to_dict()['tags']))[0].replace('BC:Z:', '') in passed_cells]
    finally:
        samfile.close()

    pool = mp.Pool(processes=nproc)
    func = partial(_make_dict_mp, obj.chrom, obj.strand, obj.gene)
    read_dict_list = pool.map(func, rcds, chunksize=1)
    pool.close()
    pool.join()

    # Merge the single-entry dicts returned by the workers.
    my_read_dict = {}
    for elemt in read_dict_list:
        my_read_dict.update(elemt)
    df = [rec['r_blocks'] for rec in my_read_dict.values()]

    if len(df) == 0: return obj
    bed_path = '%s/.tempDir/_%s_reads_blocks.bed' % (obj.outdir, obj.gene)
    pd.concat(df, axis=0).to_csv(bed_path, index=False, sep="\t", header=False)
    read_bed = pybedtools.BedTool(bed_path)

    tmp = obj.ex_bed.intersect(read_bed, wa=True, wb=True)
    if os.stat(tmp.fn).st_size == 0:
        return obj

    my_intersect_all = tmp.to_dataframe()
    # Column 9 of the intersection carries the read index ('rid').
    read_idx_list = list(set(my_intersect_all.iloc[:, 9].values))

    my_ex_coord = ','.join(obj.exons.apply(lambda ex: '%s-%s' % (ex[1], ex[2]), axis=1).values)

    # The globals above must be set before this pool is created.
    pool = mp.Pool(processes=nproc, initializer=_initialize_make_list_aligned)
    aligned_reads = pool.map(_make_list_aligned_reads_mp, read_idx_list, chunksize=1)
    pool.close()
    pool.join()

    colnames = ['name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar',
                'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags',
                'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB', 'exon_coordinates']
    obj.uniq_aligned_reads = pd.DataFrame(aligned_reads, columns=colnames).drop_duplicates()
    obj.uniq_r_bclist = list(set(obj.uniq_aligned_reads.apply(lambda row: '%s+%s' % (row['BC'], row['UB']), axis=1).values))
    obj.uniq_aligned_reads.insert(19, 'MapFlag', 'unique')

    return obj
216 | 
def get_aligned_reads_from_multi_mp(obj, nproc, passed_cells):
    """Multiprocessing variant of collecting multi-mapped reads for a gene.

    Fetches the reads of ``obj``'s locus from ``obj.in_bam_multi``, converts
    them in a pool of ``nproc`` workers, intersects their blocks with the
    gene's exons and stores the result in ``obj.multi_aligned_reads``.

    The module globals ``my_read_dict``, ``my_intersect_all``,
    ``my_ex_coord`` and ``my_uniq_r_bclist`` are (re)assigned here before the
    pools are created.
    NOTE(review): the worker functions are defined elsewhere; presumably they
    read these globals -- confirm.

    Returns ``obj`` in every case (unchanged when there is nothing to collect).
    """

    global my_read_dict
    global my_intersect_all
    global my_ex_coord
    global my_uniq_r_bclist

    my_intersect_all = None
    my_read_dict = None
    my_ex_coord = None

    # No unique-read keys were collected for this gene: nothing to match
    # against (previously this raised AttributeError on None.copy()).
    if obj.uniq_r_bclist is None:
        return obj
    my_uniq_r_bclist = obj.uniq_r_bclist.copy()

    samfile = pysam.AlignmentFile(obj.in_bam_multi, "rc")
    # try/finally so the alignment file is closed on every exit path.
    try:
        try:
            r_iterator = samfile.fetch(obj.chrom, int(obj.start), int(obj.end))
        except Exception:
            # Best effort: a locus that cannot be fetched yields no reads.
            return obj

        # Plain list of [index, read-as-dict, blocks] records. Wrapping these
        # ragged rows in np.array() raises on recent NumPy releases, and the
        # pool only needs an iterable.
        rcds = [[r_idx, x.to_dict(), x.get_blocks()]
                for r_idx, x in enumerate(r_iterator)
                if x.flag in obj.strand_flags[obj.strand]
                and list(filter(regx1.match, x.to_dict()['tags']))[0].replace('BC:Z:', '') in passed_cells]
    finally:
        samfile.close()

    pool = mp.Pool(processes=nproc)
    func = partial(_make_dict2_mp, obj.chrom, obj.strand, obj.gene)
    read_dict_list = pool.map(func, rcds, chunksize=1)
    pool.close()
    pool.join()

    # Merge the per-read dicts; workers return None for reads they reject.
    my_read_dict = {}
    for elemt in read_dict_list:
        if elemt is not None:
            my_read_dict.update(elemt)
    df = [rec['r_blocks'] for rec in my_read_dict.values()]

    if len(df) == 0: return obj
    bed_path = '%s/.tempDir/_%s_multi_reads_blocks.bed' % (obj.outdir, obj.gene)
    pd.concat(df, axis=0).to_csv(bed_path, index=False, sep="\t", header=False)
    read_bed = pybedtools.BedTool(bed_path)

    tmp = obj.ex_bed.intersect(read_bed, wa=True, wb=True)
    if os.stat(tmp.fn).st_size == 0:
        return obj

    my_intersect_all = tmp.to_dataframe()
    # Column 9 of the intersection carries the read index ('rid').
    read_idx_list = list(set(my_intersect_all.iloc[:, 9].values))

    my_ex_coord = ','.join(obj.exons.apply(lambda ex: '%s-%s' % (ex[1], ex[2]), axis=1).values)

    # The globals above must be set before this pool is created.
    pool = mp.Pool(processes=nproc, initializer=_initialize_make_list_aligned)
    aligned_reads = pool.map(_make_list_aligned_reads_mp, read_idx_list, chunksize=1)
    pool.close()
    pool.join()

    colnames = ['name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar',
                'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags',
                'read_mapped_position', 'geneid', 'Exon_Index', 'Category', 'BC', 'UB', 'exon_coordinates']

    obj.multi_aligned_reads = pd.DataFrame(aligned_reads, columns=colnames).drop_duplicates()
    obj.multi_aligned_reads.insert(19, 'MapFlag', 'multi')

    return obj
272 | 
def gtf2exon(gtf, outdir, include_spikein=False):
    """Extract the exon records of a GTF file into an indexed GFF.

    Writes ``<outdir>/exon.gff`` with one line per ``exon`` feature: the first
    eight GTF columns followed by an attribute column of the form
    ``loc=<chr>:<start>-<end>:<strand>;key=value;...``. Then shells out to
    build ``exon.sorted.gff.gz`` (sort + bgzip), its tabix index, and
    ``exon_merged.bed`` (bedtools merge).

    Parameters
    ----------
    gtf : path of the input GTF annotation.
    outdir : directory receiving all output files.
    include_spikein : when True, exon records on the ``diySpike`` contig keep
        only their first three attributes, reordered as 1st, 3rd, 2nd.

    Lines starting with ``#`` are skipped, as are lines with fewer than nine
    tab-separated columns (blank or truncated lines).
    """

    filename = '%s/exon.gff' % (outdir)

    with open(filename, "w") as outF, open(gtf, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fds = line.strip().split('\t')
            # Guard against blank/short lines, which would otherwise raise IndexError.
            if len(fds) < 9 or fds[2] != 'exon':
                continue

            # Turn 'key "value"; key "value";' into 'key=value;key=value'.
            annot = fds[8].replace(' "', '=').replace('"; ', ';').replace('";', '').replace('; ', ';').replace(' ', '=')
            if include_spikein and fds[0] in ['diySpike']:
                parts = annot.split(';')
                annot = ';'.join([parts[i] for i in [0, 2, 1]])

            annot = 'loc=%s:%s-%s:%s;%s' % (fds[0], fds[3], fds[4], fds[6], annot)
            outF.write('%s\t%s\n' % ('\t'.join(fds[:8]), annot))

    os.system('sort -k1,1 -k4,4n %s | bgzip > %s/exon.sorted.gff.gz' % (filename, outdir))
    os.system('tabix -p gff %s/exon.sorted.gff.gz' % (outdir))
    os.system('zless %s/exon.sorted.gff.gz | bedtools merge -i - -s -d -1 -c 1 -o count > %s/exon_merged.bed' % (outdir, outdir))

    return
310 | 
311 | def _make_dict(x, chrom, strand, gene, r_idx):
312 |    
313 |     print(r_idx)
314 |     curr = x.to_dict()
315 |     r_blocks  = pd.DataFrame(x.get_blocks(), columns=['start','end'])
316 |     r_blocks.insert(0,'chr', chrom)
317 |     r_blocks.insert(3,'strand', strand)
318 |     r_blocks.insert(4,'rid', r_idx)
319 |       
320 |     curr['r_blocks'] = r_blocks
321 |     curr['read_mapped_position'] = ','.join(r_blocks.apply(lambda x: '%s-%s' %(x[1],x[2]), axis=1).values)   # start: 0-based; end: 1-based
322 |     curr['geneid'] = gene
323 |     
324 |     return curr
325 | 
326 | def _make_dict_mp(chrom, strand, gene, rcd):
327 |    
328 |     r_idx, curr_dict, block = rcd
329 |    
330 |     print(r_idx)
331 |     
332 |     r_blocks  = pd.DataFrame(block, columns=['start','end'])
333 |     r_blocks.insert(0,'chr', chrom)
334 |     r_blocks.insert(3,'strand', strand)
335 |     r_blocks.insert(4,'rid', r_idx)
336 |       
337 |     curr_dict['r_blocks'] = r_blocks
338 |     curr_dict['read_mapped_position'] = ','.join(r_blocks.apply(lambda x: '%s-%s' %(x[1],x[2]), axis=1).values)   # start: 0-based; end: 1-based
339 |     curr_dict['geneid'] = gene
340 |     
341 |     return {r_idx: curr_dict}
342 | 
343 | def _make_dict2(x, chrom, strand, gene, uniq_r_bclist, r_idx):
344 |     
345 |     regx1 = re.compile("BC:Z:")
346 |     regx2= re.compile("UB:Z:")
347 |    
348 |     print(r_idx)
349 |     curr = x.to_dict()
350 |     
351 |     bc = list(filter(regx1.match, curr['tags']))[0].replace('BC:Z:','')
352 |     ub = list(filter(regx2.match, curr['tags']))[0].replace('UB:Z:','')
353 |     if '%s+%s' %(bc, ub) not in uniq_r_bclist: return None
354 |     
355 |     r_blocks  = pd.DataFrame(x.get_blocks(), columns=['start','end'])
356 |     r_blocks.insert(0,'chr', chrom)
357 |     r_blocks.insert(3,'strand', strand)
358 |     r_blocks.insert(4,'rid', r_idx)
359 |       
360 |     curr['r_blocks'] = r_blocks
361 |     curr['read_mapped_position'] = ','.join(r_blocks.apply(lambda x: '%s-%s' %(x[1],x[2]), axis=1).values)   # start: 0-based; end: 1-based
362 |     curr['geneid'] = gene
363 |     
364 |     return curr
365 | 
366 | def _make_list_aligned_reads2(r_idx, read_dict, intersect_all, ex_coord):
367 |     
368 |     regx1 = re.compile("BC:Z:")
369 |     regx2= re.compile("UB:Z:")
370 |     
371 |     exon_category_dict = {1: 'exon'}
372 |     
373 |     print(r_idx)
374 |     curr = read_dict[r_idx].copy()
375 |     
376 |     intersect = sorted(set(intersect_all.loc[intersect_all['blockCount']==r_idx,'score'].values))
377 |     n_intersect = len(intersect)
378 | 
379 |     curr['Exon_Index'] = ','.join(map(str,intersect))
380 |     curr['Category'] = exon_category_dict.get(n_intersect, 'junction')
381 |     
382 |     bc = list(filter(regx1.match, curr['tags']))[0]
383 |     ub = list(filter(regx2.match, curr['tags']))[0]
384 |     
385 |     curr['BC'] = bc.replace('BC:Z:','')
386 |     curr['UB'] = ub.replace('UB:Z:','')
387 |     curr['tags'] = ';'.join(curr['tags'])
388 |     curr['exon_coordinates'] = ex_coord
389 |     del curr['r_blocks']
390 |     
391 |     return list(pd.Series(curr).values)
392 | 
def _make_list_aligned_reads_mp(r_idx):
    """Worker variant of ``_make_list_aligned_reads2`` driven by module globals.

    Reads its inputs from the module-level names ``my_read_dict``,
    ``my_intersect_all`` and ``my_ex_coord`` (defined outside this block)
    and returns the flattened value list for read ``r_idx``.
    """
    bc_pattern = re.compile("BC:Z:")
    ub_pattern = re.compile("UB:Z:")

    print(r_idx)
    record = my_read_dict[r_idx].copy()

    belongs_to_read = my_intersect_all['blockCount'] == r_idx
    exon_hits = sorted(set(my_intersect_all.loc[belongs_to_read, 'score'].values))

    record['Exon_Index'] = ','.join(str(exon) for exon in exon_hits)
    if len(exon_hits) == 1:
        record['Category'] = 'exon'
    else:
        record['Category'] = 'junction'

    bc_tag = [tag for tag in record['tags'] if bc_pattern.match(tag)][0]
    ub_tag = [tag for tag in record['tags'] if ub_pattern.match(tag)][0]

    record['BC'] = bc_tag.replace('BC:Z:', '')
    record['UB'] = ub_tag.replace('UB:Z:', '')
    record['tags'] = ';'.join(record['tags'])
    record['exon_coordinates'] = my_ex_coord
    del record['r_blocks']

    return list(pd.Series(record).values)
418 | 
def gtf2gene(gtf, outdir, field):
    """Write ``<outdir>/gene.gff`` with one GFF-style record per gene.

    Parameters
    ----------
    gtf : path of the input GTF annotation.
    outdir : directory that receives ``gene.gff``.
    field : ``'gene'`` copies every ``gene`` feature of the GTF, rewriting
        the attribute column to ``key=value;key=value``.  Any other value
        builds the gene records from ``transcript`` features instead: per
        gene the smallest start, largest end, the chromosome/strand of its
        last transcript and the comma-joined transcript ids.  In that mode
        the first attribute is taken as the gene id and the second as the
        transcript id.

    Comment (``#``) and blank lines are ignored in both modes.
    """
    def _to_gff_attributes(raw):
        # 'gene_id "X"; gene_name "Y";'  ->  'gene_id=X;gene_name=Y'
        return raw.replace(' "', '=').replace('"; ', ';').replace('";', '').replace('; ', ';').replace(' ', '=')

    filename = '%s/gene.gff' % (outdir)
    gene_dict = {}

    # Context managers close both handles even if a malformed line raises.
    with open(filename, "w") as outF, open(gtf, 'r') as f:
        for line in f:
            # Header comments and blank lines have no feature column;
            # indexing fds[2] on them would raise IndexError.
            if line.startswith('#') or not line.strip():
                continue
            fds = line.strip().split('\t')

            if field == 'gene':
                if fds[2] == 'gene':
                    outF.write('%s\t%s\n' % ('\t'.join(fds[:8]), _to_gff_attributes(fds[8])))
                continue

            if fds[2] != 'transcript':
                continue
            annot = _to_gff_attributes(fds[8]).split(';')
            curr_gene = annot[0].split('=')[1]
            entry = gene_dict.setdefault(curr_gene, {'start': [], 'end': [], 'transcript': []})
            entry['chrom'] = fds[0]
            entry['start'].append(int(fds[3]))
            entry['end'].append(int(fds[4]))
            entry['strand'] = fds[6]
            entry['transcript'].append(annot[1].split('=')[1])

        # Only populated in transcript mode; genes keep first-seen order.
        for gene, entry in gene_dict.items():
            outF.write('%s\trefseq\tgene\t%s\t%s\t.\t%s\t.\tgene_id=%s;transcript_id=%s;gene_name=%s\n' % (
                entry['chrom'], min(entry['start']), max(entry['end']), entry['strand'],
                gene, ','.join(entry['transcript']), gene))

    return
455 | 
def _fetch_exonic_reads(outdir, in_bam):
    """Write the alignments of ``in_bam`` that overlap merged exons.

    Runs ``bedtools intersect`` against ``<outdir>/exon_merged.bed`` and
    stores the result as ``<outdir>/ex_<basename of in_bam>``, then indexes
    that file with ``samtools index``.  Both tools are launched through the
    shell and their exit status is not checked.
    """
    merged_exons = '%s/exon_merged.bed' % (outdir)
    exonic_bam = '%s/%s' % (outdir, 'ex_%s' % (os.path.basename(in_bam)))

    intersect_cmd = 'bedtools intersect -abam %s -b %s -wa -u > %s' % (in_bam, merged_exons, exonic_bam)
    index_cmd = 'samtools index %s' % (exonic_bam)
    for command in (intersect_cmd, index_cmd):
        os.system(command)

    return
464 | 
def _get_reads(in_bam_uniq, in_bam_multi, outdir, chrom, nproc, n_read_limit, passed_cells, mRds, gene):
    """Collect the aligned reads of one gene and store them as a table.

    A ``geneObj`` is built for ``gene``; if
    ``<outdir>/keptReads/<chrom>/<gene>_aligned_reads.csv`` already exists
    the gene is skipped.  With ``nproc < 2`` the reads are fetched through
    ``gobj.get_aligned_reads``; a non-``None`` result of that call is
    returned to the caller unchanged.  Otherwise ``get_aligned_reads_mp``
    is used.  When ``mRds`` is true the multi-mapped reads are fetched as
    well and appended.  Progress messages are appended to
    ``<outdir>/keptReads/<chrom>/_log`` and the gene's files in
    ``<outdir>/.tempDir`` are removed.

    Returns ``None`` in every case except the pass-through described above.
    """
    gobj = geneObj(in_bam_uniq, in_bam_multi, outdir)
    gobj.get_exon_coordinates(gene)

    if os.path.exists('%s/keptReads/%s/%s_aligned_reads.csv' %(outdir, chrom, gobj.gene)):
        os.system('echo "Gene %s exists in output directory...Skip..." >> %s/keptReads/%s/_log' %(gobj.gene, outdir, chrom))
        return
    os.system('echo "Start gene %s..." >> %s/keptReads/%s/_log' %(gobj.gene, outdir, chrom))

    single_process = nproc < 2

    if single_process:
        report_gene = gobj.get_aligned_reads(n_read_limit, passed_cells)
        if report_gene is not None:
            return report_gene
    else:
        gobj = get_aligned_reads_mp(gobj, nproc, passed_cells)

    if gobj.uniq_aligned_reads is None:
        return None

    if mRds:
        if single_process:
            gobj.get_aligned_reads_from_multi(passed_cells)
        else:
            gobj = get_aligned_reads_from_multi_mp(gobj, nproc, passed_cells)

    # Only look at the multi-mapped reads when they were requested.
    multi_reads = gobj.multi_aligned_reads if mRds else None
    if multi_reads is None:
        aligned = gobj.uniq_aligned_reads
    else:
        aligned = pd.concat([gobj.uniq_aligned_reads, multi_reads], axis=0)

    os.system('echo "%s has %s aligned reads..." >> %s/keptReads/%s/_log' %(gobj.gene, aligned.shape[0], outdir, chrom))

    cleanup = subprocess.Popen('rm %s/.tempDir/_%s*' %(outdir, gobj.gene), shell=True)
    cleanup.communicate()

    if aligned.shape[0] > 0:
        aligned.to_csv('%s/keptReads/%s/%s_aligned_reads.csv' %(outdir, chrom, gobj.gene), sep="\t", index=False, header=False)
    return None
507 | 
508 | 
def fetch_gene_reads(in_bam_uniq, in_bam_multi, conf, species, outdir, spikein=False):
    """Extract, gene by gene, the reads kept for isoform reconstruction.

    Steps, all operating inside ``outdir``:

    1. build the exon and gene annotation files with ``gtf2exon`` and
       ``gtf2gene`` from the GTF named in ``conf['annotation']``;
    2. restrict both BAM files to exonic reads (``_fetch_exonic_reads``);
    3. for every chromosome listed in ``gene.gff`` run ``_get_reads`` for
       each of its genes in a worker pool of ``conf['expression']['nproc']``
       processes, each worker running single-process;
    4. genes reported back by the workers are processed again in the parent
       with ``nproc`` processes for that single gene.

    Parameters
    ----------
    in_bam_uniq, in_bam_multi : paths of the BAM files.
    conf : nested mapping; reads ``annotation`` (``gtf_source``,
        ``<species>_<gtf_source>_gtf``, ``zumi_keptbarcode``) and
        ``expression`` (``nproc``, ``n_read_limit``).
    species : used to select the GTF entry in ``conf``.
    outdir : working/output directory.
    spikein : forwarded to ``gtf2exon``.
    """
    gtf_source = conf['annotation']['gtf_source']
    gtf = conf['annotation']['%s_%s_gtf' %(species, gtf_source)]
    field = 'transcript' if gtf_source == 'refseq' else 'gene'
    gtf2exon(gtf, outdir, spikein)
    gtf2gene(gtf, outdir, field)

    # Filter both BAM files in parallel; release the workers afterwards
    # instead of leaving the pool open for the rest of the run.
    pool = mp.Pool(2)
    try:
        pool.map(partial(_fetch_exonic_reads, outdir), [in_bam_uniq, in_bam_multi], chunksize=1)
    finally:
        pool.close()
        pool.join()
    in_bam_uniq = '%s/ex_%s' %(outdir, os.path.basename(in_bam_uniq))
    in_bam_multi = '%s/ex_%s' %(outdir, os.path.basename(in_bam_multi))

    cells_to_use = list(pd.read_table(conf['annotation']['zumi_keptbarcode'], header=None, index_col=None, sep=",").iloc[:,0].values)

    nproc = int(conf['expression']['nproc'])
    n_read_limit = int(conf['expression']['n_read_limit'])

    cmd = 'cut -f1 %s/gene.gff | sort | uniq' %(outdir)
    p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    rcds = p.communicate()[0].decode("utf-8")
    # Drop empty entries so an empty gene.gff does not yield a '' chromosome.
    chrom_list = [item for item in rcds.strip().split('\n') if item]

    if not os.path.exists('%s/.tempDir' %outdir): os.makedirs('%s/.tempDir' %outdir)
    if not os.path.exists('%s/keptReads' %outdir): os.makedirs('%s/keptReads' %outdir)

    for chrom in chrom_list:

        print('...for genes on %s' %(chrom))
        if not os.path.exists('%s/keptReads/%s' %(outdir,chrom)): os.makedirs('%s/keptReads/%s' %(outdir,chrom))
        os.system('> %s/keptReads/%s/_log' %(outdir,chrom))

        os.system('echo "*** genes on %s ***" >> %s/keptReads/%s/_log' %(chrom, outdir, chrom))
        cmd = 'grep ^%s[[:space:]] %s/gene.gff' %(chrom, outdir)
        p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        rcds = p.communicate()[0].decode("utf-8")

        # One gene.gff record per gene; ignore the '' left by empty output.
        genes = [item for item in rcds.strip().split('\n') if item]

        # Each worker handles whole genes single-process (nproc=1).
        pool = mp.Pool(processes=nproc)
        func = partial(_get_reads, in_bam_uniq, in_bam_multi, outdir, chrom, 1, n_read_limit, cells_to_use, False)
        try:
            report_genes = pool.map(func, genes, chunksize=1)
        finally:
            pool.close()
            pool.join()

        # Genes handed back by the workers are redone here with nproc processes.
        for gname in filter(None, report_genes):
            print(gname)
            # Match the gene name literally; it is an identifier, not a regex.
            gene = [gg for gg in genes if re.search(re.escape(gname), gg)][0]
            _get_reads(in_bam_uniq, in_bam_multi, outdir, chrom, nproc, n_read_limit, cells_to_use, False, gene)

        p = subprocess.Popen('rm -rf %s/.tmp' %(outdir), shell=True)
        p.communicate()

        if not os.path.exists('%s/.tmp' %outdir): os.makedirs('%s/.tmp' %outdir)

    return


--------------------------------------------------------------------------------
/ss3iso/pyModule/isoform_reconstruct.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Developer: Ping Chen
  3 | # Contact: ping.chen@ki.se
  4 | # Date: 2020-01-10
  5 | # Version: 0.1.3
  6 | 
  7 | import re
  8 | import os
  9 | import subprocess
 10 | import pysam
 11 | import pandas as pd
 12 | from collections import defaultdict
 13 | import numpy as np
 14 | import multiprocessing as mp
 15 | from functools import partial
 16 | import pybedtools
 17 | import glob
 18 | import warnings
 19 | 
 20 | 
 21 | def convert_ref_to_dict(ref):
 22 |     
 23 |     ref_dict = defaultdict(dict)
 24 |     for i in ref.index:
 25 |         bool_array = np.zeros(int(ref.iloc[i]['Total_n_exons']))
 26 |         ex_idx = [int(ii)-1 for ii in ref.iloc[i]['Exon_Index'].split(',')]
 27 |         bool_array[ex_idx] = 1
 28 |         
 29 |         if 'Transcripts' not in ref_dict[ref.iloc[i]['Gene']].keys():
 30 |             ref_dict[ref.iloc[i]['Gene']]['Transcripts'] = defaultdict(dict)
 31 |             ref_dict[ref.iloc[i]['Gene']]['Total_n_exons'] = ''
 32 |         
 33 |         ref_dict[ref.iloc[i]['Gene']]['Transcripts'][ref.iloc[i]['Transcript']] = {'Exon_bool_array': bool_array, 'Exon_Loc': ref.iloc[i]['Exon_Loc'],
 34 |                                                                                   'Junction': ref.iloc[i]['Junction']}
 35 |         ref_dict[ref.iloc[i]['Gene']]['Total_n_exons'] = int(ref.iloc[i]['Total_n_exons'])
 36 |         ref_dict[ref.iloc[i]['Gene']]['chrom'] = ref.iloc[i]['chrom']
 37 |         
 38 |     return ref_dict
 39 | 
 40 | def get_overlapping_genes(gff_merged):
 41 |     
 42 |     overlaped_df = gff_merged.loc[gff_merged[8]!=gff_merged[17]]
 43 |     aa = overlaped_df[8].str.split(';', expand=True)[0].replace('gene_id=','',regex=True).to_list()
 44 |     bb = overlaped_df[17].str.split(';', expand=True)[0].replace('gene_id=','',regex=True).to_list()
 45 |     df = pd.DataFrame([aa,bb],index=['A','B']).T
 46 |     
 47 |     overlaped_gene_dict = df.groupby(by='A').apply(lambda x: x['B'].to_list()).to_dict()
 48 |     
 49 |     return overlaped_gene_dict
 50 | 
def _filter_reads_from_other_gene(raw_aligned, gene_neighbors, indir, gene):
    """Keep the reads of ``gene`` that overlap it at least as much as any neighbour.

    Parameters
    ----------
    raw_aligned : DataFrame of aligned reads; column 0 is used as the read
        name, column 1 as the flag and column 12 as a comma-separated list
        of ``start-end`` regions.
    gene_neighbors : list of gene ids overlapping ``gene``.
    indir : directory whose ``.tmp`` sub-directory receives the two
        intermediate BED files.
    gene : the gene whose reads are being filtered.

    Returns
    -------
    DataFrame
        The rows of ``raw_aligned`` whose read name has ``gene`` among the
        genes with the largest summed overlap.

    Notes
    -----
    The exon table ``ref_iso`` is read from module scope; it is not a
    parameter of this function.
    """
    
    # Work on a new list so the caller's neighbour list is not modified.
    gene_neighbors = gene_neighbors + [gene]
    
    # One row per exon of every transcript of the gene and its neighbours.
    gneighbor = ref_iso.loc[ref_iso['Gene'].isin(gene_neighbors)][['Transcript','Gene','Exon_Loc']]
    gneighbor['Exon_Loc'] = gneighbor['Exon_Loc'].replace(';',',',regex=True)
    gneighbor = gneighbor.assign(Exon_Loc=gneighbor.Exon_Loc.str.split(',')).explode('Exon_Loc')
    
    # Both BED tables get the same placeholder chromosome 'chr1';
    # presumably so that only the coordinates decide the overlap -- TODO confirm.
    gneighbor_bed = gneighbor.Exon_Loc.str.split('-', expand=True)
    gneighbor_bed.insert(0,'chrom','chr1')
    gneighbor_bed.insert(3,'Transcript',gneighbor['Transcript'])
    gneighbor_bed.insert(4,'Gene',gneighbor['Gene'])
    # NOTE(review): to_csv writes a header row into the BED file here -- confirm bedtools tolerates it.
    gneighbor_bed.to_csv('%s/.tmp/_%s_neighbors.bed' %(indir, gene), sep="\t", index=False)
    gneighbor_bed_obj = pybedtools.BedTool('%s/.tmp/_%s_neighbors.bed' %(indir, gene))
    
    # One row per mapped block of every read.
    aligned_info = raw_aligned.iloc[:,[0,1,12]]
    aligned_info.columns = ['Read','Flag','Region']
    aligned_info = aligned_info.assign(Region=aligned_info.Region.str.split(',')).explode('Region')
    aligned_info_bed = aligned_info.Region.str.split('-', expand=True)
    aligned_info_bed.insert(0,'chrom','chr1')
    aligned_info_bed.insert(3,'Read',aligned_info['Read'])
    aligned_info_bed.insert(4,'Flag',aligned_info['Flag'])
    aligned_info_bed.to_csv('%s/.tmp/_%s_raw_aligned.bed' %(indir, gene), sep="\t", index=False)
    aligned_info_bed_obj = pybedtools.BedTool('%s/.tmp/_%s_raw_aligned.bed' %(indir, gene))
    
    # Read blocks x neighbour exons, with the overlap length appended (wo=True).
    tmp = aligned_info_bed_obj.intersect(gneighbor_bed_obj, wo=True)
    my_intersect_all = tmp.to_dataframe().drop_duplicates()
    # Given the column order written above, the default pybedtools column names
    # 'name' / 'itemRgb' / 'blockCount' / 'blockSizes' appear to hold
    # read / transcript / gene / overlap length -- TODO confirm.
    read_overlaps = my_intersect_all.groupby(by=['name','itemRgb','blockCount']).apply(lambda x: np.sum(x['blockSizes']))
    read_overlaps = read_overlaps.reset_index()
    read_overlaps.columns = list(read_overlaps.columns)[:-1] + ['Len']
    
    # Per read, keep the row(s) with the largest summed overlap.
    # NOTE(review): the 'name' lookup below relies on groupby.apply keeping the grouping column -- verify with the pandas version in use.
    max_frag_overlap = read_overlaps.groupby(by='name').apply(lambda x: x.loc[x['Len']==x['Len'].max()])
    
    kept_read_names = list(set(max_frag_overlap.loc[max_frag_overlap['blockCount']==gene]['name'].values))
    kept_reads = raw_aligned.loc[raw_aligned[0].isin(kept_read_names)]
    
    return kept_reads
 88 | 
 89 | def correct_bool_array(bool_array, junc_list):
 90 |     
 91 |     multi_ex_juncs = [junc for junc in junc_list if len(junc.split('^'))>2]
 92 |     junc_list = [junc for junc in junc_list if junc not in multi_ex_juncs]
 93 |     
 94 |     if len(junc_list) > 0:
 95 |         junc_df = pd.DataFrame(junc_list, columns=['Start']).Start.str.split('^',expand=True)
 96 |         junc_df.columns = ['Start','End']
 97 |         ambig_junc_same_start = junc_df.groupby(by="Start").apply(lambda x: x.apply(lambda y: '%s^%s' %(y[0],y[1]), axis=1).to_list())
 98 |         ambig_juncs1 = ambig_junc_same_start.loc[ambig_junc_same_start.apply(lambda x: len(x)) > 1].to_list()
 99 |         ambig_juncs1 = sum(ambig_juncs1,[])
100 |         ambig_junc_same_stop = junc_df.groupby(by="End").apply(lambda x: x.apply(lambda y: '%s^%s' %(y[0],y[1]), axis=1).to_list())
101 |         ambig_juncs2 = ambig_junc_same_stop.loc[ambig_junc_same_stop.apply(lambda x: len(x)) > 1].to_list()
102 |         ambig_juncs2 = sum(ambig_juncs2,[])
103 |         ambig = list(set(ambig_juncs1 + ambig_juncs2))
104 |         junc_list = list(set(junc_list) - set(ambig))
105 |         
106 |     junc_list = junc_list + multi_ex_juncs
107 |     junc_ex_idx = [list(map(int,junc.split('^'))) for junc in junc_list]
108 |     skipped_exon_idx = [list(set(range(np.min(ex_idx),np.max(ex_idx)+1)) - set(ex_idx)) for ex_idx in junc_ex_idx]
109 |     skipped_exon_idx = list(set(sum(skipped_exon_idx,[])))
110 |     
111 |     flag = np.isnan(bool_array)
112 |     skipped_exon_idx = list(set(np.array(range(len(bool_array)))[flag]+1) & set(skipped_exon_idx))
113 |     
114 |     if len(skipped_exon_idx) > 0:
115 |         bool_array[np.array(skipped_exon_idx) - 1] = 0
116 |     
117 |     return bool_array
118 | 
119 | def _infer_isoform(exon_bool_array, ref):
120 |     
121 |     scores = []
122 |     for trans in ref['Transcripts'].keys():
123 |         curr_ref_iso_bool = ref['Transcripts'][trans]['Exon_bool_array']
124 |         flag = exon_bool_array == curr_ref_iso_bool
125 |         scores.append(len(flag[flag]))
126 |         
127 |     max_score = np.max(scores)
128 |     infered_isoforms = np.array(list(ref['Transcripts'].keys()))[np.where(scores==max_score)]
129 |     n_infered = len(infered_isoforms)
130 |     
131 |     infered_ref_transcript_bool_string = [''.join(list(map(str,map(int,ref['Transcripts'][iso]['Exon_bool_array'])))) for iso in infered_isoforms]
132 |     
133 |     return {'max_score': max_score, 'infered_transcripts': ','.join(infered_isoforms), 'n_infered_transcripts': n_infered,
134 |             'total_n_ref_transcripts': len(ref['Transcripts'].keys()), 'infered_ref_transcript_bool_string': ','.join(infered_ref_transcript_bool_string)}
135 | 
def _isoform_inference_of_single_molec(aligned_reads_df, ref):
    """Infer the isoform(s) of one molecule from all of its reads.

    Parameters
    ----------
    aligned_reads_df : DataFrame with the reads of a single barcode/UMI
        group.  Columns are addressed by number: 0 read name, 12 mapped
        regions, 13 (copied to the output), 14 exon indices (a single
        number, or several joined by ``,`` for a junction read), 16 and 17
        (copied to the output; presumably cell barcode and UMI -- confirm
        against the table writer).
    ref : reference entry of the gene with ``Total_n_exons`` and
        ``Transcripts``.

    Returns
    -------
    list
        ``[col 16, col 17, n_fragments, exon indices, junctions,
        read coordinates, col 13, Total_n_exons, n reference transcripts,
        inferred transcripts, n inferred transcripts, their 0/1 strings,
        observed exon string (N = unknown), best score]``,
        or ``[]`` when no read is left after junction filtering.

    Junction handling: junctions over more than two exons are broken into
    consecutive pairs.  When several junctions share their first exon, only
    those whose second exon is also covered by an exon-only read are kept;
    the same is then done for junctions sharing their second exon.  Reads
    matching a discarded junction are dropped.
    """

    def _split_by_shared_exon(juncs, side):
        # Group junctions by their exon at position ``side`` (0 = first,
        # 1 = second).  Groups are ordered by key like a sorted groupby.
        groups = defaultdict(list)
        for junc in juncs:
            groups[junc.split('^')[side]].append(junc)
        ordered = [groups[key] for key in sorted(groups)]
        shared = [grp for grp in ordered if len(grp) > 1]
        unique = [grp for grp in ordered if len(grp) == 1]
        return shared, unique

    mapped_ex_junc = list(set(aligned_reads_df[14].values))
    exon_idx_list = [str(ii) for ii in mapped_ex_junc if not re.search(',', str(ii))]

    junc_list = [ii.replace(',','^') for ii in mapped_ex_junc if re.search(',', str(ii))]
    multi_ex_juncs = [junc for junc in junc_list if len(junc.split('^'))>2]
    multi_ex_juncs_nn = [['^'.join(junc.split('^')[idx:(idx+2)]) for idx in range(len(junc.split('^'))-1)] for junc in multi_ex_juncs]
    multi_ex_juncs_nn = sum(multi_ex_juncs_nn, [])
    junc_list = [junc for junc in junc_list if junc not in multi_ex_juncs]
    junc_list = list(set(junc_list + multi_ex_juncs_nn))

    # First resolve junctions sharing their first exon, then those sharing
    # their second exon.  Plain grouping is used instead of groupby/apply,
    # which indexed rows by position and needed the grouping column inside
    # each group.
    rm_juncs = set()
    for side in (0, 1):
        partner = 1 - side
        shared, unique = _split_by_shared_exon(junc_list, side)
        supported = []
        for grp in shared:
            for junc in grp:
                if junc.split('^')[partner] in exon_idx_list:
                    supported.append(junc)
                else:
                    rm_juncs.add(junc)
        junc_list = supported + [junc for grp in unique for junc in grp]

    rm_junc_list = [junc.replace('^',',') for junc in rm_juncs]
    if len(rm_junc_list)>0:
        # Drop every read whose exon-index string starts with a discarded junction.
        flag_df = pd.concat([pd.DataFrame(aligned_reads_df[14].str.match(junc,na=False)) for junc in rm_junc_list], axis=1)
        aligned_reads_df = aligned_reads_df.loc[flag_df.sum(axis=1) == 0]

    ex_from_junc = [junc.split('^') for junc in junc_list]
    ex_from_junc = list(set(sum(ex_from_junc,[])))

    exon_idx_list = list(set(exon_idx_list + ex_from_junc))
    exon_idx_list = list(map(int, exon_idx_list))
    exon_idx_list.sort()

    # Regions of one read are joined by '|', reads by ';'.
    read_coord = aligned_reads_df.groupby(by=0).apply(lambda x: '|'.join(x[12]))
    n_fragment = read_coord.shape[0]
    read_coord_list = ';'.join((list(read_coord.values)))

    # NaN = no evidence, 1 = exon observed, 0 = exon skipped by a junction.
    exon_bool_array = np.zeros(int(ref['Total_n_exons']))
    exon_bool_array[:] = np.nan
    exon_bool_array[[ii-1 for ii in exon_idx_list]] = 1
    if len(junc_list)>0:
        exon_bool_array = correct_bool_array(exon_bool_array, junc_list)

    infered = _infer_isoform(exon_bool_array, ref)
    exon_bool_string = ''.join(['N' if np.isnan(bb) else str(int(bb)) for bb in exon_bool_array])

    if aligned_reads_df.shape[0] == 0: return []
    out = [aligned_reads_df[16].iloc[0], aligned_reads_df[17].iloc[0],
           n_fragment, ','.join(map(str,exon_idx_list)), ','.join(junc_list),
           read_coord_list, aligned_reads_df[13].iloc[0], ref['Total_n_exons'], infered['total_n_ref_transcripts'],
           infered['infered_transcripts'],
           infered['n_infered_transcripts'], infered['infered_ref_transcript_bool_string'],
           exon_bool_string, infered['max_score']]

    return out
205 | 
def _run_isoform(indir, outdir, ref_iso_dict, kept_cell_BCs, conf, overlaped_gene_dict, gene):
    """Infer isoforms for every molecule of ``gene`` and write them to disk.

    Reads ``<indir>/keptReads/<chrom>/<gene>_aligned_reads.csv``, removes
    reads better explained by an overlapping gene (when ``gene`` has
    entries in ``overlaped_gene_dict``), groups the reads by
    ``<column 16>+<column 17>`` and runs
    ``_isoform_inference_of_single_molec`` on each group.  The result
    table goes to ``<outdir>/<chrom>/<gene>``.  Nothing is done when that
    file already exists or no reads remain.

    ``kept_cell_BCs`` and ``conf`` are accepted but not used here.
    """
    print(gene)

    gene_chrom = ref_iso_dict[gene]['chrom']
    if os.path.exists('%s/%s/%s' %(outdir, gene_chrom, gene)):
        return

    raw_aligned_reads = pd.read_table('%s/keptReads/%s/%s_aligned_reads.csv' %(indir, ref_iso_dict[gene]['chrom'], gene), header=None, index_col=None, sep="\t")

    aligned_reads = raw_aligned_reads
    if gene in overlaped_gene_dict.keys():
        aligned_reads = _filter_reads_from_other_gene(raw_aligned_reads, overlaped_gene_dict[gene], indir, gene)

    if aligned_reads.shape[0] == 0:
        return

    # The output directory is named after the chromosome of the first read.
    outdir = '%s/%s' %(outdir, aligned_reads.iloc[0,2])
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    molecule_key = aligned_reads.apply(lambda row: '%s+%s' %(row[16],row[17]), axis=1)
    molecule_key.name = 'BC_UB'
    keyed_reads = pd.concat([aligned_reads, molecule_key], axis=1)

    per_molecule = keyed_reads.groupby(by='BC_UB').apply(_isoform_inference_of_single_molec, ref_iso_dict[gene])
    table = pd.DataFrame(list(per_molecule.values)).dropna()
    table.to_csv('%s/%s' %(outdir, gene), sep="\t", index=False, header=False)

    return
232 | 
233 | 
def get_junction(ass, trans_df):
    """Collect the junctions around exons whose start or end differs between transcripts.

    Parameters
    ----------
    ass : DataFrame with (at least) the columns ``Exon_Idx``, ``flag``,
        ``Transcripts`` and ``coordinates``; ``coordinates`` may hold
        several ``start-end`` values joined by ``;`` and ``Transcripts``
        is split on ``,`` into a transcript and an exon index.
    trans_df : per-transcript exon table, forwarded to
        ``_get_junc_start`` / ``_get_junc_end``.

    Returns
    -------
    dict
        ``{'ass_start': DataFrame or None, 'ass_end': DataFrame or None}``;
        a value stays ``None`` when no exon qualifies on that side.
    """
    
    # Expand ';'-separated coordinates into one row per coordinate.
    tt = pd.DataFrame(ass.coordinates.str.split(';').to_list(), index=pd.MultiIndex.from_frame(ass[['Exon_Idx','flag','Transcripts']])).stack()
    tt = tt.reset_index()
    # 'rm' is the position level created by stack(); it is dropped on the next line.
    tt.columns = ['Exon_Idx','flag','Transcripts','rm','coordinates']
    ass = tt[ass.columns]
    
    curr_ass = pd.concat([pd.DataFrame(ass.coordinates.str.split('-').tolist()), pd.DataFrame(ass.Transcripts.str.split(',').tolist())], axis=1)
    curr_ass.columns = ['start','end','transcript','exon_idx']
    
    # Number of distinct start / end coordinates per exon index.
    ass_start = curr_ass.groupby(by="exon_idx").apply(lambda x: len(set(x['start'].values)))
    ass_end = curr_ass.groupby(by="exon_idx").apply(lambda x: len(set(x['end'].values)))
    
    # Exons with more than one distinct start (resp. end).
    ass_start_exid = list(ass_start[ass_start>1].index)
    ass_end_exid = list(ass_end[ass_end>1].index)
    
    max_n_exons = trans_df['Exon_Idx'].max()
    
    # Exon '1' is not considered on the start side, the highest exon index not on the end side.
    ass_start_exid = [eid for eid in ass_start_exid if eid!='1']
    ass_end_exid = [eid for eid in ass_end_exid if eid!=str(max_n_exons)]
    
    ass_start_junc = None
    ass_end_junc = None
    
    if len(ass_start_exid) > 0:
        tmp = curr_ass.loc[curr_ass["exon_idx"].isin(ass_start_exid)]
        ass_start_junc = tmp.apply(_get_junc_start, axis=1, trans_df=trans_df)
        # Rows for which the helper returned nothing are discarded.
        ass_start_junc = pd.DataFrame(list(ass_start_junc[~ass_start_junc.isnull()].values))
    
    if len(ass_end_exid) > 0:
        tmp = curr_ass.loc[curr_ass["exon_idx"].isin(ass_end_exid)]
        ass_end_junc = tmp.apply(_get_junc_end, axis=1, trans_df=trans_df)
        # Rows for which the helper returned nothing are discarded.
        ass_end_junc = pd.DataFrame(list(ass_end_junc[~ass_end_junc.isnull()].values))
    
    return {'ass_start': ass_start_junc, 'ass_end': ass_end_junc}
269 | 
270 | def _get_junc_start(x, trans_df):
271 |     
272 |     row_idx = list(trans_df.query('Exon_Idx=="%s" and Transcripts=="%s"' %(x[3], x[2])).index)[0]
273 |     min_ex_idx = trans_df.query('Transcripts=="%s"' %(x[2]))['Exon_Idx'].min()
274 |     if int(x['exon_idx'])==min_ex_idx: return
275 |     
276 |     junc_ex_pos = trans_df.loc[row_idx-1]['coordinates'].split('-')[1]
277 |     junc_idx = '%s^%s' %(trans_df.loc[row_idx-1]['Exon_Idx'], x[3])
278 |     
279 |     return [x[2], x[3], junc_idx, '%s,%s' %(junc_ex_pos, x[0])]
280 | 
281 | def _get_junc_end(x, trans_df):
282 |     
283 |     row_idx = list(trans_df.query('Exon_Idx=="%s" and Transcripts=="%s"' %(x[3], x[2])).index)[0]
284 |     max_ex_idx = trans_df.query('Transcripts=="%s"' %(x[2]))['Exon_Idx'].max()
285 |     if int(x['exon_idx'])==max_ex_idx: return
286 |     
287 |     junc_ex_pos = trans_df.loc[row_idx+1]['coordinates'].split('-')[0]
288 |     junc_idx = '%s^%s' %(x[3],trans_df.loc[row_idx+1]['Exon_Idx'])
289 |     
290 |     return [x[2], x[3], junc_idx, '%s,%s' %(x[1], junc_ex_pos)]
291 | 
def isoform_inference_correction_by_ass_v2(expr_indir, ref, outdir, gene_file):
    """Refine ambiguous isoform assignments of one gene.

    Parameters
    ----------
    expr_indir : not used in this function.
    ref : mapping ``gene -> {'Transcripts': {...}}`` as produced by
        ``convert_ref_to_dict``.
    outdir : base output directory; the result is written to
        ``<outdir>/<chrom>/<gene>``.
    gene_file : path of the per-gene inference table; its last two path
        components are taken as chromosome and gene.

    The input table is read without header.  The column numbers appear to
    follow the list returned by ``_isoform_inference_of_single_molec``
    (8 = number of reference transcripts, 9 = inferred transcripts,
    10 = number of inferred transcripts) -- TODO confirm.  Rows with more
    than one inferred transcript are re-evaluated; whenever re-evaluation
    is not possible the input is written out unchanged.  Only the columns
    0, 1, 3, 5, 9, 10 and 12 are written.

    Returns ``None``; an empty ``gene_file`` produces no output file.
    """
    
    chrom, gene = gene_file.split('/')[-2:]
    print(gene)
    
    outdir = '%s/%s' %(outdir, chrom)
    if not os.path.exists(outdir): os.makedirs(outdir)
    
    if os.stat(gene_file).st_size == 0: return
    initial_infered = pd.read_table(gene_file, header=None, index_col=None, sep="\t")
    # Rows are keyed by '<column 0>_<column 1>'.
    initial_infered.index = initial_infered.apply(lambda x: '%s_%s' %(x[0], x[1]), axis=1)
    infered_to_correct = initial_infered.loc[initial_infered[10]>1]
    # Nothing to refine: column 8 of the first row is 1, or no row is ambiguous.
    if initial_infered.iloc[0,8]==1 or infered_to_correct.shape[0]==0:
        # NOTE(review): column 14 is filled here but is not among the columns written below.
        initial_infered[14] = ['no' for ii in range(initial_infered.shape[0])]
        initial_infered[[0,1,3,5,9,10,12]].to_csv('%s/%s' %(outdir, gene), sep="\t", index=False, header=False)
        return
    
    # Build one table with a row per exon of every reference transcript.
    trans_list = []
    for trans in ref[gene]['Transcripts'].keys():
        exon_bool = list(ref[gene]['Transcripts'][trans]['Exon_bool_array'])
        exon_idx = [ii+1 for ii in range(len(exon_bool))]
        
        exon_idx = pd.DataFrame([exon_idx, exon_bool], index=['Exon_Idx','flag']).T.query('flag==1')
        exon_idx['Exon_Idx'] = exon_idx['Exon_Idx'].astype(int)
        exon_idx.index = range(exon_idx.shape[0])
        exon_idx = pd.concat([exon_idx, pd.DataFrame(ref[gene]['Transcripts'][trans]['Exon_Loc'].split(','), columns=['coordinates']), pd.DataFrame([trans for ii in range(exon_idx.shape[0])], columns=['Transcripts'])], axis=1)
        trans_list.append(exon_idx)
    
    trans_df = pd.concat(trans_list, axis=0)
    trans_df.index = range(trans_df.shape[0])
    # get_comm_exon_ass is defined outside this block.
    ass = trans_df.groupby(by='Exon_Idx').apply(get_comm_exon_ass).dropna(how='all')
    if ass.shape[0] == 0:
        initial_infered[14] = ['no' for ii in range(initial_infered.shape[0])]
        initial_infered[[0,1,3,5,9,10,12]].to_csv('%s/%s' %(outdir, gene), sep="\t", index=False, header=False)
        return
    
    ass['Exon_Idx'] = ass['Exon_Idx'].astype(int)
    # Label each row as '<last column>,<first column>' of ass.
    ass['Transcripts'] = ass.apply(lambda x: '%s,%s' %(x[-1],x[0]), axis=1)
    new_ass = pd.DataFrame(ass.coordinates.str.split(';').tolist(), index=ass.Transcripts).stack()
    new_ass = new_ass.reset_index([0, 'Transcripts'])
    # BED table of these exon regions; 'chr1' is a placeholder chromosome, also used for the read regions below.
    ass_exon_reg = pd.DataFrame([['chr1']+coord.split('-')+['.'] for coord in new_ass.iloc[:,1].values])
    ass_exon_reg[4] = new_ass['Transcripts']
    ass_exon_reg.to_csv('%s/../.tmp/_%s_ass' %(outdir, gene), sep="\t", index=False, header=False)
    ass_exon_reg_bed = pybedtools.BedTool('%s/../.tmp/_%s_ass' %(outdir, gene))
    
    ass_junc = get_junction(ass, trans_df)
    
    # BED table of the mapped regions of every molecule to correct (column 5, split on ';', '|' and ',').
    aligned_list = []
    for idx in infered_to_correct.index:
        region = pd.DataFrame([['chr1']+reg.split('-')+['.'] for reg in initial_infered.loc[idx][5].replace(';',',').replace('|',',').split(',')])
        region[4] = [idx for i in range(region.shape[0])]
        aligned_list.append(region)
    aligned_reg = pd.concat(aligned_list, axis=0)
    aligned_reg.to_csv('%s/../.tmp/_aligned_region_in_%s' %(outdir, gene), sep="\t", index=False, header=False)
    aligned_reg_bed = pybedtools.BedTool('%s/../.tmp/_aligned_region_in_%s' %(outdir, gene))
    
    tmp = aligned_reg_bed.intersect(ass_exon_reg_bed, wo=True)
    # No overlap at all: write the input through unchanged.
    if os.stat(tmp.fn).st_size==0:
        initial_infered[14] = ['no' for ii in range(initial_infered.shape[0])]
        initial_infered[[0,1,3,5,9,10,12]].to_csv('%s/%s' %(outdir, gene), sep="\t", index=False, header=False)
        
        return
    
    intersect = tmp.to_dataframe()
    intersect = intersect.drop_duplicates()
    # The second-to-last column carries the '<transcript>,<exon index>' label written above.
    intersect[['trans','exon_idx']] = intersect.iloc[:,-2].str.split(',',expand=True)
       
    # Grouping is by 'score', the fifth BED column, which holds the molecule key written above.
    # _get_max_overlap_transcript is defined outside this block.
    trans_idx = intersect.groupby(by='score').apply(_get_max_overlap_transcript, infered_to_correct, ass_junc)
    trans_counts = trans_idx.apply(lambda x: len(x.split(',')))
    
    infered = initial_infered.copy()
    infered.loc[trans_idx.index,9] = trans_idx
    infered.loc[trans_idx.index,10] = trans_counts
    infered[14] = ['no' for i in range(infered.shape[0])]
    infered.loc[trans_idx.index,14] = 'yes'
    # NOTE(review): the yes/no marker in column 14 is not part of the written columns.
    infered_out = infered[[0,1,3,5,9,10,12]]
    
    infered_out.to_csv('%s/%s' %(outdir, gene), sep="\t", index=False, header=False)
   
    return
372 | 
373 | 
def score_junction_mapping(infered_trans, junc_r, ass_junc):
    """Narrow a list of candidate transcripts using junction evidence.

    Parameters
    ----------
    infered_trans : list
        Candidate identifiers; compared against column 0 of the junction
        tables.
    junc_r : list
        Junction strings observed in the read; compared against column 3 of
        the junction tables.
    ass_junc : mapping
        Must provide ``'ass_start'`` and ``'ass_end'``, each convertible to
        a four-column DataFrame.

    Returns
    -------
    list
        ``infered_trans`` unchanged when ``junc_r`` is empty or nothing
        matches; otherwise the identifiers that collected the highest number
        of matching rows across both tables.
    """
    if len(junc_r) == 0:
        return infered_trans

    def _matching_rows(key):
        # Keep only rows of a candidate transcript whose junction was observed.
        table = pd.DataFrame(ass_junc[key], columns=[0, 1, 2, 3])
        table = table.loc[table[0].isin(infered_trans)]
        return table.loc[table[3].isin(junc_r)]

    start_hits = _matching_rows('ass_start')
    end_hits = _matching_rows('ass_end')

    if start_hits.shape[0] == 0 and end_hits.shape[0] == 0:
        return infered_trans

    # Collect, per value of column 1, the identifiers found in column 0.
    grouped_tables = [
        pd.DataFrame(hits.groupby(by=1).apply(lambda grp: list(grp[0].values)), columns=[0])
        for hits in (start_hits, end_hits)
    ]

    votes = []
    for grouped in grouped_tables:
        if grouped.shape[0] > 0:
            # Summing a column of lists concatenates them.
            votes.extend(grouped[0].sum())

    tally = pd.Series(votes).value_counts()
    return list(tally[tally == tally.max()].index)
404 | 
405 | def _get_overlap_len(df):
406 |     
407 |     set_list = [set(range(df.iloc[ii]['start'],df.iloc[ii]['end'])) for ii in range(df.shape[0])]
408 |     base_overlap = len(set.union(*set_list))
409 |     return base_overlap
410 | 
def _get_max_overlap_transcript(x, infered_to_correct, ass_junc):
    """Choose the transcript(s) for one group of intersection rows.

    Parameters
    ----------
    x : pandas.DataFrame
        One group of intersection rows.  The value at row 0, column
        position 4 is used as the row label into ``infered_to_correct``;
        the ``'trans'`` and ``'blockCount'`` columns are read by name.
        ``'blockCount'`` is split on ``','`` into a transcript part and an
        exon-index part.
    infered_to_correct : pandas.DataFrame
        Column ``9`` holds a comma-separated transcript list and column
        ``5`` the aligned-region string of the record.
    ass_junc : mapping
        Junction tables forwarded to ``score_junction_mapping``.

    Returns
    -------
    str
        Comma-joined transcript identifiers.
    """
    record = infered_to_correct.loc[x.iloc[0, 4]]

    def _refine_with_junctions(candidates):
        # '-'-separated pieces that contain a comma are passed on as junctions.
        junctions = [piece for piece in record[5].split('-') if re.search(',', piece)]
        return score_junction_mapping(candidates, junctions, ass_junc)

    initial = list(set(record[9].split(',')))
    hits = x.loc[x['trans'].isin(initial)]
    if hits.shape[0] == 0:
        # None of the initially inferred transcripts overlaps: junctions only.
        return ','.join(_refine_with_junctions(initial))

    # Covered length per 'blockCount' value, then split that label into
    # its transcript and exon-index parts.
    covered = pd.DataFrame(hits.groupby(by='blockCount').apply(_get_overlap_len), columns=['len'])
    label_parts = pd.DataFrame(covered.index, index=covered.index)['blockCount'].str.split(',', expand=True)
    covered = pd.concat([covered, label_parts], axis=1)
    covered.columns = ['len', 'trans', 'exon_idx']

    # For every exon index keep the rows with the largest covered length.
    best = covered.groupby(by='exon_idx').apply(lambda grp: grp.loc[grp['len'] == np.max(grp['len'])])
    # Rename index levels so 'exon_idx' refers only to the column below.
    best.index.names = ['idx1', 'idx2']
    per_exon = best.groupby(by='exon_idx').apply(lambda grp: list(set(grp['trans'].values)))
    n_per_exon = per_exon.apply(len)
    # Keep the transcripts of the exon indices with the fewest best matches.
    chosen = list(set(per_exon[n_per_exon[n_per_exon == np.min(n_per_exon)].index].sum()))

    if len(chosen) > 1:
        chosen = _refine_with_junctions(chosen)

    return ','.join(chosen)
437 | 
438 | 
def get_comm_exon_ass(df):
    """Return *df* only when it holds more than one distinct coordinate set.

    ``None`` is returned when *df* has exactly one row, or when every value
    in its ``'coordinates'`` column is the same; otherwise *df* itself is
    returned unchanged.
    """
    has_single_row = df.shape[0] == 1
    # The row-count check short-circuits, so 'coordinates' is only read
    # for frames that do not consist of exactly one row.
    if has_single_row or len(set(df['coordinates'].values)) == 1:
        return None
    return df
445 | 
def get_isoforms(conf, indir, ref):
    """Run per-gene isoform inference and the follow-up correction step.

    Steps, in order:

    1. Self-intersect ``<indir>/gene.gff`` (strand-aware) with bedtools and
       derive the overlapping-gene mapping from the result.
    2. Run ``_run_isoform`` in a process pool for every gene of ``ref`` that
       has no output file below ``<indir>/.R1`` yet.
    3. Run ``isoform_inference_correction_by_ass_v2`` in a process pool for
       every ``.R1`` output whose counterpart below
       ``<indir>/assigned_isoforms`` is missing or empty.
    4. Remove the ``.tmp``, ``.tempDir`` and ``.R1`` working directories.

    Parameters
    ----------
    conf : mapping
        Reads ``conf['annotation']['zumi_keptbarcode']`` (comma-separated
        table whose first column holds the kept barcodes) and
        ``conf['expression']['nproc']`` (pool size).
    indir : str
        Working directory containing ``gene.gff``.
    ref : pandas.DataFrame
        Reference isoform table; its ``'Gene'`` column lists the genes to
        process.  It is also published as the module-level ``ref_iso``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``bedtools intersect`` exits with a non-zero status.
    """
    # Imported locally so this block does not depend on the module's import list.
    import shutil

    global ref_iso
    ref_iso = ref

    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through verbatim, and a failing bedtools
    # run is reported instead of leaving an empty file behind.
    gene_gff = '%s/gene.gff' % indir
    merged_gff = '%s/gene_merged.gff' % indir
    with open(merged_gff, 'w') as merged_handle:
        subprocess.run(
            ['bedtools', 'intersect', '-s', '-a', gene_gff, '-b', gene_gff, '-wo'],
            stdout=merged_handle,
            check=True,
        )
    df = pd.read_table(merged_gff, header=None, sep="\t", index_col=None)
    overlaped_gene_dict = get_overlapping_genes(df)

    outdir = '%s/.R1' % indir
    os.makedirs(outdir, exist_ok=True)

    kept_cell_BCs = list(pd.read_table(conf['annotation']['zumi_keptbarcode'], header=0, index_col=0, sep=",").index)
    ref_iso_dict = convert_ref_to_dict(ref_iso)
    nproc = int(conf['expression']['nproc'])

    # Skip genes that already have an output file from a previous run.
    genes = list(set(ref_iso['Gene'].values))
    gene_files = glob.glob('%s/.R1/*/*' % indir)
    infered_genes = [val.split('/')[-1] for val in gene_files if not re.search('_log', val)]
    remain_genes = list(set(genes) - set(infered_genes))
    print('%s remaining files' % (len(remain_genes)))

    # Create / truncate the log file.
    open('%s/_log' % outdir, 'w').close()

    func = partial(_run_isoform, indir, outdir, ref_iso_dict, kept_cell_BCs, conf, overlaped_gene_dict)
    pool = mp.Pool(processes=nproc)
    try:
        pool.map(func, remain_genes, chunksize=1)
    finally:
        # Always release the worker processes, even if a task failed.
        pool.close()
        pool.join()

    gene_files = glob.glob('%s/.R1/*/*' % indir)
    infered_gene_paths = [val for val in gene_files if not re.search('_log', val)]
    outdir = '%s/assigned_isoforms' % indir
    os.makedirs(outdir, exist_ok=True)

    def _needs_correction(gene_file):
        # The corrected file lives at the same path with '.R1' replaced.
        target = gene_file.replace('.R1', 'assigned_isoforms')
        return not os.path.exists(target) or os.stat(target).st_size == 0

    infered_gene_paths = [gene_file for gene_file in infered_gene_paths if _needs_correction(gene_file)]
    print('%s remaining files' % (len(infered_gene_paths)))

    os.makedirs('%s/.tmp' % outdir, exist_ok=True)
    func = partial(isoform_inference_correction_by_ass_v2, indir, ref_iso_dict, outdir)
    pool = mp.Pool(processes=nproc)
    try:
        pool.map(func, infered_gene_paths, chunksize=1)
    finally:
        pool.close()
        pool.join()

    # Best-effort cleanup of working directories (missing ones are ignored).
    for leftover in ('%s/.tmp' % outdir, '%s/.tempDir' % indir, '%s/.R1' % indir):
        shutil.rmtree(leftover, ignore_errors=True)

    return
499 | 


--------------------------------------------------------------------------------
/ss3iso/pyModule/reference.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Developer: Ping Chen
  3 | # Contact: ping.chen@ki.se
  4 | # Date: 2020-01-10
  5 | # Version: 0.1.3
  6 | 
  7 | import re
  8 | import os
  9 | import subprocess
 10 | import pysam
 11 | import pandas as pd
 12 | from collections import defaultdict
 13 | import numpy as np
 14 | import multiprocessing as mp
 15 | from functools import partial
 16 | import pybedtools
 17 | import glob
 18 | import warnings
 19 | from .informative_reads import *
 20 | 
 21 | def _ref_transcript_struc(df, total_n_ex, gene_id):
 22 |     
 23 |     coordinates = df.groupby(by="blockCount").apply(lambda x: ';'.join(list(x.apply(lambda y: '%s-%s' %(y[1],y[2]), axis=1).values)))
 24 |     ex_idx = list(coordinates.index)
 25 |     ex_idx.sort()
 26 |     coordinates = coordinates[ex_idx]
 27 |     junc = ['%s^%s' %(ex_idx[ii], ex_idx[ii+1]) for ii in range(len(ex_idx)-1)]
 28 |     out = [gene_id, ','.join(map(str,ex_idx)), ','.join(list(coordinates.values)), ','.join(junc), str(total_n_ex)]
 29 |     
 30 |     return pd.Series(out)
 31 | 
 32 | def _build_gene_ref(indir, outdir, gene_info, gene):
 33 | 
 34 |     print(gene)
 35 |     os.system('echo "%s" >> %s/_log' %(gene, outdir))
 36 | 
 37 |     rcds = '\t'.join(map(str,gene_info.loc[gene].values))
 38 |     
 39 |     obj = geneObj(None, None, indir)
 40 |     obj.get_exon_coordinates(rcds.strip())
 41 |     obj.outdir = outdir
 42 |     
 43 |     curr_df = sm.query('gene=="%s"' %(obj.gene)).iloc[:,[0,1,2,3,5]]
 44 |     if curr_df.shape[0]==0: return None 
 45 |     curr_df.to_csv('%s/_%s' %(outdir, obj.gene), sep="\t", index=False, header=False)
 46 |     
 47 |     gene_bed = pybedtools.BedTool('%s/_%s' %(outdir, obj.gene))
 48 |     tmp = gene_bed.intersect(obj.ex_bed, wa=True, wb=True)
 49 |     intersect = tmp.to_dataframe()
 50 |     res = intersect.groupby(by="score").apply(_ref_transcript_struc, obj.ex_bed.to_dataframe()['score'].max(), obj.gene)
 51 |     
 52 |     return res
 53 | 
 54 |    
 55 | def build_reference(conf, indir):
 56 |     
 57 |     outdir = '%s/.reference' %(indir)
 58 |     if not os.path.exists(outdir): os.makedirs(outdir)
 59 |     
 60 |     gene_info = pd.read_table('%s/gene.gff' %(indir), header=None, index_col=None, sep='\t')
 61 |     gene_info.index = gene_info.apply(lambda x: x[8].split(';')[0].split('=')[1], axis=1)
 62 |     
 63 |     if conf['annotation']['gtf_source'] == 'ensembl':
 64 |         t_idx = 3
 65 |     else:
 66 |         t_idx = 2
 67 |         
 68 |     exon_ref = pd.read_table('%s/exon.gff' %(indir), header=None, index_col=None, sep="\t")
 69 |     genes = pd.DataFrame([val.split(';')[1].split('=')[1] for val in exon_ref.iloc[:,8].values])
 70 |     trans = pd.DataFrame([val.split(';')[t_idx].split('=')[1] for val in exon_ref.iloc[:,8].values])
 71 |     annot = pd.concat([genes,trans], axis=1)
 72 |     
 73 |     global sm
 74 |     sm = exon_ref.iloc[:,[0,3,4,6]]
 75 |     sm = pd.concat([sm, annot], axis=1)
 76 |     sm.columns = ['chrom','start','end','strand','gene','transcript']
 77 |     sm['start'] = sm['start'] - 1
 78 |     
 79 |     keptfiles = glob.glob('%s/keptReads/*/*.csv' %(indir))
 80 |     genes = ['_'.join(val.split('/')[-1].split('_')[:-2]) for val in keptfiles]
 81 |     chrom_dict = {'_'.join(val.split('/')[-1].split('_')[:-2]): val.split('/')[-2] for val in keptfiles}
 82 |     os.system('> %s/_log' %(outdir))
 83 |     
 84 |     pool = mp.Pool(processes=int(conf['expression']['nproc']))
 85 |     func = partial(_build_gene_ref, indir, outdir, gene_info)
 86 |     results = pool.map(func, genes, chunksize=1)
 87 |     
 88 |     filtered = [item for item in results if item is not None]
 89 |     
 90 |     out_df = pd.concat(filtered, axis=0)
 91 |     out_df.index.name = 'Transcript'
 92 |     out_df.reset_index(inplace=True)
 93 |     out_df.columns = ['Transcript','Gene','Exon_Index','Exon_Loc','Junction','Total_n_exons']
 94 |     
 95 |     out_chr = pd.DataFrame([chrom_dict[gene] for gene in out_df['Gene'].values], columns=['chrom'])
 96 |     ref_iso = pd.concat([out_df, out_chr], axis=1)
 97 |     
 98 |     p = subprocess.Popen('rm -rf %s/../.tmp/*' %(outdir), shell=True)
 99 |     (output, err) = p.communicate()  
100 |     
101 |     p = subprocess.Popen('rm -rf %s/../.tempDir' %(outdir), shell=True)
102 |     (output, err) = p.communicate()  
103 |     
104 |     os.system('mkdir %s/../.tempDir' %(outdir))
105 |     
106 |     p = subprocess.Popen('rm -rf %s' %(outdir), shell=True)
107 |     (output, err) = p.communicate()  
108 |     
109 |     return ref_iso


--------------------------------------------------------------------------------
/ss3iso/ss3_isoform.conf:
--------------------------------------------------------------------------------
 1 | # Configuration file for Smartseq3 isoform reconstruction pipeline #
 2 | 
 3 | [preprocess]
 4 | memory=2G
 5 | 
 6 | [genome]
 7 | hg38_fasta=GRCh38.primary_assembly.genome.fa
 8 | mm10_fasta=GRCm38.primary_assembly.genome.fa
 9 | 
10 | [annotation]
11 | zumi_keptbarcode=/path/to/keptbarcode.txt
12 | 
13 | gtf_source=ensembl
14 | hg38_ensembl_gtf=/path/to/ensembl human gtf file
15 | mm10_ensembl_gtf=/path/to/ensembl mouse gtf file
16 | 
17 | [expression]
18 | nproc=50
19 | n_read_limit=1000000
20 | min_n_reads=2
21 | 


--------------------------------------------------------------------------------
/ss3iso/ss3_isoform.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # Developer: Ping Chen
  3 | # Contact: ping.chen@ki.se
  4 | # Date: 2020-01-10
  5 | # Version: 0.1.3
  6 | 
  7 | # ------------------------------------------------ #
  8 | #      SS3 isoform reconstruction pipeline         #
  9 | # ------------------------------------------------ #
 10 | import os
 11 | from optparse import OptionParser
 12 | import glob
 13 | import configparser
 14 | import re
 15 | from pyModule.informative_reads import *
 16 | from pyModule.reference import *
 17 | from pyModule.isoform_reconstruct import *
 18 | import pybedtools
 19 | 
 20 | def main():
 21 |     
 22 |     parser=OptionParser()
 23 |     
 24 |     parser.add_option('-i', '--inputBAM', dest='inputBAM', 
 25 |                       help='Aligned BAM from zUMI filtering+mapping steps with cell barcode and umi barcode correction.')
 26 |     
 27 |     parser.add_option('-c', '--config', dest='config', 
 28 |                       help='A configuration file for required files and parameters.')
 29 |     
 30 |     parser.add_option('-e', '--experiment', dest='experiment', 
 31 |                       help='Experiment name.')
 32 |     
 33 |     parser.add_option('-o', '--outputDir', dest='outputDir', default='ss3rnaseq',
 34 |                       help='The output directory for the experiment.')
 35 |     
 36 |     parser.add_option('-p', '--process', dest='process', default=8,
 37 |                       help='The number of processes for parallel computing.')
 38 |     
 39 |     parser.add_option('-s', '--species', dest='species', default='hg38',
 40 |                       help='The species under study.')
 41 |     
 42 |     parser.add_option("-P", "--Preprocess", action="store_true", dest='preprocess',
 43 |                       help="Preprocess the input BAM for downstream analysis.")
 44 |     
 45 |     parser.add_option("-R", "--Reconstruction", action="store_true", dest='reconstruction',
 46 |                       help="Run isoform reconstruction.")
 47 | 
 48 | 
 49 |     (op, args) = parser.parse_args()
 50 |     inputBAM = op.inputBAM
 51 |     conf = op.config
 52 |     experiment = op.experiment
 53 |     outdir = op.outputDir
 54 |     nprocess = int(op.process)
 55 | 
 56 |     if op.species == 'hg38' or op.species == 'hg19': species = 'hsa'
 57 |     elif op.species == 'mm9' or op.species == 'mm10': species = 'mmu'
 58 |     
 59 |     config = configparser.ConfigParser()
 60 |     config.read(conf)
 61 |     conf_data = config._sections
 62 |     
 63 |     if not os.path.exists(outdir): os.makedirs(outdir)
 64 |     if not os.path.exists('%s/%s' %(outdir, species)): os.makedirs('%s/%s' %(outdir, species))
 65 |     if not os.path.exists('%s/%s/%s' %(outdir, species, experiment)): os.makedirs('%s/%s/%s' %(outdir, species, experiment))
 66 |     
 67 |     umi_file_prefix = 'UBfix.sort.bam'
 68 |     if op.preprocess:
 69 |         print('Preprocessing on input BAM ...')
 70 |         preDir = os.path.join(outdir, species, experiment, "preprocess")
 71 |         if not os.path.exists(preDir): os.makedirs(preDir)
 72 |         
 73 |         cmd = 'samtools sort -m %s -O bam -@ %s -o %s/%s %s' %(conf_data['preprocess']['memory'], nprocess, preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), inputBAM)
 74 |         os.system(cmd)
 75 |         
 76 |         cmd = 'samtools view -b -q 255 %s/%s > %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM)))
 77 |         os.system(cmd)
 78 |         
 79 |         cmd = 'samtools view -h %s/%s | awk \'$12 != "NH:i:1"\' | samtools view -bS - > %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM)))
 80 |         os.system(cmd)
 81 |         
 82 |         os.system('samtools index %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM))))
 83 |         os.system('samtools index %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM))))
 84 | 
 85 |     if op.reconstruction:
 86 |         
 87 |         print('Collect informative reads per gene...')
 88 |         in_bam_uniq = '%s/%s' %(os.path.join(outdir, species, experiment, "preprocess"), re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM)))
 89 |         in_bam_multi = '%s/%s' %(os.path.join(outdir, species, experiment, "preprocess"), re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM)))
 90 |     
 91 |         out_path = os.path.join(outdir, species, experiment, "isoforms_%s" %(conf_data['annotation']['gtf_source']))
 92 |         if not os.path.exists(out_path): os.makedirs(out_path)
 93 |         
 94 |         sys_tmp_dir = '%s/.tmp' %(out_path)
 95 |         if not os.path.exists(sys_tmp_dir): os.makedirs(sys_tmp_dir)
 96 |         pybedtools.set_tempdir(sys_tmp_dir)
 97 |         pybedtools.cleanup(remove_all=True)
 98 |         
 99 |         fetch_gene_reads(in_bam_uniq, in_bam_multi, conf_data, op.species, out_path)
100 |         
101 |         print('Build reference isoforms...')
102 |         ref = build_reference(conf_data, out_path)
103 |         
104 |         print('Start isoform reconstruction...')
105 |         get_isoforms(conf_data, out_path, ref)
106 |         
107 | 
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
110 | 
111 |     
112 | 


--------------------------------------------------------------------------------