├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── cwl └── apps │ └── EricScript.json ├── ericscript-0.5.5.tar.bz2 ├── ericscript.pl └── lib ├── R ├── BuildExonUnionModel.R ├── BuildFasta.R ├── BuildNeighbourhoodSequences.R ├── CalcBreakpointPositions.R ├── CalcStats.R ├── CheckDB.R ├── CheckSelfHomology.R ├── ConvertTxt2R.R ├── CreateDataEricTheSimulator.R ├── DownloadDB.R ├── EstimateSpanningReads.R ├── ExtractInsertSize.R ├── ImportResults.R ├── MakeAdjacencyMatrix.R ├── MakeEmptyResults.R ├── MakeResults.R ├── RecalibrateJunctions.R ├── RetrieveRefId.R ├── SimulateFusions.R └── UpdateDB.R ├── bash ├── BuildSeq.sh ├── Ftp2Ensembl.sh └── RunEric.sh ├── data └── _resources │ ├── BlackList.RData │ └── DataModel.RData ├── demo ├── myreads_1.fq.gz └── myreads_2.fq.gz └── perl ├── retrievefrombiomart.pl ├── trimfq.pl └── xa2multi.pl /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *.Rproj 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | WORKDIR /opt 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | libcurl4-gnutls-dev \ 7 | libgnutls-dev \ 8 | python \ 9 | python-pip \ 10 | python-dev \ 11 | build-essential \ 12 | libncurses5-dev \ 13 | libncursesw5-dev \ 14 | pypy \ 15 | git \ 16 | wget \ 17 | r-base \ 18 | zlib1g-dev \ 19 | samtools 20 | 21 | RUN pip install --upgrade pip 22 | RUN pip install --upgrade virtualenv 23 | 24 | RUN mkdir /opt/bin 25 | ENV PATH /opt/bin:$PATH 26 | 27 | # Install the R package ada (R itself comes from r-base above) 28 | RUN wget -O /tmp/ada_2.0-5.tar.gz https://cran.r-project.org/src/contrib/ada_2.0-5.tar.gz && \ 29 | R CMD INSTALL /tmp/ada_2.0-5.tar.gz 30 | 31 | # Install BWA; && aborts the build if any step fails 32 | RUN git clone https://github.com/lh3/bwa.git && \ 33 | cd bwa && \ 34 | make && \ 35 | ln -s /opt/bwa/bwa /opt/bin/ 36 | 37 | # Install seqtk 38 
| RUN git clone https://github.com/lh3/seqtk.git && \ 39 | cd seqtk && \ 40 | make && \ 41 | ln -s /opt/seqtk/seqtk /opt/bin/ 42 | 43 | # Install bedtools 44 | RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.26.0/bedtools-2.26.0.tar.gz && \ 45 | tar xvfz bedtools-2.26.0.tar.gz && \ 46 | cd bedtools2 && \ 47 | make && \ 48 | ln -s /opt/bedtools2/bin/bedtools /opt/bin/ 49 | 50 | # Install BLAT 51 | RUN wget -P /opt/bin/ http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v287/blat/blat && \ 52 | chmod +x /opt/bin/blat 53 | 54 | # Install ericscript 55 | RUN git clone https://github.com/cgrlab/EricScript.git && \ 56 | chmod +x EricScript/ericscript.pl 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price.
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <http://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <http://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## EricScript Readme v2.1 (Feb 2016) 2 | Please feel free to email the author if you have any questions or issues. 3 | matteo.benelli AT gmail.com 4 | 5 | ### INFORMATION 6 | EricScript is a software package written in R, Perl and bash. 7 | EricScript uses the BWA aligner to perform the mapping on the transcriptome reference and samtools to handle SAM/BAM files. Recalibration of the exon junction reference is performed using BLAT.
8 | 9 | 10 | ### REQUIREMENTS 11 | Download and install R: http://cran.r-project.org/ 12 | Download and install the "ada" R package: http://cran.r-project.org/web/packages/ada/index.html 13 | Download and install BWA: http://bio-bwa.sourceforge.net 14 | Download and install SAMtools (>0.1.17): http://samtools.sourceforge.net/ 15 | Download and install bedtools (>2.15): http://code.google.com/p/bedtools/ 16 | Download and install BLAT binaries: http://genome-test.cse.ucsc.edu/~kent/exe/ 17 | Download and install seqtk: https://github.com/lh3/seqtk 18 | Make sure that all of these programs are included in your PATH. 19 | 20 | ### RUNNING ERIC 21 | 22 | Once you have downloaded EricScript, extract the package: 23 | 24 | tar -xjf ericscript.tar.bz2 25 | 26 | Copy the program folder to your preferred location. Before running EricScript for the first time, you need to make ericscript.pl executable: 27 | 28 | chmod +x /PATH/TO/ERIC/ericscript.pl 29 | 30 | To get information about running EricScript, type: 31 | 32 | /PATH/TO/ERIC/ericscript.pl --help 33 | 34 | In order to perform chimeric transcript detection, you need to download and build the Ensembl Database of a genome. To list the available genomes, type: 35 | 36 | /PATH/TO/ERIC/ericscript.pl --printdb 37 | 38 | After a reference id is selected, you need to download and build the corresponding Ensembl Database. The example below shows how to prepare the database for Saccharomyces cerevisiae.
39 | 40 | /PATH/TO/ERIC/ericscript.pl --downdb --refid saccharomyces_cerevisiae -db /PATH/TO/YOUR/DBFOLDER 41 | 42 | You can also select a specific ensembl release (>= 70) to download 43 | 44 | /PATH/TO/ERIC/ericscript.pl --downdb --refid saccharomyces_cerevisiae -db /PATH/TO/YOUR/DBFOLDER --ensversion 74 45 | 46 | To run EricScript with default parameters (if parameter "refid" is not specified the analysis takes the homo sapiens species as default): 47 | 48 | /PATH/TO/ERIC/ericscript.pl -db /PATH/TO/YOUR/DBFOLDER --refid saccharomyces_cerevisiae -name SAMPLENAME -o /PATH/TO/OUTPUT/ YOUR_FASTQ_1 YOUR_FASTQ_2 49 | 50 | You can check if your database is up-to-date by the following: 51 | 52 | /PATH/TO/ERIC/ericscript.pl --checkdb 53 | 54 | ### OUTPUT FILES 55 | 56 | The /PATH/TO/OUTPUT/ folder contains the results of the analysis. Predicted gene fusion products are reported in 2 files: 57 | samplename.results.total.csv: contains all the predicted gene fusions. 58 | samplename.results.filtered.csv: contains the predicted gene fusions with EricScore > 0.50. 
59 | -------------------------------------------------------------------------------- /ericscript-0.5.5.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/ericscript/edd6f8b5cd80f9c828f5b8f54a46ae4b2e6648d5/ericscript-0.5.5.tar.bz2 -------------------------------------------------------------------------------- /lib/R/BuildExonUnionModel.R: -------------------------------------------------------------------------------- 1 | vars.tmp <- commandArgs() 2 | vars <- vars.tmp[length(vars.tmp)] # the last argument packs the inputs as "ericscriptfolder,refid,dbfolder,tmpfolder" 3 | split.vars <- unlist(strsplit(vars, ",")) 4 | ericscriptfolder <- split.vars [1] 5 | refid <- split.vars[2] 6 | dbfolder <- split.vars [3] 7 | tmpfolder <- split.vars [4] 8 | 9 | # Split the sequence string 'myfasta' into consecutive chunks of at most 'step' characters (one FASTA line each). 10 | formatfasta <- function(myfasta, step = 50) { 11 | totalchar <- nchar(myfasta) 12 | if (totalchar > step) { 13 | steps <- seq(1, totalchar, by = step) 14 | newfasta <- rep("", (length(steps) - 1)) 15 | for (j in 1: (length(steps) - 1)) { 16 | aa <- substr(myfasta, steps[j], (steps[j] + (step - 1))) 17 | newfasta[j] <- aa 18 | } 19 | if ((totalchar - tail(steps, n = 1)) >= 0) { # '>=': the last chunk may be a single character and must still be emitted 20 | newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar)) 21 | } 22 | } else 23 | { 24 | newfasta <- substr(myfasta, 1, totalchar) 25 | } 26 | return(newfasta) 27 | } 28 | # Reverse-complement a DNA string (input is upper-cased first); characters other than A/C/G/T become "N". 29 | convertToComplement <- function(x) { 30 | 31 | bases <- c("A", "C", "G", "T") 32 | #xx <- unlist(strsplit(toupper(x), NULL)) 33 | xx <- rev(unlist(strsplit(toupper(x), NULL))) 34 | paste(unlist(lapply(xx, function(bbb) { 35 | if (bbb=="A") compString <- "T" 36 | if (bbb=="C") compString <- "G" 37 | if (bbb=="G") compString <- "C" 38 | if (bbb=="T") compString <- "A" 39 | if (!bbb %in% bases) compString <- "N" 40 | return(compString) 41 | })), collapse="") 42 | 43 | } 44 | 45 | refid.folder <- file.path(dbfolder, "data", refid) 46 | if (file.exists(refid.folder) == F) { 47 | dir.create(refid.folder) 48 | } 49 | x <- scan(file.path(tmpfolder, "subseq.fa"), what = 
"", quiet = T) 50 | x.bed <- read.delim(file.path(tmpfolder, "exonstartend.mrg.txt"), sep = "\t", header = F) 51 | refid.bed <- paste(as.character(x.bed[[1]]), paste((as.numeric(as.character(x.bed[[2]])) + 1), as.character(x.bed[[3]]), sep = "-"), sep = ":") # region ids "col1:(col2+1)-col3", compared with the FASTA headers below 52 | tmp <- grep(">", x) 53 | genomicreg <- substr(x[tmp], 2, nchar(x[tmp])) 54 | sequences.tmp <- rep("", length(tmp)) 55 | for (i in seq_len(length(tmp) - 1)) { # seq_len: no iteration when the file holds a single record (1:0 would run with i = 1 and i = 0) 56 | sequences.tmp[i] <- gsub(", ", "", toString(x[(tmp[i] + 1):(tmp[i+1] - 1)])) 57 | } 58 | sequences.tmp[length(tmp)] <- gsub(", ", "", toString(x[(tmp[length(tmp)] + 1): length(x)])) 59 | genenames.tmp <- as.character(x.bed[[4]]) 60 | unique.genenames <- unique(genenames.tmp) 61 | strand.tmp <- read.delim(file.path(tmpfolder, "strand.txt"), sep = "\t", header = F) 62 | strand <- strand.tmp[[2]] 63 | sequences <- rep("", length(unique.genenames)) 64 | for (i in 1: length(unique.genenames)) { 65 | genenames1 <- paste(">", unique.genenames[i], sep = "") 66 | ix.gene <- which(genenames.tmp == unique.genenames[i]) 67 | ix.refid <- which(genomicreg %in% refid.bed[ix.gene]) 68 | if (strand[i] == "-1") { # NOTE(review): strand is indexed by gene position; presumably strand.txt has one row per unique gene in this order - confirm 69 | seqtmp0 <- gsub(", ", "", toString(sequences.tmp[ix.refid])) 70 | sequences[i] <- convertToComplement(seqtmp0) 71 | } else { 72 | sequences[i] <- gsub(", ", "", toString(sequences.tmp[ix.refid])) 73 | } 74 | if (nchar(sequences[i]) == 0) { # an empty sequence is replaced by a 7-N placeholder 75 | sequences[i] <- "NNNNNNN" 76 | } 77 | 78 | if (i == 1) { 79 | cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = F, sep = "\n") 80 | } else { 81 | cat(genenames1, file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n") 82 | } 83 | cat(formatfasta(sequences[i]), file = file.path(refid.folder, "EnsemblGene.Reference.fa"), append = T, sep = "\n") 84 | } 85 | ix.emptyseq <- which(nchar(sequences) == 0) # empty entries were already replaced by the placeholder above, so this filter is only a safeguard 86 | GeneNames <- unique.genenames 87 | if (length(ix.emptyseq) > 0) { 88 | GeneNames <- GeneNames[-ix.emptyseq] 89 | sequences <- sequences[-ix.emptyseq] 90 
| } 91 | save(GeneNames, file = file.path(refid.folder, "EnsemblGene.GeneNames.RData")) 92 | save(sequences, file = file.path(refid.folder, "EnsemblGene.Sequences.RData")) 93 | 94 | -------------------------------------------------------------------------------- /lib/R/BuildFasta.R: -------------------------------------------------------------------------------- 1 | vars.tmp <- commandArgs() 2 | vars <- vars.tmp[length(vars.tmp)] # the last argument packs the inputs as "samplename,outputfolder,ericscriptfolder,readlength,refid,dbfolder" 3 | split.vars <- unlist(strsplit(vars, ",")) 4 | samplename <- split.vars [1] 5 | outputfolder <- split.vars[2] 6 | ericscriptfolder <- split.vars[3] 7 | readlength <- max(as.numeric(split.vars[4])) 8 | refid <- as.character(split.vars[5]) 9 | dbfolder <- as.character(split.vars[6]) 10 | 11 | # Split the sequence string 'myfasta' into consecutive chunks of at most 'step' characters (one FASTA line each). 12 | formatfasta <- function(myfasta, step = 50) { 13 | 14 | totalchar <- nchar(myfasta) 15 | if (totalchar > step) { 16 | steps <- seq(1, totalchar, by = step) 17 | newfasta <- rep("", (length(steps) - 1)) 18 | for (j in 1: (length(steps) - 1)) { 19 | aa <- substr(myfasta, steps[j], (steps[j] + (step - 1))) 20 | newfasta[j] <- aa 21 | } 22 | if ((totalchar - tail(steps, n = 1)) >= 0) { # '>=': the last chunk may be a single character and must still be emitted 23 | newfasta <- c(newfasta, substr(myfasta, steps[j+1], totalchar)) 24 | } 25 | } else 26 | { 27 | newfasta <- substr(myfasta, 1, totalchar) 28 | } 29 | return(newfasta) 30 | } 31 | 32 | 33 | # the sample's .chimeric.RData is loaded once, after the database files (it was previously loaded here as well) 34 | load(file.path(dbfolder, "data", refid, "EnsemblGene.GeneNames.RData")) 35 | load(file.path(dbfolder, "data", refid, "EnsemblGene.Sequences.RData")) 36 | load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData")) 37 | load(file.path(outputfolder, "out", paste(samplename,".chimeric.RData", sep = ""))) 38 | id1 <- MyGF$id1 39 | id2 <- MyGF$id2 40 | junctions <- rep(NA, length(id1)) 41 | ids_fasta <- rep("", length(id1)) 42 | sequences.fasta <- rep("", length(id1)) 43 | fasta.file <- c() 44 | maxgap <- 300 45 | for (i in 1: length(id1)) { 46 | ix.genetable1 <- 
which(EnsemblGene.Structures$EnsemblGene == id1[i]) 47 | ix.genetable2 <- which(EnsemblGene.Structures$EnsemblGene == id2[i]) 48 | ix.gene1 <- which(GeneNames == id1[i]) 49 | ix.gene2 <- which(GeneNames == id2[i]) 50 | min.pos1 <- min(MyGF$pos1[[i]]) - 2*readlength # window start: two read lengths before the smallest gene-1 position 51 | max.pos1 <- max(MyGF$pos1[[i]]) + readlength - 1 52 | if (min.pos1 < 1) {min.pos1 <- 1} 53 | min.pos2 <- min(MyGF$pos2[[i]]) 54 | max.pos2 <- max(MyGF$pos2[[i]]) + 2*readlength 55 | a <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable1]), ","))) 56 | b <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable1]), ","))) 57 | strand1 <- as.character(EnsemblGene.Structures$Strand[ix.genetable1]) 58 | if (strand1 == "+") { 59 | tmp.sum1 <- cumsum((b - a )) 60 | } else { 61 | tmp.sum1 <- cumsum(rev(b - a)) 62 | } 63 | exonenumber1 <- which(tmp.sum1 >= max.pos1)[1] # first exon whose cumulative length reaches max.pos1 64 | if (is.na(exonenumber1)) {exonenumber1 <- length(tmp.sum1)} 65 | a2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.genetable2]), ","))) 66 | b2 <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.genetable2]), ","))) 67 | strand2 <- as.character(EnsemblGene.Structures$Strand[ix.genetable2]) 68 | if (strand2 == "+") { 69 | tmp.sum2 <- cumsum((b2 - a2)) 70 | } else { 71 | tmp.sum2 <- cumsum(rev(b2 - a2)) 72 | } 73 | exonenumber2 <- which(tmp.sum2 >= min.pos2)[1] 74 | if (is.na(exonenumber2)) {exonenumber2 <- length(tmp.sum2)} 75 | id.gf1 <- paste(id1[i], exonenumber1, sep = "_") 76 | fasta.gf1.tmp0 <- sequences[ix.gene1] 77 | start.end.exons <- c(0,tmp.sum1) 78 | fasta.gf1 <- substr(fasta.gf1.tmp0, min.pos1, (max.pos1 + maxgap - 1)) 79 | id.gf2 <- paste(id2[i], exonenumber2, sep = "_") 80 | fasta.gf2.tmp0 <- sequences[ix.gene2] 81 | if (max.pos2 > nchar(fasta.gf2.tmp0)) {max.pos2 <- nchar(fasta.gf2.tmp0)} 82 | start.end.exons <- c(0,tmp.sum2) 83 | fasta.gf2 <- substr(fasta.gf2.tmp0, (min.pos2 - maxgap), max.pos2) 84 | 
id.fastaGF <- paste(">",id.gf1,"----",id.gf2," junction@",nchar(fasta.gf1),sep = "") 85 | sequences.fasta[i] <- paste(fasta.gf1, fasta.gf2, sep = "") 86 | fasta.gf12 <- formatfasta(sequences.fasta[i]) 87 | ids_fasta[i] <- paste(id.gf1,id.gf2, sep = "----") 88 | junctions[i] <- nchar(fasta.gf1) 89 | fastaGF <- c(id.fastaGF, fasta.gf12) 90 | fasta.file <- c(fasta.file, fastaGF) 91 | } 92 | save(junctions, file = file.path(outputfolder, "out", paste(samplename,".junctions.RData", sep = ""))) 93 | save(sequences.fasta, file = file.path(outputfolder, "out", paste(samplename,".sequences_fasta.RData", sep = ""))) 94 | save(ids_fasta, file = file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = ""))) 95 | cat(fasta.file, file = file.path(outputfolder,"out", paste(samplename,".EricScript.junctions.fa",sep = "")), sep = "\n") 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /lib/R/BuildNeighbourhoodSequences.R: -------------------------------------------------------------------------------- 1 | vars.tmp <- commandArgs() 2 | vars <- vars.tmp[length(vars.tmp)] # the last argument packs the inputs as "samplename,outputfolder" 3 | split.vars <- unlist(strsplit(vars, ",")) 4 | samplename <- split.vars [1] 5 | outputfolder <- split.vars[2] 6 | z <- read.delim(file.path(outputfolder,"out",paste(samplename,".intervals.pileup", sep = "")), sep = "\t", header = F) 7 | load(file.path(outputfolder,"out",paste(samplename,".ids_filtered.RData", sep = ""))) 8 | load(file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = ""))) 9 | load(file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = ""))) 10 | id.pileup <- as.character(z[,1]) 11 | pos.pileup <- as.numeric(as.character(z[,2])) 12 | sequence.pileup <- as.character(z[,3]) 13 | unique.ids.pileup <- unique(id.pileup) 14 | width <- 100 # length of the sequence window built around each junction 15 | fasta.file <- c() 16 | for (i in 1:length(id.filtered)) { 17 | ix.id <- which(id.pileup == id.filtered[i]) 18 | ix.ref <- 
which(ids_fasta == id.filtered[i]) 19 | junction <- junctions.recalibrated[ix.ref] 20 | ix.id.pileup <- which(id.pileup == id.filtered[i]) 21 | ix.junction1 <- which(pos.pileup[ix.id.pileup] == junction) 22 | ix.junction2 <- which(pos.pileup[ix.id.pileup] == (junction + 1)) 23 | seq.vec <- rep("N", width) 24 | pos.seq <- seq.int((junction-(width/2-1)), (junction + (width/2))) 25 | in.window <- pos.pileup[ix.id.pileup] %in% pos.seq # pileup rows of this id that fall inside the window 26 | seq.vec[match(pos.pileup[ix.id.pileup][in.window], pos.seq)] <- sequence.pileup[ix.id.pileup][in.window] # place each base at its own window offset; rows outside the window are ignored instead of shifting the sequence 27 | if ((length(ix.junction1)!=0) & (length(ix.junction2)!=0)) { # emit the window only when both 'junction' and 'junction + 1' are present in the pileup 28 | query.sequence <- character(length = 1) 29 | for (ii in 1:length(seq.vec)) { 30 | query.sequence <- paste(query.sequence, seq.vec[ii], sep = "") 31 | } 32 | ids_fasta_query <- paste(">", id.filtered[i],sep = "") 33 | fasta.file <- c(fasta.file, c(ids_fasta_query, query.sequence)) 34 | } 35 | } 36 | cat(fasta.file, sep = "\n", file = file.path(outputfolder, "out",paste(samplename,".checkselfhomology.fa", sep = ""))) 37 | cat(file.path(outputfolder, "out", paste(samplename,".checkselfhomology.fa", sep = "")), file = file.path(outputfolder, "out", ".link")) 38 | -------------------------------------------------------------------------------- /lib/R/CalcBreakpointPositions.R: -------------------------------------------------------------------------------- 1 | ## re-calculate breakpoints positions for samples analysed with ericscript < 0.4.0 2 | ## and re-estimation of ericscore and blacklist 3 | 4 | vars.tmp <- commandArgs() 5 | vars <- vars.tmp[length(vars.tmp)] 6 | split.vars <- unlist(strsplit(vars, ",")) 7 | ericscriptfolder <- as.character(split.vars[1]) 8 | outputfolder <- split.vars [2] 9 | dbfolder <- split.vars [3] 10 | refid <- as.character(split.vars[4]) 11 | genomeref <- as.character(split.vars[5]) 12 | 13 | flag.ada <- require(ada, quietly = T) 14 | if (flag.ada == F) { 15 | require(kernlab, quietly = T) 16 | } 17 | load(file.path(ericscriptfolder, "lib","data", "_resources", "BlackList.RData")) 
18 | load(file.path(ericscriptfolder, "lib","data", "_resources", "DataModel.RData")) 19 | load(file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData")) 20 | 21 | myls <- list.files(outputfolder, pattern = "Summary.RData") 22 | myls.tsv.total <- list.files(outputfolder, pattern = ".results.total.tsv") 23 | myls.tsv.filt <- list.files(outputfolder, pattern = ".results.filtered.tsv") 24 | 25 | if (length(myls) == 1 & length(myls.tsv.total) == 1 & length(myls.tsv.filt)) { 26 | samplename <- gsub(".results.total.tsv", "", myls.tsv.total) 27 | load(file.path(outputfolder, myls)) 28 | cat(paste("[EricScript] Re-estimating EricScore for sample ", samplename, "... ", sep = "")) 29 | 30 | ensgenename1 <- as.character(SummaryMat$EnsemblGene1) 31 | ensgenename2 <- as.character(SummaryMat$EnsemblGene2) 32 | genename1 <- as.character(SummaryMat$GeneName1) 33 | genename2 <- as.character(SummaryMat$GeneName2) 34 | gjs.score <- as.numeric(as.character(SummaryMat$GJS)) 35 | edge.score <- as.numeric(as.character(SummaryMat$ES)) 36 | nreads.score <- as.numeric(as.character(SummaryMat$US)) 37 | cov.score <- as.numeric(as.character(SummaryMat$GeneExpr_Fused)) 38 | myscores <- cbind(gjs.score, edge.score, nreads.score, cov.score) 39 | colnames(myscores) <- c("probs.gjs", "probs.es", "probs.us", "cov") 40 | myscores <- data.frame(myscores) 41 | if (flag.ada) { 42 | myada <- ada(control~., data = DataScores,loss="exponential", nu = 0.1) 43 | ericscore <- as.numeric(predict(myada, myscores, type = "probs")[,2]) 44 | } else { 45 | sig <- sigest(control~., data = DataScores, frac = 1, na.action = na.omit, scaled = TRUE)[2] 46 | model <- ksvm(control~., data = DataScores, type = "C-svc", kernel = "rbfdot", kpar = list(sigma = sig), C = 1, prob.model = TRUE) 47 | ericscore <- predict(model, myscores, type = "probabilities")[,2] 48 | } 49 | 50 | myblacklist <- rep("", length(genename1)) 51 | ix.bl <- which((genename1 %in% gene.bl1 & genename2 %in% gene.bl2) | (genename1 %in% gene.bl2 
& genename2 %in% gene.bl1)) 52 | if (length(ix.bl) > 0) { 53 | for (bli in 1: length(ix.bl)) { 54 | ix.bli <- which((gene.bl1 == genename1[ix.bl[bli]] & gene.bl2 == genename2[ix.bl[bli]]) | (gene.bl2 == genename1[ix.bl[bli]] & gene.bl1 == genename2[ix.bl[bli]])) 55 | myblacklist[ix.bl[bli]] <- paste("Frequency:", sum(freq.bl[ix.bli])) 56 | } 57 | } 58 | 59 | cat("done. \n") 60 | cat(paste("[EricScript] Re-calculating breakpoint positions for sample ", samplename, "... ", sep = "")) 61 | 62 | myseq <- as.character(SummaryMat$JunctionSequence) 63 | left_junction <- substr(myseq, 1, 50) 64 | right_junction <- substr(myseq, 51, 100) 65 | 66 | chr1 <- rep("", length(ensgenename1)) 67 | chr2 <- rep("", length(ensgenename1)) 68 | genestart1 <- rep("", length(ensgenename1)) 69 | genestart2 <- rep("", length(ensgenename1)) 70 | geneend1 <- rep("", length(ensgenename1)) 71 | geneend2 <- rep("", length(ensgenename1)) 72 | strand1 <- rep("", length(ensgenename1)) 73 | strand2 <- rep("", length(ensgenename1)) 74 | 75 | generef <- as.character(EnsemblGene.Structures$EnsemblGene) 76 | chrref <- as.character(EnsemblGene.Structures$Chromosome) 77 | genestartref <- as.character(EnsemblGene.Structures$geneStart) 78 | geneendref <- as.character(EnsemblGene.Structures$geneEnd) 79 | strandref <- as.character(EnsemblGene.Structures$Strand) 80 | for (i in 1: length(ensgenename1)) { 81 | 82 | ix.ref <- which(generef == ensgenename1[i]) 83 | chr1[i] <- chrref[ix.ref] 84 | genestart1[i] <- genestartref[ix.ref] 85 | geneend1[i] <- geneendref[ix.ref] 86 | strand1[i] <- strandref[ix.ref] 87 | 88 | ix.ref <- which(generef == ensgenename2[i]) 89 | chr2[i] <- chrref[ix.ref] 90 | genestart2[i] <- genestartref[ix.ref] 91 | geneend2[i] <- geneendref[ix.ref] 92 | strand2[i] <- strandref[ix.ref] 93 | } 94 | 95 | 96 | # NEW Find GenomicPosition (50nt) 97 | for (i in 1: length(ensgenename1)) { 98 | if (i == 1) { 99 | cat(paste("@", i, "_", 1, "\n", left_junction[i], "\n+\n", gsub(", ", "", 
toString(rep("I", nchar(left_junction[i])))), "\n", "@", i, "_", 2, "\n", right_junction[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) 100 | } else { 101 | cat(paste("@", i, "_", 1, "\n", left_junction[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction[i])))), "\n", "@", i, "_", 2, "\n", right_junction[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) 102 | } 103 | } 104 | system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) 105 | system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) 106 | system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) 107 | xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) 108 | genpos_1 <- rep(0,length(ensgenename1)) 109 | genpos_2 <- rep(0,length(ensgenename1)) 110 | id.pos <- as.character(xx.pos[[1]]) 111 | flag.pos <- as.character(xx.pos[[2]]) 112 | chr.pos <- as.character(xx.pos[[3]]) 113 | if (length(grep("chr", chr.pos)) > 0) { 114 | chr.pos <- gsub("chr", "", chr.pos) 115 | } 116 | pos.pos <- as.numeric(as.character(xx.pos[[4]])) 117 | mapq.pos <- as.numeric(as.character(xx.pos[[5]])) 118 | 119 | for (i in 1: length(ensgenename1)) { 120 | 121 | ## for 5' gene 122 | 
ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) 123 | chr.pos.ix <- chr.pos[ix.mypos] 124 | flag.pos.ix <- flag.pos[ix.mypos] 125 | pos.pos.ix <- pos.pos[ix.mypos] 126 | mapq.pos.ix <- mapq.pos[ix.mypos] 127 | ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) 128 | if (length(ix.okpos) > 1) { 129 | ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] 130 | } 131 | if (length(ix.okpos) > 0) { 132 | if (flag.pos.ix[ix.okpos] == 16) { 133 | genpos_1[i] <- pos.pos.ix[ix.okpos] 134 | } else { 135 | genpos_1[i] <- pos.pos.ix[ix.okpos] + 49 136 | } 137 | } 138 | ## for 3' gene 139 | ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) 140 | chr.pos.ix <- chr.pos[ix.mypos] 141 | flag.pos.ix <- flag.pos[ix.mypos] 142 | pos.pos.ix <- pos.pos[ix.mypos] 143 | mapq.pos.ix <- mapq.pos[ix.mypos] 144 | ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) 145 | if (length(ix.okpos) > 1) { 146 | ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] 147 | } 148 | if (length(ix.okpos) > 0) { 149 | if (flag.pos.ix[ix.okpos] == 16) { 150 | genpos_2[i] <- pos.pos.ix[ix.okpos] + 49 151 | } else { 152 | genpos_2[i] <- pos.pos.ix[ix.okpos] 153 | } 154 | } 155 | } 156 | 157 | # NEW Find GenomicPosition (25nt_1) 158 | ix.na.pos_1 <- which(genpos_1 == 0) 159 | ix.na.pos_2 <- which(genpos_2 == 0) 160 | if (length(ix.na.pos_1 ) > 0 | length(ix.na.pos_2 ) > 0) { 161 | left_junction.trim <- substr(left_junction, 26, 50) 162 | right_junction.trim <- substr(right_junction, 1, 25) 163 | for (i in 1: length(ensgenename1)) { 164 | if (i == 1) { 165 | cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = 
file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) 166 | } else { 167 | cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) 168 | } 169 | } 170 | system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) 171 | system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) 172 | system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) 173 | xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) 174 | id.pos <- as.character(xx.pos[[1]]) 175 | flag.pos <- as.character(xx.pos[[2]]) 176 | chr.pos <- as.character(xx.pos[[3]]) 177 | if (length(grep("chr", chr.pos)) > 0) { 178 | chr.pos <- gsub("chr", "", chr.pos) 179 | } 180 | pos.pos <- as.numeric(as.character(xx.pos[[4]])) 181 | mapq.pos <- as.numeric(as.character(xx.pos[[5]])) 182 | for (i in 1: length(ensgenename1)) { 183 | if (i %in% ix.na.pos_1) { 184 | ## for 5' gene 185 | ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) 186 | chr.pos.ix <- chr.pos[ix.mypos] 187 | flag.pos.ix <- flag.pos[ix.mypos] 188 | pos.pos.ix <- pos.pos[ix.mypos] 189 | mapq.pos.ix <- mapq.pos[ix.mypos] 190 | ix.okpos <- which(chr.pos.ix == 
chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) 191 | if (length(ix.okpos) > 1) { 192 | ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] 193 | } 194 | if (length(ix.okpos) > 0) { 195 | if (flag.pos.ix[ix.okpos] == 16) { 196 | genpos_1[i] <- pos.pos.ix[ix.okpos] 197 | } else { 198 | genpos_1[i] <- pos.pos.ix[ix.okpos] + 24 199 | } 200 | } 201 | } 202 | ## for 3' gene 203 | if (i %in% ix.na.pos_2) { 204 | ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) 205 | chr.pos.ix <- chr.pos[ix.mypos] 206 | flag.pos.ix <- flag.pos[ix.mypos] 207 | pos.pos.ix <- pos.pos[ix.mypos] 208 | mapq.pos.ix <- mapq.pos[ix.mypos] 209 | ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) 210 | if (length(ix.okpos) > 1) { 211 | ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] 212 | } 213 | if (length(ix.okpos) > 0) { 214 | if (flag.pos.ix[ix.okpos] == 16) { 215 | genpos_2[i] <- pos.pos.ix[ix.okpos] + 24 216 | } else { 217 | genpos_2[i] <- pos.pos.ix[ix.okpos] 218 | } 219 | } 220 | } 221 | } 222 | # NEW Find GenomicPosition (25nt_2) 223 | ix.na.pos_1 <- which(genpos_1 == 0) 224 | ix.na.pos_2 <- which(genpos_2 == 0) 225 | left_junction.trim <- substr(left_junction, 1, 25) 226 | right_junction.trim <- substr(right_junction, 26, 50) 227 | for (i in 1: length(ensgenename1)) { 228 | if (i == 1) { 229 | cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = F) 230 | } else { 231 | cat(paste("@", i, "_", 1, "\n", left_junction.trim[i], "\n+\n", gsub(", ", "", toString(rep("I", nchar(left_junction.trim[i])))), "\n", "@", i, "_", 2, "\n", right_junction.trim[i],"\n+\n", gsub(", ", "", 
toString(rep("I", nchar(right_junction.trim[i])))), sep = ""), sep = "\n", file = file.path(outputfolder, "out", "findgenomicpos.fq"), append = T) 232 | } 233 | } 234 | system(paste("bwa aln", "-R 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.sai"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) 235 | system(paste("bwa samse", "-n 50", genomeref, file.path(outputfolder, "out", "findgenomicpos.fq.sai"), file.path(outputfolder, "out", "findgenomicpos.fq"), ">", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "2>>", file.path(outputfolder, "out", ".ericscript.log"))) 236 | system(paste("cat", file.path(outputfolder, "out", "findgenomicpos.fq.tmp"), "|", file.path(ericscriptfolder, "lib", "perl", "xa2multi.pl"), "-", "|","grep -v -e \'^\\@\' -",">", file.path(outputfolder, "out", "findgenomicpos.fq.sam"))) 237 | 238 | 239 | xx.pos <- read.delim(file.path(outputfolder, "out", "findgenomicpos.fq.sam"), sep = "\t", header= F) 240 | id.pos <- as.character(xx.pos[[1]]) 241 | flag.pos <- as.character(xx.pos[[2]]) 242 | chr.pos <- as.character(xx.pos[[3]]) 243 | if (length(grep("chr", chr.pos)) > 0) { 244 | chr.pos <- gsub("chr", "", chr.pos) 245 | } 246 | pos.pos <- as.numeric(as.character(xx.pos[[4]])) 247 | mapq.pos <- as.numeric(as.character(xx.pos[[5]])) 248 | for (i in 1: length(ensgenename1)) { 249 | if (i %in% ix.na.pos_1) { 250 | ## for 5' gene 251 | ix.mypos <- which(id.pos == paste(i, "_1", sep = "")) 252 | chr.pos.ix <- chr.pos[ix.mypos] 253 | flag.pos.ix <- flag.pos[ix.mypos] 254 | pos.pos.ix <- pos.pos[ix.mypos] 255 | mapq.pos.ix <- mapq.pos[ix.mypos] 256 | ix.okpos <- which(chr.pos.ix == chr1[i] & pos.pos.ix >= as.numeric(genestart1[i]) & pos.pos.ix <= as.numeric(geneend1[i])) 257 | if (length(ix.okpos) > 1) { 258 | ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] 259 | } 260 | if (length(ix.okpos) > 0) { 261 | if (flag.pos.ix[ix.okpos] == 16) { 262 | 
genpos_1[i] <- pos.pos.ix[ix.okpos] 263 | } else { 264 | genpos_1[i] <- pos.pos.ix[ix.okpos] + 24 265 | } 266 | } 267 | } 268 | ## for 3' gene 269 | if (i %in% ix.na.pos_2) { 270 | ix.mypos <- which(id.pos == paste(i, "_2", sep = "")) 271 | chr.pos.ix <- chr.pos[ix.mypos] 272 | flag.pos.ix <- flag.pos[ix.mypos] 273 | pos.pos.ix <- pos.pos[ix.mypos] 274 | mapq.pos.ix <- mapq.pos[ix.mypos] 275 | ix.okpos <- which(chr.pos.ix == chr2[i] & pos.pos.ix >= as.numeric(genestart2[i]) & pos.pos.ix <= as.numeric(geneend2[i])) 276 | if (length(ix.okpos) > 1) { 277 | ix.okpos <- ix.okpos[which.max(mapq.pos.ix[ix.okpos])] 278 | } 279 | if (length(ix.okpos) > 0) { 280 | if (flag.pos.ix[ix.okpos] == 16) { 281 | genpos_2[i] <- pos.pos.ix[ix.okpos] + 24 282 | } else { 283 | genpos_2[i] <- pos.pos.ix[ix.okpos] 284 | } 285 | } 286 | } 287 | } 288 | } 289 | 290 | # # refine genomic coordinates 291 | # genpos_1.recal <- genpos_1 292 | # genpos_2.recal <- genpos_2 293 | # for (i in 1: length(ensgenename1)) { 294 | # ix.ref <- which(generef == ensgenename1[i]) 295 | # if (strand1[i] == "+") { 296 | # exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.ref]), ","))) 297 | # } else { 298 | # exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.ref]), ","))) 299 | # } 300 | # ix.exon <- which.min(abs(genpos_1[i] - exonpos)) 301 | # mydiff <- abs(genpos_1[i] - exonpos[ix.exon]) 302 | # if (mydiff <= 3) { 303 | # genpos_1.recal[i] <- exonpos[ix.exon] 304 | # } 305 | # 306 | # ix.ref <- which(generef == ensgenename1[i]) 307 | # if (strand2[i] == "+") { 308 | # exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonStart[ix.ref]), ","))) 309 | # } else { 310 | # exonpos <- as.numeric(unlist(strsplit(as.character(EnsemblGene.Structures$exonEnd[ix.ref]), ","))) 311 | # } 312 | # ix.exon <- which.min(abs(genpos_2[i] - exonpos)) 313 | # mydiff <- abs(genpos_2[i] - exonpos[ix.exon]) 314 | # if (mydiff <= 3) { 315 
| # genpos_2.recal[i] <- exonpos[ix.exon] 316 | # } 317 | # } 318 | # mynames <- names(SummaryMat) 319 | # SummaryMat$Breakpoint1 <- genpos_1.recal 320 | # SummaryMat$Breakpoint2 <- genpos_2.recal 321 | 322 | SummaryMat$Breakpoint1 <- genpos_1 323 | SummaryMat$Breakpoint2 <- genpos_2 324 | SummaryMat$EricScore <- ericscore 325 | SummaryMat$Blacklist <- myblacklist 326 | save(SummaryMat, file=file.path(outputfolder, paste(samplename, ".Summary.recalc.RData", sep = ""))) 327 | n.spanning <- as.numeric(as.character(SummaryMat$spanningreads)) 328 | n.crossing <- as.numeric(as.character(SummaryMat$crossingreads)) 329 | oddity.spanningreads <- rep(0, length(genpos_1)) 330 | oddity.spanningreads[which(n.spanning == 1 & n.crossing >= 10)] <- 1 331 | 332 | if (dim(SummaryMat)[1] > 0) { 333 | write.table(SummaryMat, file = file.path(outputfolder,paste(samplename,".results.recalc.total.tsv", sep = "")), sep = "\t", row.names = F, quote = F) 334 | ix.sorting.score <- sort(ericscore, decreasing = T, index.return = T)$ix 335 | ericscore.sorted <- ericscore[ix.sorting.score] 336 | myblacklist.sorted <- myblacklist[ix.sorting.score] 337 | oddity.spanningreads.sorted <- oddity.spanningreads[ix.sorting.score] 338 | SummaryMat.sorted <- SummaryMat[ix.sorting.score, ] 339 | SummaryMat.Filtered <- SummaryMat.sorted[which(ericscore.sorted > 0.5 & myblacklist.sorted == "" & oddity.spanningreads.sorted == 0), ] 340 | write.table(SummaryMat.Filtered[, -16], file = file.path(outputfolder,paste(samplename,".results.recalc.filtered.tsv", sep = "")), sep = "\t", row.names = F, quote = F) 341 | } 342 | cat("done. \n") 343 | cat(paste("[EricScript] Breakpoint position corrected files are in ", outputfolder, ".\n", sep = "")) 344 | 345 | } else { 346 | cat("[EricScript] No files of results found in ", outputfolder, ". 
Nothing to be done, Exit!\n", sep =" ")
}

--------------------------------------------------------------------------------
/lib/R/CalcStats.R:
--------------------------------------------------------------------------------
### calculate statistics
##
## Summarises the results of one fusion caller over a set of synthetic
## datasets (directories named sim_00001, sim_00002, ...) and saves the
## summary as a list called 'stats' in
## <outputfolder>/<algoname>.<dataset>.<readlength>.stats.RData.

## All parameters arrive packed into the last command-line argument as a
## single comma-separated string.
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
resultsfolder <- split.vars[1]
outputfolder <- split.vars[2]
datafolder <- split.vars[3]
algoname <- split.vars[4]
dataset <- split.vars[5]
readlength <- as.numeric(split.vars[6])
normroc <- as.numeric(split.vars[7])
ericscriptfolder <- as.character(split.vars[8])

source(file.path(ericscriptfolder, "lib", "R", "ImportResults.R"))

trapezint <- function (x, y, a, b) {
  ## function of the ROC package (http://bioconductor.org)
  ## Trapezoidal integral of y over x restricted to [a, b]; the curve is
  ## extended to the interval ends with approx(). Returns NA when fewer than
  ## two distinct x values fall inside [a, b].
  if (length(x) != length(y))
    stop("length x must equal length y")
  y <- y[x >= a & x <= b]
  x <- x[x >= a & x <= b]
  if (length(unique(x)) < 2)
    return(NA)
  ya <- approx(x, y, a, ties = max, rule = 2)$y
  yb <- approx(x, y, b, ties = max, rule = 2)$y
  x <- c(a, x, b)
  y <- c(ya, y, yb)
  h <- diff(x)
  lx <- length(x)
  0.5 * sum(h * (y[-1] + y[-lx]))
}

algonamelist <- c("ericscript", "chimerascan", "defuse", "fusionmap", "shortfuse")
algoname <- tolower(algoname)

## Algorithms outside the supported list are imported with Import_unknown.
if (algoname %in% algonamelist) {
  algoid <- algoname
} else {
  algoid <- "unknown"
}

xx <- list.files(resultsfolder, pattern = "sim_")
nsims <- length(xx)
cat("[EricScript calcstats] Found ", nsims, " synthetic data analysis for algorithm ", algoname,". \n", sep = "")

if (nsims > 0) {
  tpr <- rep(NA, nsims)
  fpr <- rep(NA, nsims)
  tpr.5 <- rep(NA, nsims)
  fpr.5 <- rep(NA, nsims)
  tpr.seq <- rep(NA, nsims)
  refpath <- file.path(datafolder, dataset, "data")
  rocs.tpr <- rep(0, 1000)
  rocs.fpr <- rep(0, 1000)
  nosims <- 0
  ## Allocated on the first analysed simulation (needs the number of simulated
  ## fusions); stays NULL when every simulation is skipped.
  cov.tpr <- NULL
  import.results <- get(paste("Import", algoid, sep = "_"))

  for (i in 1: nsims) {
    ## Simulation directories are zero-padded to five digits. A single
    ## sprintf() replaces the per-magnitude branches, which printed a wrong
    ## label for 1000..9999 and had no branch at all for i >= 100000.
    simname <- sprintf("sim_%05d", i)
    dataresults <- import.results(file.path(resultsfolder, simname))
    if (!is.list(dataresults)) {
      ## A non-list return value is the number of results files that were found.
      if (dataresults == 0) {
        cat("[EricScript calcstats] No results file found for ", simname, ".\n", sep = "")
      } else if (dataresults > 1) {
        cat("[EricScript calcstats] Error: ", dataresults, " results file found for ", simname, ". Only 1 file of results is required. \n", sep = "")
      } else {
        ## Previously this case fell through and reused the data of the
        ## preceding simulation.
        cat("[EricScript calcstats] Error: unexpected import result for ", simname, ". Skipped.\n", sep = "")
      }
      nosims <- nosims + 1
      next
    }
    load(file.path(refpath, simname, "GeneFusions.RData"))
    cat ("[EricScript calcstats] Analysing ", simname," ... ")

    id1.simul <- GeneFusions[[1]]
    id2.simul <- GeneFusions[[2]]
    seq.simul <- GeneFusions[[5]]
    ngenefusions <- length(id1.simul)
    if (is.null(cov.tpr)) {
      cov.tpr <- rep(0, ngenefusions)
    }
    gene1ens <- as.character(dataresults$gene5)
    gene2ens <- as.character(dataresults$gene3)
    nreads <- as.numeric(as.character(dataresults$nreads))
    score <- as.numeric(as.character(dataresults$score))
    if (normroc > 1) {
      score <- score/normroc
    }
    pred.seq <- as.character(dataresults$seq)

    ## A call is a hit when both of its genes occur among the simulated
    ## 5' and 3' partners (the two memberships are tested independently).
    is.hit <- gene1ens %in% id1.simul & gene2ens %in% id2.simul
    ix.tpr <- which(id1.simul %in% gene1ens & id2.simul %in% gene2ens)

    tpr[i] <- length(which(is.hit))/ngenefusions
    cov.tpr[ix.tpr] <- cov.tpr[ix.tpr] + 1

    tpr.5[i] <- length(which(is.hit & nreads > 5))/ngenefusions
    fpr.5[i] <- length(which(!is.hit & nreads > 5))/length(gene1ens)

    fpr[i] <- (length(gene1ens) - tpr[i]*ngenefusions)/length(gene1ens)

    ## Accumulate score histograms (1000 bins) for hits and non-hits.
    rocs.tpr <- rocs.tpr + tabulate(score[which(is.hit)]*1000, nbins = 1000)
    rocs.fpr <- rocs.fpr + tabulate(score[which(!is.hit)]*1000, nbins = 1000)

    ## Fraction of recovered fusions whose reported junction sequence
    ## approximately matches a simulated one.
    ix.correctseq <- c()
    for (ii in seq_along(ix.tpr)) {
      ix.tmp <- which(gene1ens == id1.simul[ix.tpr[ii]] & gene2ens == id2.simul[ix.tpr[ii]])
      if (length(ix.tmp) == 0) {
        ## Both genes were called, but never together in one call.
        next
      }
      myseq <- pred.seq[ix.tmp[1]]
      ## agrep() stops on an NA or empty pattern, so test before calling it.
      if (is.na(myseq) || !nzchar(myseq)) {
        next
      }
      ## NOTE(review): the pattern is compared against the sequences of all
      ## recovered fusions (seq.simul[ix.tpr]), not only fusion ix.tpr[ii] -
      ## confirm this is intended.
      if (length(agrep(toupper(myseq), seq.simul[ix.tpr], max.distance = 5)) > 0) {
        ix.correctseq <- c(ix.correctseq, ix.tpr[ii])
      }
    }
    tpr.seq[i] <- length(ix.correctseq)/length(ix.tpr)

    cat ("done.\n")
  }

  ## Turn the score histograms into threshold curves: entry 1 uses every call,
  ## entry i > 1 uses the calls in bins above i.
  ntot.tpr <- sum(rocs.tpr)
  ntot.fpr <- sum(rocs.fpr)
  sens <- rep(0, 1000)
  spec <- rep(0, 1000)
  for (i in 1: 1000) {
    if (i == 1) {
      sens[i] <- sum(rocs.tpr)/ntot.tpr
      spec[i] <- 1 - sum(rocs.fpr)/ntot.fpr
    } else {
      sens[i] <- sum(rocs.tpr[-c(1:i)])/ntot.tpr
      spec[i] <- 1 - sum(rocs.fpr[-c(1:i)])/ntot.fpr
    }
  }

  stats <- list()
  stats$algorithm <- algoname
  stats$dataset <- dataset
  stats$readlength <- readlength
  stats$totalsims <- nsims
  stats$nsims <- nsims - nosims
  stats$meantpr <- mean(tpr, na.rm = T)
  stats$meanfpr <- mean(fpr, na.rm = T)
  stats$meantpsr <- mean(tpr.seq, na.rm = T)
  ## NOTE(review): sens is passed as x and is non-increasing, so diff(x)
  ## inside trapezint is negative over most of the range - confirm the
  ## orientation of this AUC.
  stats$auc <- trapezint(sens, 1 - spec, 0, 1)
  stats$meantpr5 <- mean(tpr.5, na.rm = T)
  stats$meanfpr5 <- mean(fpr.5, na.rm = T)
  stats$tpr <- tpr
  stats$fpr <- fpr
  stats$tpsr <- tpr.seq
  stats$tpr5 <- tpr.5
  stats$fpr5 <- fpr.5
  stats$scoring_sensitivity <- sens
  stats$scoring_specificity <- spec
  stats$covtpr <- cov.tpr/(nsims-nosims)

  save(stats, file = file.path(outputfolder, paste(algoname, dataset, readlength, "stats","RData", sep = ".")))
} else {
  cat ("[EricScript calcstats] Error: no directories containing results on synthetic data have been found in ", resultsfolder, ". Exit.\n", sep = "")

}

--------------------------------------------------------------------------------
/lib/R/CheckDB.R:
--------------------------------------------------------------------------------
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
ericscriptfolder <- split.vars[1]
refid <- split.vars[2]
dbfolder <- as.character(split.vars[3])

flag.dbexists <- 1

mydbdata.homo <- c("EnsemblGene.Reference.fa", "EnsemblGene.Sequences.RData", "EnsemblGene.GenePosition.RData", "EnsemblGene.Structures.RData", "EnsemblGene.GeneInfo.RData", "EnsemblGene.Paralogs.RData", "EnsemblGene.GeneNames.RData")
mydbdata <- c("EnsemblGene.Reference.fa", "EnsemblGene.Sequences.RData", "EnsemblGene.GenePosition.RData", "EnsemblGene.Structures.RData", "EnsemblGene.GeneInfo.RData","EnsemblGene.GeneNames.RData")

xx <- file.exists(file.path(dbfolder, "data", refid))
if (xx) {
  xx.files <- list.files(file.path(dbfolder, "data", refid))
  if (refid == "homo_sapiens") {
    xx1 <- all( mydbdata.homo %in% xx.files )
  } else {
    xx1 <- all( mydbdata %in% xx.files )
  }
  if (!xx1) {
    flag.dbexists <- 0
    cat("[EricScript] Some required db files were not found for", refid, "genome. Please run \"ericscript.pl --downdb --refid", refid, "\" to solve this.\n")
  }
} else {
  flag.dbexists <- 0
  cat("[EricScript] DB data for", refid, "genome do not exist.
Set correct -db option or run \" ericscript.pl --downdb --refid", refid, "\" to solve this.\n")
}

## check bwa version
##
## The version line printed by the installed bwa is compared with the one
## recorded in .bwa.version. The reference is re-indexed when the two differ,
## or on first use provided that bwa is at least 0.7.4.
bwa.resources <- file.path(ericscriptfolder, "lib", "data", "_resources")
bwa.version.file <- file.path(bwa.resources, ".bwa.version")
bwa.version.tmp <- file.path(bwa.resources, ".bwa.version.tmp")
bwa.reference <- file.path(dbfolder, "data", refid, "EnsemblGene.Reference.fa")

## Writes the "Version" line of the bwa usage message to 'outfile' and returns
## the lines of that file (character(0) when bwa printed no such line).
read.bwa.version <- function(outfile) {
  system(paste("bwa", "2>&1", "|", "grep ersion", ">", shQuote(outfile)))
  scan(outfile, what = "", quiet = T, sep = "\n")
}

## Rebuilds the bwa index of the reference; bwa's output is appended to the
## temporary version file.
rebuild.bwa.index <- function() {
  system(paste("bwa index", shQuote(bwa.reference), "1>>", shQuote(bwa.version.tmp), "2>>", shQuote(bwa.version.tmp)))
}

## Extracts the dotted numeric part of a version line (e.g. "0.7.5" from
## "Version: 0.7.5a-r405"); returns NULL when none is present.
parse.bwa.version <- function(version.line) {
  hit <- regmatches(version.line, regexpr("[0-9]+(\\.[0-9]+)+", version.line))
  if (length(hit) == 0) {
    return(NULL)
  }
  numeric_version(hit[1])
}

yy <- file.exists(bwa.version.file)
curr.version.bwa <- read.bwa.version(bwa.version.tmp)
if (length(curr.version.bwa) == 0) {
  ## No version line at all: comparing it below would stop the script with an
  ## uninformative error.
  flag.dbexists <- 0
  cat("[EricScript] Unable to determine the BWA version. Please check that bwa is in your PATH.\n")
} else if (yy) {
  prev.version.bwa <- scan(bwa.version.file, what = "", quiet = T, sep = "\n")
  ## identical() is safe for empty or multi-line contents, unlike '!='.
  if (!identical(curr.version.bwa, prev.version.bwa)) {
    cat("[EricScript] Updating BWA indexes for", refid,"... ")
    rebuild.bwa.index()
    cat("done.\n")
    cat(curr.version.bwa, file = bwa.version.file)
  }
} else {
  ## Compare as a real version number. Gluing the digits together (the
  ## previous approach) ranks 0.5.10 as 510, i.e. above the 0.7.4 minimum.
  version.tot <- parse.bwa.version(curr.version.bwa[1])
  if (!is.null(version.tot) && version.tot >= "0.7.4") {
    cat("[EricScript] Updating BWA indexes for", refid, "...")
    rebuild.bwa.index()
    cat("done.\n")
    system(paste("bwa", "2>&1", "|", "grep ersion", ">", shQuote(bwa.version.file)))
  } else {
    flag.dbexists <- 0
    cat("[EricScript] BWA version >= 0.7.4 is required. Exit.\n")
  }
}

mydbdata.bwa <- c("EnsemblGene.Reference.fa.bwt", "EnsemblGene.Reference.fa.pac", "EnsemblGene.Reference.fa.ann", "EnsemblGene.Reference.fa.amb", "EnsemblGene.Reference.fa.sa")

if (xx) {
  xx.files.bwa <- list.files(file.path(dbfolder, "data", refid))
  xx1 <- all( mydbdata.bwa %in% xx.files.bwa )
  if (!xx1) {
    flag.dbexists <- 0
    cat("[EricScript] Some required files (bwa indexes) were not found for", refid, "genome. Please run \"ericscript.pl --downdb --refid", refid, "\" to solve this.\n")
  }
}

## The flag is written to a file in the resources folder.
cat(flag.dbexists, file = file.path(ericscriptfolder, "lib", "data", "_resources", ".flag.dbexists"))


--------------------------------------------------------------------------------
/lib/R/CheckSelfHomology.R:
--------------------------------------------------------------------------------
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
samplename <- split.vars [1]
outputfolder <- split.vars[2]
x <- read.delim(file.path(outputfolder,"out",paste(samplename, ".checkselfhomology.blat", sep = "")), sep = "\t", header = F)
query.fa <- readLines(file.path(outputfolder,"out",paste(samplename, ".checkselfhomology.fa", sep = "")))
id.fa <- substr(query.fa[seq(1, length(query.fa), by = 2)], 2, nchar(query.fa[seq(1, length(query.fa), by = 2)]))
id.query.nolabel <- as.character(x[,1])
unique.ids.nolabel <- unique(id.query.nolabel)
ix.fa <-
which(id.fa %in% unique.ids.nolabel)
## Align the query sequences with unique.ids.nolabel by id. Subsetting by
## ix.fa keeps FASTA order, which only coincides with the order of
## unique.ids.nolabel when the alignment table lists queries in FASTA order.
seq.fa <- query.fa[seq(2, length(query.fa), by = 2)][match(unique.ids.nolabel, id.fa)]
id.target.f <- as.character(x[,2])
id.match.f <- as.numeric(as.character(x[,4]))
start.match <- as.numeric(as.character(x[,7]))
end.match <- as.numeric(as.character(x[,8]))
diff.match <- end.match - start.match + 1
ix.junct.nohomology <- rep(0, length(unique.ids.nolabel))
ix.junct.homology <- rep(0, length(unique.ids.nolabel))
flag.dup.a <- rep(0, length(unique.ids.nolabel))
flag.dup.b <- rep(0, length(unique.ids.nolabel))
homology.list <- vector("list", length(unique.ids.nolabel))

## One pass per query junction. A junction is kept when it hits both of its
## own genes, no single own-gene hit covers 80% or more of the non-N width,
## and the hits are not near-identical duplicates of each other.
for (i in seq_along(unique.ids.nolabel)) {
  query <- unique.ids.nolabel[i]
  ix.id <- which(id.query.nolabel == query)
  target <- id.target.f[ix.id]
  match.val <- as.numeric(id.match.f[ix.id])
  ## Query ids look like "<geneA>_...----<geneB>_...".
  query.tmp <- unlist(strsplit(query, "----", fixed = T))
  query_a <- unlist(strsplit(query.tmp[1], "_"))[1]
  query_b <- unlist(strsplit(query.tmp[2], "_"))[1]
  diff.match.tmp <- diff.match[ix.id]
  start.match.tmp <- start.match[ix.id]
  end.match.tmp <- end.match[ix.id]
  ## Usable width: 100 minus the number of N characters in the sequence.
  width <- 100 - length(grep("N", unlist(strsplit(seq.fa[i], ""))))
  ix.c <- seq.int(1,length(target))
  ix.a <- which(target %in% query_a)
  ix.b <- which(target %in% query_b)
  ix.ab <- c(ix.a, ix.b)
  n.a <- length(ix.a)
  n.b <- length(ix.b)

  ## Both partner genes must be hit at least once.
  if (n.a == 0 | n.b == 0) {
    next
  }

  ## With several hits on one gene and a single hit on the other, count the
  ## hits that start or end within 3 positions of the single hit.
  if (n.a > 1 & n.b == 1) {
    myflag <- length(which(start.match.tmp[ix.a] %in% c((start.match.tmp[ix.b]-3):(start.match.tmp[ix.b]+3)))) + length(which(end.match.tmp[ix.a] %in% c((end.match.tmp[ix.b]-3):(end.match.tmp[ix.b]+3))))
  } else if (n.a == 1 & n.b > 1) {
    myflag <- length(which(start.match.tmp[ix.b] %in% c((start.match.tmp[ix.a]-3):(start.match.tmp[ix.a]+3)))) + length(which(end.match.tmp[ix.b] %in% c((end.match.tmp[ix.a]-3):(end.match.tmp[ix.a]+3))))
  } else {
    myflag <- 0
  }
  if (myflag != 0) {
    next
  }

  short.enough <- max(diff.match.tmp[ix.ab]) < round(0.8*width)
  hits.ok <- (length(ix.ab) > 2 & any(diff.match.tmp[ix.ab] < 30)) | (length(ix.ab) == 2)
  if (!(short.enough & hits.ok)) {
    next
  }

  ## Hits on anything other than the two partner genes are homologies.
  ix.c <- ix.c[-ix.ab]
  if (length(ix.c) != 0) {
    unique.ids.homology <- unique(target[ix.c])
    homology.list[[i]] <- vector("list", length(unique.ids.homology))
    for (j in seq_along(unique.ids.homology)) {
      ix.id.homology <- which(target[ix.c] == unique.ids.homology[j])
      max.match <- max(match.val[ix.c][ix.id.homology])
      homology.list[[i]][[j]] <- cbind(unique.ids.homology[j], max.match)
    }
    ix.junct.homology[i] <- 1
  } else {
    ix.junct.nohomology[i] <- 1
  }
  if (n.a > 1) {
    flag.dup.a[i] <- 1
  }
  if (n.b > 1) {
    flag.dup.b[i] <- 1
  }
}

ix.junct <- sort(c(which(ix.junct.nohomology == 1), which(ix.junct.homology == 1)))
if (length(ix.junct) == 0) {
  myflag <- 0
  cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag"))
  stop("No putative gene fusions pass the self-homology filter. Exit!")
}

## Build one text per kept junction listing its homologies as "id (value%)",
## at most 30 of them.
info.homology <- rep("", length(ix.junct))
for (i in seq_along(ix.junct)) {
  list.tmp <- homology.list[[ix.junct[i]]]
  if (is.null(list.tmp)) {
    next
  }
  n.homo <- length(list.tmp)
  info.prefix <- ""
  if (n.homo > 30) {
    n.homo <- 30
    info.prefix <- "More than 30 homologies found: "
  }
  info.items <- vapply(list.tmp[seq_len(n.homo)], function(h) paste(h[1,1]," (",h[1,2],"%)", sep = "" ), character(1))
  info.homology[i] <- paste(info.prefix, paste(info.items, collapse = ", "), sep = "")
}

info.id.and.homology <- cbind(unique.ids.nolabel[ix.junct], info.homology, flag.dup.a[ix.junct], flag.dup.b[ix.junct])
save(info.id.and.homology, file = file.path(outputfolder,"out",paste(samplename,".ids_homology.RData", sep = "")))

--------------------------------------------------------------------------------
/lib/R/ConvertTxt2R.R:
--------------------------------------------------------------------------------
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
ericscriptfolder <- split.vars [1]
refid <- split.vars[2]
dbfolder <- split.vars[3]
tmpfolder <- split.vars[4]

xx <- read.delim(file.path(tmpfolder, "genepos.txt"), sep = "\t", header = F)
chrs <- as.character(xx[[2]])
unique.chrs <- unique(chrs)
geneid <- as.character(xx[[1]])
genepos <- as.numeric(as.character(xx[[3]]))
xx.strand <- read.delim(file.path(tmpfolder, "strand.txt"), sep = "\t", header = F)
strand <- as.character(xx.strand[[2]])
## sorting genes by genomic pos
ix.srt <- rep(NA, dim(xx)[1])
count <- 0
for ( i in 1: length(unique.chrs)) {
  ix.chr <- which(chrs == unique.chrs[i])
  tmp <- sort(genepos[ix.chr],
index.return = T)
  ix.srt[(count + 1):(count + length(ix.chr))] <- ix.chr[tmp$ix]
  count <- count + length(ix.chr)
}

## Gene positions, reordered by chromosome and position.
EnsemblGene.GenePosition <- xx[ix.srt, ]
names(EnsemblGene.GenePosition) <- c("EnsemblGene", "Chromosome", "Position")
save(EnsemblGene.GenePosition, file = file.path(dbfolder, "data", refid, "EnsemblGene.GenePosition.RData"))

## Exon table: one row per exon; the rows of a gene are expected to be
## consecutive.
xx <- read.delim(file.path(tmpfolder, "exonstartend.mrg.txt"), sep = "\t", header = F)

exgeneid <- as.character(xx[[4]])
exchr <- as.character(xx[[1]])
exstart.tmp <- as.numeric(as.character(xx[[2]])) + 1
exend.tmp <- as.character(xx[[3]])
## First exon row of every gene, and the matching last row. The last gene
## ends at the final row of the table.
ix.dup <- which(!duplicated(exgeneid))
ix.last <- c(ix.dup[-1] - 1, length(exgeneid))
exstart <- rep("", length(ix.dup))
exend <- rep("", length(ix.dup))
excount <- rep(NA, length(ix.dup))
start <- rep("", length(ix.dup))
end <- rep("", length(ix.dup))
strand1 <- rep("", length(ix.dup))
## Iterate over every gene. The loop used to stop at length(ix.dup) - 1, so
## the last gene kept empty strand/start/end/exon fields, and a table with a
## single gene failed.
for (i in seq_along(ix.dup)) {
  ## NOTE(review): strand is indexed by gene order in strand.txt - this
  ## presumably matches the gene order of the exon table; confirm.
  if (strand[i] == "-1") {
    strand1[i] <- "-"
  } else {
    strand1[i] <- "+"
  }
  ix.tmp <- ix.dup[i]:ix.last[i]
  start[i] <- exstart.tmp[ix.tmp][1]
  end[i] <- exend.tmp[ix.tmp][length(ix.tmp)]
  exstart[i] <- toString(exstart.tmp[ix.tmp])
  exend[i] <- toString(exend.tmp[ix.tmp])
  excount[i] <- length(ix.tmp)
}

EnsemblGene.Structures <- cbind(exgeneid[ix.dup], exchr[ix.dup], strand1, start, end, excount, exstart, exend)[ix.srt, ]
EnsemblGene.Structures <- data.frame(EnsemblGene.Structures)
names(EnsemblGene.Structures) <- c("EnsemblGene", "Chromosome", "Strand", "geneStart", "geneEnd", "exonCount", "exonStart", "exonEnd")
save(EnsemblGene.Structures, file = file.path(dbfolder, "data", refid, "EnsemblGene.Structures.RData"))

xx <- read.delim(file.path(tmpfolder, "geneinfo.txt"), sep = "\t", header = F)
EnsemblGene.GeneInfo <- xx[ix.srt, ]
names(EnsemblGene.GeneInfo) <- c("EnsemblGene", "GeneName", "Description")
save(EnsemblGene.GeneInfo, file = file.path(dbfolder, "data", refid, "EnsemblGene.GeneInfo.RData"))

if (refid == "homo_sapiens") {
  xx <- read.delim(file.path(tmpfolder, "paralogs.txt"), sep = "\t", header = F)
  EnsemblGene.Paralogs <- xx
  names(EnsemblGene.Paralogs) <- c("EnsemblGene", "Paralogs")
  save(EnsemblGene.Paralogs, file = file.path(dbfolder, "data", refid, "EnsemblGene.Paralogs.RData"))
}

--------------------------------------------------------------------------------
/lib/R/CreateDataEricTheSimulator.R:
--------------------------------------------------------------------------------
## Builds the per-transcript tables (structure, gene name, cDNA sequence) and
## saves them together in EnsemblGene.Transcripts.RData.
options(stringsAsFactors=F)
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
refid <- split.vars[1]
dbfolder <- split.vars[2]
tmpfolder <- split.vars[3]

## read transcript genomic info
xx <- read.delim(file.path(tmpfolder, "transcripts.txt"), sep = "\t", header = F)
geneid <- xx[[1]]
transcriptid <- xx[[2]]
exonstart <- xx[[3]]
exonend <- xx[[4]]
chr <- xx[[5]]
strandtmp <- xx[[6]]
strand <- rep("+", length(strandtmp))
strand[strandtmp == "-1"] <- "-"
## read transcript cdna
xxseq <- scan(file.path(tmpfolder, "transcripts.fa"), what = list(seq="", id=""), sep = "\t", quiet = T)
seqtmp <- xxseq[[1]]
transcriptid.seqtmp <- xxseq[[2]]
rm (xx, xxseq)
unique.transcriptid <- unique(transcriptid)

structure.names <- c("EnsemblGene", "Chromosome", "Strand", "geneStart", "geneEnd", "exonCount", "exonStart", "exonEnd")
## Allocate the table once; growing it with rbind() inside the loop copies the
## whole matrix for every transcript.
EnsemblGene.Structures <- matrix("", nrow = length(unique.transcriptid), ncol = length(structure.names))
GeneNames <- rep("", length(unique.transcriptid))
sequences <- rep("", length(unique.transcriptid))
## First sequence record of every transcript (NA when there is none).
ixseq.first <- match(unique.transcriptid, transcriptid.seqtmp)

for (i in seq_along(unique.transcriptid)) {
  ix <- which(transcriptid == unique.transcriptid[i])
  if (!is.na(ixseq.first[i])) {
    sequences[i] <- seqtmp[ixseq.first[i]]
  } else {
    ## Placeholder for transcripts without a cDNA record.
    sequences[i] <- "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
  }
  ixsrt <- sort(exonstart[ix], index.return = T)$ix
  genestart <- min(c(exonstart[ix], exonend[ix]))
  geneend <- max(c(exonstart[ix], exonend[ix]))
  exonStart <- toString(exonstart[ix[ixsrt]])
  exonEnd <- toString(exonend[ix[ixsrt]])
  exoncount <- length(ix)
  ## Keep only the first distinct value so a transcript listed with more than
  ## one chromosome, strand or gene id cannot widen or shift the row.
  mychr <- unique(chr[ix])[1]
  mystrand <- unique(strand[ix])[1]
  GeneNames[i] <- unique(geneid[ix])[1]
  EnsemblGene.Structures[i, ] <- c(unique.transcriptid[i], mychr, mystrand, genestart, geneend, exoncount, exonStart, exonEnd)
}
colnames(EnsemblGene.Structures) <- structure.names
EnsemblGene.Structures <- data.frame(EnsemblGene.Structures)
save(EnsemblGene.Structures, GeneNames, sequences, file = file.path(dbfolder, "data", refid, "EnsemblGene.Transcripts.RData"))

--------------------------------------------------------------------------------
/lib/R/EstimateSpanningReads.R:
--------------------------------------------------------------------------------
## EstimateSpanningReads v2: exclude soft clipped reads
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
samplename <- split.vars [1]
outputfolder <- split.vars[2]
readlength <- max(as.numeric(split.vars[3]))
load(file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = "")))
load(file.path(outputfolder, "out", paste(samplename, ".ids_fasta.RData", sep = "")))
load(file.path(outputfolder, "out", "isize.RData"))
mysigma <- readlength/4
myref <- sort(dnorm(seq(1, readlength, by = 1), readlength/2, mysigma), decreasing = T)
DataMatrix <- matrix(NA, nrow = length(ids_fasta), ncol = 9)
for (i in 1: length(ids_fasta)) {
  junction.tmp <-
junctions.recalibrated[i] 16 | x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.recal.sorted.rmdup.bam", sep = ""))," ", ids_fasta[i], ":", junction.tmp, "-", junction.tmp + 1, " | awk '($5>0)' | cut -f 1,4,6 > ", file.path(outputfolder,"out",".spanningreads.sam"),sep = "")) 17 | x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.recal.sorted.rmdup.bam", sep = ""))," ", ids_fasta[i]," | awk '($9>0)' | cut -f 9 - > ", file.path(outputfolder,"out",".insertsize.sam"),sep = "")) 18 | x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.recal.sorted.rmdup.bam", sep = ""))," ", ids_fasta[i]," | awk '($4<", junction.tmp, ") && ($8>",junction.tmp+1,")' | cut -f 1,4 - > ", file.path(outputfolder,"out",".crossingreads.sam"),sep = "")) 19 | 20 | spanningreads.tmp <- scan(file.path(outputfolder,"out",".spanningreads.sam"), sep = "\t", what = list("", 1, ""), quiet = T) 21 | ix.nosc <- sort(intersect(grep("^[0-9]*S", spanningreads.tmp[[3]], perl = T, invert = T), grep("[0-9]*S$", spanningreads.tmp[[3]], perl = T, invert = T))) 22 | spanningreads <- vector("list", length = 3) 23 | if (length(ix.nosc) > 0) { 24 | spanningreads[[1]] <- spanningreads.tmp[[1]][ix.nosc] 25 | spanningreads[[2]] <- spanningreads.tmp[[2]][ix.nosc] 26 | spanningreads[[3]] <- spanningreads.tmp[[3]][ix.nosc] 27 | } else { 28 | spanningreads <- spanningreads.tmp 29 | } 30 | crossingreads <- scan(file.path(outputfolder,"out",".crossingreads.sam"), sep = "\t", what = list("", 1), quiet = T) 31 | insert.size <- mean(abs(as.numeric(readLines(file.path(outputfolder,"out",".insertsize.sam"))))) - readlength 32 | id.spanningreads <- spanningreads[[1]] 33 | pos.spanningreads <- spanningreads[[2]] 34 | id.crossingreads <- crossingreads[[1]] 35 | pos.crossingreads <- crossingreads[[2]] 36 | spanning.score <- 0 37 | edge.score <- 0 38 | range.pos.crossingreads <- 0 39 | if 
(length(pos.crossingreads) > 0) { 40 | range.pos.crossingreads <- junction.tmp - min(pos.crossingreads) 41 | } 42 | n.crossingreads <- length(which(id.crossingreads %in% id.spanningreads == F)) 43 | n.spanningreads <- 0 44 | gjs <- 0 45 | unique.score <- 0 46 | us.prob <- 0 47 | insertsize.score <- 0 48 | if (length(pos.spanningreads) > 0) { 49 | pos <- pos.spanningreads - junction.tmp + readlength 50 | us.pos <- tabulate(pos, nbins = readlength) 51 | 52 | if (sum(us.pos) > 0) { 53 | us.mult <- floor(sum(us.pos)/readlength) 54 | us.residuals <- sum(us.pos)/readlength - floor(sum(us.pos)/readlength) 55 | us.refdistr <- rep(us.mult, readlength) 56 | if (us.residuals > 0) { 57 | for (kk in 1: (sum(us.pos) - us.mult*readlength)) { 58 | us.refdistr[kk] <- us.refdistr[kk] + 1 59 | } 60 | } else { 61 | us.refdistr[1] <- us.refdistr[1] + 1 62 | us.refdistr[2] <- us.refdistr[2] - 1 63 | } 64 | 65 | ff <- which(sort(us.pos!=0)) 66 | us.prob <- 1- sum(abs(sort(us.pos)[ff]-sort(us.refdistr)[ff]))/sum(us.pos) 67 | } 68 | 69 | mynorm <- sum(myref[1:length(unique(pos))]) 70 | prob <- sum(dnorm(unique(pos), readlength/2, mysigma)) 71 | gjs <- prob/mynorm 72 | 73 | 74 | left.spanningreads <- pos.spanningreads[(pos.spanningreads <= (junction.tmp - round(readlength/3)))] 75 | right.spanningreads <- pos.spanningreads[(pos.spanningreads > (junction.tmp - round(readlength/3)))] 76 | n.left.spanningreads <- length(left.spanningreads ) 77 | n.right.spanningreads <- length(right.spanningreads) 78 | spanning.score <- 1- abs(n.left.spanningreads - n.right.spanningreads)/(n.left.spanningreads + n.right.spanningreads) 79 | 80 | if (length(left.spanningreads) > 0) { 81 | left.score <- mean((junction.tmp - readlength)-left.spanningreads) 82 | } else { 83 | left.score <- 0 84 | } 85 | if (length(right.spanningreads) > 0) { 86 | right.score <- mean(right.spanningreads - junction.tmp) 87 | } else { 88 | right.score <- 0 89 | } 90 | edge.score <- 1 - 1.1^(mean(c(left.score,right.score))) 91 | 
insertsize.score <- dnorm(insert.size, isize.mean, isize.sd)/dnorm(isize.mean, isize.mean, isize.sd) 92 | n.spanningreads <- length(pos.spanningreads) 93 | 94 | } 95 | 96 | DataMatrix[i,] <- c(ids_fasta[i], n.crossingreads , insert.size, n.spanningreads, range.pos.crossingreads, edge.score, gjs, us.prob, insertsize.score) 97 | 98 | } 99 | colnames(DataMatrix) <- c("id", "nreads","mean_ins_size","nreads_junc", "rangepos", "edgescore", "gjs", "uniformity.score", "isize.score") 100 | save(DataMatrix, file = file.path(outputfolder,"out",paste(samplename, ".DataMatrix.RData", sep = ""))) 101 | 102 | filecon <- file(file.path(outputfolder,"out", paste(samplename, ".intervals", sep = "")), open = "w") 103 | ix.filter <- sort(unique(intersect(which(DataMatrix[,4] > 0), which(DataMatrix[,3] != "NaN")))) 104 | if (length(ix.filter) > 0) { 105 | width <- 100 106 | id.filtered <- ids_fasta[ix.filter] 107 | save(id.filtered, file = file.path(outputfolder, "out",paste(samplename,".ids_filtered.RData", sep = ""))) 108 | for (i in 1:length(ix.filter)) { 109 | ix.ref <- ix.filter[i] 110 | junction <- junctions.recalibrated[ix.ref] 111 | pileup.interval <- seq.int((junction - (width/2 - 1)), (junction + (width/2))) 112 | pileup.interval[which(pileup.interval < 1)] <- 1 113 | pileup.interval <- unique(pileup.interval) 114 | cat(paste(rep(id.filtered[i], length(pileup.interval)), pileup.interval, sep = " "), file = filecon, sep = "\n", append = T) 115 | } 116 | close(filecon) 117 | myflag <- 1 118 | cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) 119 | } else { 120 | myflag <- 0 121 | cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) 122 | stop("No chimeric transcripts found. 
Exit!") 123 | } 124 | 125 | -------------------------------------------------------------------------------- /lib/R/ExtractInsertSize.R: -------------------------------------------------------------------------------- 1 | vars.tmp <- commandArgs() 2 | vars <- vars.tmp[length(vars.tmp)] 3 | split.vars <- unlist(strsplit(vars, ",")) 4 | outputfolder <- split.vars[1] 5 | bwa_aln <- as.numeric(as.character(split.vars[2])) 6 | if (bwa_aln == 0) { 7 | xx <- readLines(file.path(outputfolder, "out", ".ericscript.log")) 8 | ix.isize <- grep("mean and std.dev:", xx) 9 | if (length(ix.isize) > 0) { 10 | isize.tmp <- strsplit(tail(xx[ix.isize], n = 1), ": ")[[1]][2] 11 | isize.tmp1 <- unlist(strsplit(isize.tmp, ",", fixed = T)) 12 | isize.mean <- as.numeric(substr(isize.tmp1[1], 2, nchar(isize.tmp1[1]))) 13 | isize.sd <- as.numeric(substr(isize.tmp1[2], 1, nchar(isize.tmp1[2])-1)) 14 | } else { 15 | isize.mean <- 200 16 | isize.sd <- 40 17 | } 18 | save(isize.mean, isize.sd, file = file.path(outputfolder, "out", "isize.RData")) 19 | } else { 20 | xx <- readLines(file.path(outputfolder, "out", ".ericscript.log")) 21 | ix.isize <- grep("inferred external isize from", xx) 22 | if (length(ix.isize) > 0) { 23 | isize.tmp <- strsplit(tail(xx[ix.isize], n = 1), ": ")[[1]][2] 24 | isize.tmp1 <- unlist(strsplit(isize.tmp, "+/-", fixed = T)) 25 | isize.mean <- as.numeric(isize.tmp1[1]) 26 | isize.sd <- as.numeric(isize.tmp1[2]) 27 | } else { 28 | isize.mean <- 200 29 | isize.sd <- 40 30 | } 31 | save(isize.mean, isize.sd, file = file.path(outputfolder, "out", "isize.RData")) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /lib/R/ImportResults.R: -------------------------------------------------------------------------------- 1 | ### ensgene converter 2 | 3 | toens <- function(ericscriptfolder, genename) { 4 | 5 | load(file.path(ericscriptfolder, "lib", "data", "EnsemblGene.GeneInfo.RData")) 6 | ensgene <- 
as.character(EnsemblGene.GeneInfo$EnsemblGene)[which(as.character(EnsemblGene.GeneInfo$GeneName) == genename)] 7 | if (length(ensgene) == 0) {ensgene <- NA} 8 | return(ensgene) 9 | 10 | } 11 | 12 | 13 | convertToComplement<-function(x) { 14 | 15 | bases=c("A","C","G","T") 16 | xx<-unlist(strsplit(toupper(x), NULL)) 17 | paste(unlist(lapply(xx, function(bbb) { 18 | if(bbb=="A") compString <- "T" 19 | if(bbb=="C") compString <- "G" 20 | if(bbb=="G") compString <- "C" 21 | if(bbb=="T") compString <- "A" 22 | if(!bbb %in% bases) compString <- "N" 23 | return(compString) 24 | })),collapse="") 25 | 26 | } 27 | 28 | ### import results from algorithm''s output 29 | 30 | Import_ericscript <- function(outputpath) { 31 | 32 | filename <- grep(".results.total.tsv", list.files(outputpath), value = T) 33 | 34 | if (length(filename) == 1) { 35 | xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) 36 | gene5 <- as.character(xx$EnsemblGene1) 37 | gene3 <- as.character(xx$EnsemblGene2) 38 | nreads <- as.numeric(as.character(xx$spanningreads)) 39 | score <- as.numeric(as.character(xx$EricScore)) 40 | seq <- as.character(xx$JunctionSequence) 41 | 42 | algout <- list() 43 | algout$gene5 <- gene5 44 | algout$gene3 <- gene3 45 | algout$nreads <- nreads 46 | algout$score <- score 47 | algout$seq <- seq 48 | 49 | return(algout) 50 | } else if (length(filename) == 0) { 51 | 52 | return(0) 53 | 54 | } else if (length(filename) > 1) { 55 | 56 | return(length(filename)) 57 | 58 | } 59 | 60 | 61 | } 62 | 63 | 64 | 65 | Import_defuse <- function(outputpath) { 66 | 67 | filename <- grep(".classify.tsv", list.files(outputpath), value = T) 68 | 69 | if (length(filename) == 1) { 70 | 71 | xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) 72 | gene5 <- as.character(xx$gene1) 73 | gene3 <- as.character(xx$gene2) 74 | nreads <- as.numeric(as.character(xx$splitr_count)) 75 | score <- as.numeric(as.character(xx$probability)) 76 | seq <- rep("", dim(xx)[1]) 77 
| for (seqd in 1: dim(xx)[1]) { 78 | tmp <- unlist(strsplit(as.character(xx$splitr_sequence[seqd]), "|", fixed = T)) 79 | seq[seqd] <- paste(substr(tmp[1], (nchar(tmp[1])-29), nchar(tmp[1])), substr(tmp[2], 1, 30), sep = "") 80 | } 81 | 82 | 83 | algout <- list() 84 | algout$gene5 <- gene5 85 | algout$gene3 <- gene3 86 | algout$nreads <- nreads 87 | algout$score <- score 88 | algout$seq <- seq 89 | 90 | return(algout) 91 | } else if (length(filename) == 0) { 92 | 93 | return(0) 94 | 95 | } else if (length(filename) > 1) { 96 | 97 | return(length(filename)) 98 | 99 | } 100 | 101 | } 102 | 103 | 104 | Import_chimerascan <- function(outputpath) { 105 | 106 | filename <- grep("chimeras.bedpe", list.files(outputpath), value = T) 107 | 108 | if (length(filename) == 1) { 109 | 110 | xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) 111 | gene1tmp <- as.character(xx$genes5p) 112 | gene2tmp <- as.character(xx$genes3p) 113 | nreadstmp <- as.numeric(as.character(xx$total_frags)) 114 | scoretmp <- as.numeric(as.character(xx$score)) 115 | 116 | gene1 <- c() 117 | gene2 <- c() 118 | nreads <- c() 119 | score <- c() 120 | 121 | for (i in 1: length(gene1tmp)) { 122 | 123 | if((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) == 0)) { 124 | 125 | gene1 <- c(gene1, unlist(strsplit(gene1tmp[i], ","))) 126 | myrep <- length(unlist(strsplit(gene1tmp[i], ","))) 127 | gene2 <- c(gene2, rep(gene2tmp[i], myrep)) 128 | nreads <- c(nreads, rep(nreadstmp[i], myrep)) 129 | score <- c(score, rep(scoretmp[i], myrep)) 130 | 131 | } else if ((length(grep(",", gene1tmp[i])) == 0) & (length(grep(",", gene2tmp[i])) > 0)) { 132 | 133 | gene2 <- c(gene2, unlist(strsplit(gene2tmp[i], ","))) 134 | myrep <- length(unlist(strsplit(gene2tmp[i], ","))) 135 | gene1 <- c(gene1, rep(gene1tmp[i], myrep)) 136 | nreads <- c(nreads, rep(nreadstmp[i], myrep)) 137 | score <- c(score, rep(scoretmp[i], myrep)) 138 | 139 | } else if ((length(grep(",", gene1tmp[i])) > 0) & 
(length(grep(",", gene2tmp[i])) > 0)) { 140 | 141 | gene1tmp1 <- unlist(strsplit(gene1tmp[i], ",")) 142 | gene2tmp1 <- unlist(strsplit(gene2tmp[i], ",")) 143 | myrep1 <- length(unlist(strsplit(gene1tmp[i], ","))) 144 | myrep2 <- length(unlist(strsplit(gene2tmp[i], ","))) 145 | 146 | for (j in 1: myrep1) { 147 | 148 | gene1 <- c(gene1, rep(gene1tmp1[j], myrep2)) 149 | gene2 <- c(gene2, gene2tmp1) 150 | nreads <- c(nreads, rep(nreadstmp[i], myrep2)) 151 | score <- c(score, rep(scoretmp[i], myrep2)) 152 | 153 | } 154 | 155 | } else { 156 | 157 | gene1 <- c(gene1, gene1tmp[i]) 158 | gene2 <- c(gene2, gene2tmp[i]) 159 | nreads <- c(nreads, nreadstmp[i]) 160 | score <- c(score, scoretmp[i]) 161 | 162 | } 163 | 164 | } 165 | 166 | seq <- rep(NA, length(gene1)) 167 | gene5 <- rep(NA, length(gene1)) 168 | gene3 <- rep(NA, length(gene1)) 169 | 170 | for (i in 1: length(gene1)) { 171 | 172 | gene5[i] <- toens(ericscriptfolder, gene1[i]) 173 | gene3[i] <- toens(ericscriptfolder, gene2[i]) 174 | 175 | } 176 | 177 | algout <- list() 178 | algout$gene5 <- gene5 179 | algout$gene3 <- gene3 180 | algout$nreads <- nreads 181 | algout$score <- score 182 | algout$seq <- seq 183 | 184 | return(algout) 185 | } else if (length(filename) == 0) { 186 | 187 | return(0) 188 | 189 | } else if (length(filename) > 1) { 190 | 191 | return(length(filename)) 192 | 193 | } 194 | 195 | } 196 | 197 | 198 | 199 | Import_shortfuse <- function(outputpath) { 200 | 201 | filename <- grep("fusion_counts.bedpe", list.files(outputpath), value = T) 202 | 203 | if (length(filename) == 1) { 204 | 205 | xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = F) 206 | gene1tmp <- as.character(xx[, 11]) 207 | gene2tmp <- as.character(xx[, 12]) 208 | nreadstmp <- as.numeric(as.character(xx[, 8])) 209 | scoretmp <- as.numeric(as.character(xx[, 8])) 210 | 211 | gene12 <- paste(gene1tmp, gene2tmp) 212 | ixNOdupgene <- which(duplicated(gene12) == F) 213 | 214 | gene5 <- rep(NA, length(ixNOdupgene)) 215 | 
gene3 <- rep(NA, length(ixNOdupgene)) 216 | 217 | for (i in 1: length(ixNOdupgene)) { 218 | 219 | gene5[i] <- toens(ericscriptfolder, gene1tmp[ixNOdupgene[i]]) 220 | gene3[i] <- toens(ericscriptfolder, gene2tmp[ixNOdupgene[i]]) 221 | 222 | } 223 | 224 | nreads <- nreadstmp[ixNOdupgene] 225 | score <- scoretmp[ixNOdupgene] 226 | seq <- rep(NA, length(nreads)) 227 | 228 | algout <- list() 229 | algout$gene5 <- gene5 230 | algout$gene3 <- gene3 231 | algout$nreads <- nreads 232 | algout$score <- score 233 | algout$seq <- seq 234 | 235 | return(algout) 236 | } else if (length(filename) == 0) { 237 | 238 | return(0) 239 | 240 | } else if (length(filename) > 1) { 241 | 242 | return(length(filename)) 243 | 244 | } 245 | 246 | } 247 | 248 | 249 | 250 | 251 | Import_fusionmap <- function(outputpath) { 252 | 253 | filename <- grep("FusionReport.txt", list.files(outputpath), value = T) 254 | 255 | if (length(filename) == 1) { 256 | 257 | xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) 258 | 259 | transcriptversetmp <- as.character(xx$FusionGene) 260 | genelisttmp <- unlist(strsplit(transcriptversetmp, "->")) 261 | gene1tmp <- genelisttmp[seq(1, length(genelisttmp), by = 2)] 262 | gene2tmp <- genelisttmp[seq(2, length(genelisttmp), by = 2)] 263 | gene1tmp1 <- as.character(xx$KnownGene1) 264 | gene2tmp1 <- as.character(xx$KnownGene2) 265 | seqtmp <- as.character(xx$FusionJunctionSequence) 266 | ix.reverse <- which(gene1tmp %in% gene1tmp1 == F) 267 | for (ii in 1: length(ix.reverse)) { 268 | seqtmp[ix.reverse[ii]] <- convertToComplement(reverse(seqtmp[ix.reverse[ii]])) 269 | } 270 | nreadstmp <- as.numeric(as.character(xx[, 2])) 271 | scoretmp <- as.numeric(as.character(xx[, 2])) 272 | 273 | gene1 <- c() 274 | gene2 <- c() 275 | nreads <- c() 276 | score <- c() 277 | seq <- c() 278 | 279 | for (i in 1: length(gene1tmp)) { 280 | 281 | if((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) == 0)) { 282 | 283 | gene1 <- c(gene1, 
unlist(strsplit(gene1tmp[i], ","))) 284 | myrep <- length(unlist(strsplit(gene1tmp[i], ","))) 285 | gene2 <- c(gene2, rep(gene2tmp[i], myrep)) 286 | nreads <- c(nreads, rep(nreadstmp[i], myrep)) 287 | score <- c(score, rep(scoretmp[i], myrep)) 288 | seq <- c(seq, rep(seqtmp[i], myrep)) 289 | 290 | } else if ((length(grep(",", gene1tmp[i])) == 0) & (length(grep(",", gene2tmp[i])) > 0)) { 291 | 292 | gene2 <- c(gene2, unlist(strsplit(gene2tmp[i], ","))) 293 | myrep <- length(unlist(strsplit(gene2tmp[i], ","))) 294 | gene1 <- c(gene1, rep(gene1tmp[i], myrep)) 295 | nreads <- c(nreads, rep(nreadstmp[i], myrep)) 296 | score <- c(score, rep(scoretmp[i], myrep)) 297 | seq <- c(seq, rep(seqtmp[i], myrep)) 298 | 299 | 300 | } else if ((length(grep(",", gene1tmp[i])) > 0) & (length(grep(",", gene2tmp[i])) > 0)) { 301 | 302 | gene1tmp1 <- unlist(strsplit(gene1tmp[i], ",")) 303 | gene2tmp1 <- unlist(strsplit(gene2tmp[i], ",")) 304 | myrep1 <- length(unlist(strsplit(gene1tmp[i], ","))) 305 | myrep2 <- length(unlist(strsplit(gene2tmp[i], ","))) 306 | 307 | for (j in 1: myrep1) { 308 | 309 | gene1 <- c(gene1, rep(gene1tmp1[j], myrep2)) 310 | gene2 <- c(gene2, gene2tmp1) 311 | nreads <- c(nreads, rep(nreadstmp[i], myrep2)) 312 | score <- c(score, rep(scoretmp[i], myrep2)) 313 | seq <- c(seq, rep(seqtmp[i], myrep2)) 314 | 315 | } 316 | 317 | } else { 318 | 319 | gene1 <- c(gene1, gene1tmp[i]) 320 | gene2 <- c(gene2, gene2tmp[i]) 321 | nreads <- c(nreads, nreadstmp[i]) 322 | score <- c(score, scoretmp[i]) 323 | seq <- c(seq, seqtmp[i]) 324 | 325 | } 326 | 327 | } 328 | 329 | gene5 <- rep(NA, length(gene1)) 330 | gene3 <- rep(NA, length(gene1)) 331 | 332 | for (i in 1: length(gene1)) { 333 | 334 | gene5[i] <- toens(ericscriptfolder, gene1[i]) 335 | gene3[i] <- toens(ericscriptfolder, gene2[i]) 336 | 337 | } 338 | 339 | algout <- list() 340 | algout$gene5 <- gene5 341 | algout$gene3 <- gene3 342 | algout$nreads <- nreads 343 | algout$score <- score 344 | algout$seq <- seq 345 | 346 | 
return(algout) 347 | } else if (length(filename) == 0) { 348 | 349 | return(0) 350 | 351 | } else if (length(filename) > 1){ 352 | 353 | return(length(filename)) 354 | 355 | } 356 | 357 | } 358 | 359 | 360 | 361 | 362 | Import_unknown <- function(outputpath) { 363 | 364 | filename <- grep("ericsim", list.files(outputpath), value = T) 365 | 366 | if (length(filename) == 1) { 367 | 368 | xx <- read.delim(file.path(outputpath, filename), sep = "\t", header = T) 369 | 370 | gene5 <- as.character(xx$gene5) 371 | gene3 <- as.character(xx$gene3) 372 | nreads <- as.numeric(as.character(xx$nread)) 373 | score <- as.numeric(as.character(xx$score)) 374 | seq <- as.character(x$seq) 375 | 376 | algout <- list() 377 | algout$gene5 <- gene5 378 | algout$gene3 <- gene3 379 | algout$nreads <- nreads 380 | algout$score <- score 381 | algout$seq <- seq 382 | 383 | return(algout) 384 | 385 | } else if (length(filename) == 0) { 386 | 387 | return(0) 388 | 389 | } else if (length(filename) > 1) { 390 | 391 | return(length(filename)) 392 | 393 | } 394 | 395 | 396 | 397 | } 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | -------------------------------------------------------------------------------- /lib/R/MakeAdjacencyMatrix.R: -------------------------------------------------------------------------------- 1 | vars.tmp <- commandArgs() 2 | vars <- vars.tmp[length(vars.tmp)] 3 | split.vars <- unlist(strsplit(vars, ",")) 4 | samplename <- split.vars [1] 5 | outputfolder <- split.vars[2] 6 | ericscriptfolder <- split.vars[3] 7 | minreads <- as.numeric(split.vars[4]) 8 | MAPQ <- as.numeric(split.vars[5]) 9 | refid <- as.character(split.vars[6]) 10 | dbfolder <- as.character(split.vars[7]) 11 | 12 | filein <- file.path(outputfolder, "out", paste(samplename, ".filtered.out", sep = "")) 13 | xx <- readLines(filein, n = 1) 14 | if (length(xx) == 0) { 15 | myflag <- 0 16 | cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) 17 | stop("No lines 
available in ",filein,". No discordant reads found with MAPQ set to ", MAPQ, ". Try to decrease MAPQ parameter and run again EricScript. Exit!") 18 | } else { 19 | myflag <- 1 20 | cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) 21 | } 22 | x <- read.delim(filein, sep = "\t", header = F) 23 | load(file.path(dbfolder,"data", refid, "EnsemblGene.GenePosition.RData")) 24 | flag <- x[,1] 25 | #ix.flag <- which((flag > 63 & flag < 70) | (flag > 95 & flag < 118) | flag == 161 | flag == 181) 26 | ix.flag <- which((flag > 63 & flag < 70) | (flag > 95 & flag < 112) | flag == 161) 27 | id_1 <- as.character(x[ix.flag,2]) 28 | id_2 <- as.character(x[ix.flag,5]) 29 | pos_1 <- as.numeric(as.character(x[ix.flag,3])) 30 | pos_2 <- as.numeric(as.character(x[ix.flag,6])) 31 | rm(x) 32 | genename <- as.character(EnsemblGene.GenePosition$EnsemblGene) 33 | id1 <- c() 34 | id2 <- c() 35 | nreads <- c() 36 | diffpos <- c() 37 | generef <- unique(id_1) 38 | for (i in 1: length(generef)) { 39 | ix.generef <- which(genename == generef[i]) 40 | ix.gene <- which(id_1 == generef[i]) 41 | tmp <- sort(summary(as.factor(id_2[ix.gene]), maxsum = length(unique(id_2[ix.gene]))), decreasing = T) 42 | tmp.genename <- names(tmp) 43 | tmp.weight <- as.numeric(tmp) 44 | if ((max(tmp.weight) >= minreads) & (length(which(tmp.weight >= minreads)) <= 10)) { 45 | ix.maxnodes <- which(tmp.weight >= minreads) 46 | tmp.genename <- tmp.genename[ix.maxnodes] 47 | tmp.weight <- tmp.weight[ix.maxnodes] 48 | for (j in 1:length(tmp.weight)) { 49 | ix.genelink <- which(genename == tmp.genename[j]) 50 | if (length(ix.genelink)!=0) { 51 | id1 <- c(id1, generef[i]) 52 | id2 <- c(id2, tmp.genename[j]) 53 | nreads <- c(nreads, tmp.weight[j]) 54 | diffpos <- c(diffpos, abs(ix.generef-ix.genelink)) 55 | } 56 | } 57 | } 58 | 59 | } 60 | ## filter paralogs if paralogs exist 61 | 62 | if (file.exists(file.path(dbfolder,"data", refid, "EnsemblGene.Paralogs.RData"))) { 63 | 64 | load(file.path(dbfolder, 
"data", refid, "EnsemblGene.Paralogs.RData")) 65 | paralogs.flag <- rep(0, length(id1)) 66 | 67 | if (length(id1) == 0) { 68 | myflag <- 0 69 | cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) 70 | stop("No discordant reads found with minimum reads set to ", minreads, ". Exit!") 71 | } 72 | for (i in 1: length(id1)) { 73 | ix.paralogs <- which(EnsemblGene.Paralogs$EnsemblGene == id1[i]) 74 | paralogs <- as.character(EnsemblGene.Paralogs$Paralogs[ix.paralogs]) 75 | if (length(grep(id2[i], paralogs)) > 0) { 76 | paralogs.flag[i] <- 1 77 | } 78 | } 79 | ## 80 | paralogs.filter <- which(paralogs.flag == 0) 81 | id1f <- id1[paralogs.filter] 82 | id2f <- id2[paralogs.filter] 83 | nreadsf <- nreads[paralogs.filter] 84 | diffposf <- diffpos[paralogs.filter] 85 | if (length(id1) == 0) { 86 | myflag <- 0 87 | cat(myflag, file = file.path(outputfolder, "out", ".ericscript.flag")) 88 | stop("No discordant reads found with minimum reads set to ",minreads,". Exit!") 89 | } 90 | nfus <- length(id1f) 91 | MyGF <- vector("list", 6) 92 | names(MyGF) <- c("id1", "id2", "nreads", "pos1", "pos2", "diffpos") 93 | MyGF$id1 <- id1f 94 | MyGF$id2 <- id2f 95 | MyGF$nreads <- nreadsf 96 | MyGF$diffpos <- diffposf 97 | MyGF$pos1 <- vector("list", nfus) 98 | MyGF$pos2 <- vector("list", nfus) 99 | for (i in 1: (nfus)) { 100 | ix.pos <- which((id_1 == id1f[i]) & (id_2 ==id2f[i])) 101 | MyGF$pos1[[i]] <- pos_1[ix.pos] 102 | MyGF$pos2[[i]] <- pos_2[ix.pos] 103 | } 104 | } else { 105 | 106 | nfus <- length(id1) 107 | MyGF <- vector("list", 6) 108 | names(MyGF) <- c("id1", "id2", "nreads", "pos1", "pos2", "diffpos") 109 | MyGF$id1 <- id1 110 | MyGF$id2 <- id2 111 | MyGF$nreads <- nreads 112 | MyGF$diffpos <- diffpos 113 | MyGF$pos1 <- pos_1 114 | MyGF$pos2 <- pos_2 115 | 116 | } 117 | 118 | save(MyGF, file = file.path(outputfolder, "out", paste(samplename, ".chimeric.RData", sep = ""))) 119 | 120 | -------------------------------------------------------------------------------- 
/lib/R/MakeEmptyResults.R:
--------------------------------------------------------------------------------
## MakeEmptyResults.R
## Write a placeholder <samplename>.results.total.tsv containing a single
## "no chimeric transcript" message.
## The last command-line argument is: samplename,outputfolder
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
samplename <- split.vars [1]
outputfolder <- split.vars[2]

Results <- "No Chimeric Transcript found!"
write.table(Results, file = file.path(outputfolder,paste(samplename,".results.total.tsv", sep = "")), sep = "\t", row.names = F, col.names = F, quote = F)

--------------------------------------------------------------------------------
/lib/R/RecalibrateJunctions.R:
--------------------------------------------------------------------------------
## Re-align the reads mapped to each candidate junction sequence with BLAT
## and, where a consistent gap is found around the junction, cut the gap out
## of the reference sequence and move the junction coordinate.
## Writes <sample>.EricScript.junctions.recalibrated.fa plus the
## sequences.recalibrated / Recalibrated.Data / junctions.recalibrated
## RData files under <outputfolder>/out.
## The last command-line argument is:
##   samplename,outputfolder,readlength,verbose
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
samplename <- split.vars [1]
outputfolder <- split.vars[2]
readlength <- as.numeric(split.vars[3])
verbose <- as.numeric(split.vars[4])
## grep filter that drops reads whose MD tag equals the read length.
grep.readlength <- paste("grep", paste(" -v -e MD:Z:", readlength, sep = "", collapse = ""), sep = "")

## Split a sequence string into chunks of `step` characters (FASTA line
## wrapping). Returns a character vector; the last chunk may be shorter.
formatfasta <- function(myfasta, step = 50) {
  totalchar <- nchar(myfasta)
  if (totalchar > step) {
    starts <- seq(1, totalchar, by = step)
    ## pmin() clamps the final chunk. The previous loop dropped the last
    ## base whenever totalchar was 1 more than a multiple of `step`.
    newfasta <- substring(myfasta, starts, pmin(starts + step - 1, totalchar))
  } else {
    newfasta <- substr(myfasta, 1, totalchar)
  }
  return(newfasta)
}

## Run BLAT (-fine) of the query reads against the temporary reference and,
## if that yields no alignment with a gap larger than 3, run it again
## without -fine. Output goes to <outputfolder>/out/.recalibrated.junctions.blat.
## With verbose == 0 BLAT's stdout is appended to .ericscript.log.
TryRecalibration <- function(outputfolder, verbose) {

  ref.fa <- file.path(outputfolder,"out",".tmp.ref.fa")
  query.link <- file.path(outputfolder, "out", ".link")
  blat.out <- file.path(outputfolder, "out",".recalibrated.junctions.blat")
  log.file <- file.path(outputfolder, "out",".ericscript.log")

  run.blat <- function(blat.call) {
    if (verbose == 0) {
      system(paste(blat.call, ref.fa, query.link, blat.out, " 1>> ", log.file))
    } else {
      system(paste(blat.call, ref.fa, query.link, blat.out))
    }
  }

  x <- run.blat("blat -tileSize=8 -fine")
  ## The first 5 lines are skipped as header; more than 5 means at least
  ## one alignment row.
  yy <- readLines(blat.out, n = 6)
  no.hits <- length(yy) <= 5
  if (!no.hits) {
    xx <- read.delim(blat.out, sep = "\t", skip = 5, header = F)
    gapsize <- xx[,8]
  }
  ## `||` short-circuits: `gapsize` only exists when there were hits. The
  ## previous `all(gapsize <= 3) | ...` evaluated it unconditionally.
  if (no.hits || all(gapsize <= 3)) {
    x <- run.blat("blat -tileSize=8")
  }

}

## The loaded files are expected to provide junctions, ids_fasta and
## sequences.fasta (all used below).
load(file.path(outputfolder,"out",paste(samplename,".junctions.RData", sep = "")))
load(file.path(outputfolder,"out",paste(samplename,".ids_fasta.RData", sep = "")))
load(file.path(outputfolder,"out",paste(samplename,".sequences_fasta.RData", sep = "")))
## .link holds the path of the query FASTA handed to BLAT.
cat(file.path(outputfolder,"out",".tmp.query.fa"), file = file.path(outputfolder,"out",".link"))
sequences <- sequences.fasta
recal.left <- rep(0, length(ids_fasta))
recal.right <- rep(0, length(ids_fasta))
count.recal <- rep(0, length(ids_fasta))
count.total <- rep(0, length(ids_fasta))
sequences.recal <- sequences
junctions.recalibrated <- junctions
## seq_along() so that an empty candidate list skips the loop entirely.
for (i in seq_along(ids_fasta)) {
  junction.tmp <- junctions[i]
  ## Dump the reads aligned to this reference as FASTA (>name_flag / sequence).
  x <- system(paste("samtools view ", file.path(outputfolder, "aln", paste(samplename,".remap.sorted.bam", sep = ""))," ", ids_fasta[i], " | ", grep.readlength, " | awk '((($5==0) && ($6==\"*\")) || ($5>=0))' | awk '{ print \">\" $1\"_\"$2,\"\\n\"$10}' > ", file.path(outputfolder,"out",".tmp.query.fa"),sep = ""))
  xx <- readLines(file.path(outputfolder,"out",".tmp.query.fa"), n = 1)
  if (length(xx) != 0) {
    x <- cat(paste(">",ids_fasta[i],sep=""), sequences[i],sep = "\n", file = file.path(outputfolder,"out",".tmp.ref.fa"))
    try.recal <- TryRecalibration(outputfolder, verbose)
    rm(xx)
    yy <- readLines(file.path(outputfolder, "out", ".recalibrated.junctions.blat"), n = 6)
    if (length(yy) > 5) {
      xx <- read.delim(file.path(outputfolder, "out", ".recalibrated.junctions.blat"), sep = "\t", skip = 5, header = F)
      ## Presumably PSL columns: 8 = bases inserted in target, 19 = block
      ## sizes, 21 = target block starts -- TODO confirm against BLAT docs.
      gapsize <- xx[,8]
      gapstarts <- strsplit(as.character(xx[,21]), ",")
      blocksize <- strsplit(as.character(xx[,19]), ",")
      if (any(gapsize > 3)) {
        ix.0 <- which(gapsize > 3)
        if (length(ix.0) > 0) {
          gapstarts1 <- gapstarts[ix.0]
          gapstarts.tmp1 <- c()
          gapstarts.tmp2 <- c()
          for (jgap in 1:length(gapstarts1)) {
            ccc <- as.numeric(gapstarts1[[jgap]])
            gapstarts.tmp1 <- c(gapstarts.tmp1, ccc[1])
            gapstarts.tmp2 <- c(gapstarts.tmp2, ccc[2])
          }
          ## Gapped alignments whose first block starts at or before the
          ## junction and whose second block starts no more than 9 bases
          ## before it.
          ix.gap.in.junct <- ix.0[which((gapstarts.tmp1 <= junction.tmp) & (gapstarts.tmp2 >= (junction.tmp - 10 + 1)))]
          gaps <- gapsize[ix.gap.in.junct]
          unique.gaps <- unique(gaps)
          rr <- tabulate(gaps)
          max.rr <- max(rr)
          if( max.rr >= 1) {
            ## Most frequent gap length among those alignments.
            gap.length <- which.max(rr)
            ix.gaps <- which(gapsize == gap.length)
            a <- rep(0, length(ix.gaps))
            b <- rep(0, length(ix.gaps))
            aa <- rep(0, length(ix.gaps))
            for (jj in 1:length(ix.gaps)) {
              gapstarts.tmp <- as.numeric(gapstarts[[ix.gaps[jj]]])
              blocksize.tmp <- as.numeric(blocksize[[ix.gaps[jj]]])
              ## a: last base of the first block; b: start of the second block.
              a[jj] <- gapstarts.tmp[1] + blocksize.tmp[1] - 1
              b[jj] <- gapstarts.tmp[2]
              aa[jj] <- gapstarts.tmp[1]
            }
            max.a <- max(tabulate(a))
            max.b <- max(tabulate(b))
            my.a <- which.max(tabulate(a))
            my.b <- which.max(tabulate(b))
            ## Recalibrate only when the most frequent left and right gap
            ## edges have similar support.
            if ((abs(max.a-max.b)/max.a) < 0.31) {
              count.total[i] <- length(gaps)
              count.recal[i] <- max.rr
              recal.left[i] <- my.a + 1
              recal.right[i] <- my.b + 1
              ## Remove the gap from the reference sequence.
              sequences.recal[i] <- paste(substr(sequences[i], 1, recal.left[i]), substr(sequences[i], recal.right[i], nchar(sequences[i])), sep = "")
              junctions.recalibrated[i] <- recal.left[i]
            }
          }

        }
      }
    }

  }
}
ids_fasta.recalibrated <- paste(">", ids_fasta, " junction@", junctions.recalibrated, sep = "")
## Header line followed by the wrapped sequence, for every candidate.
ref.recalibrated <- unlist(lapply(seq_along(ids_fasta), function(k) {
  c(ids_fasta.recalibrated[k], formatfasta(sequences.recal[k]))
}))
write(ref.recalibrated, file = file.path(outputfolder,"out",paste(samplename,".EricScript.junctions.recalibrated.fa", sep = "")), ncolumns = 1, sep = "")
Recalibrated.Data <- cbind(recal.left, recal.right, junctions, count.recal, count.total)
colnames(Recalibrated.Data) <- c("Left_Junction", "Right_Junction", "Junction", "Recal_Count", "Total_Count")
save(sequences.recal, file = file.path(outputfolder,"out",paste(samplename,".sequences.recalibrated.RData", sep = "")))
save(Recalibrated.Data, file = file.path(outputfolder,"out",paste(samplename,".Recalibrated.Data.RData", sep = "")))
save(junctions.recalibrated, file = file.path(outputfolder,"out",paste(samplename,".junctions.recalibrated.RData", sep = "")))

--------------------------------------------------------------------------------
/lib/R/SimulateFusions.R:
--------------------------------------------------------------------------------
### simulate data [revised].
###
### Builds synthetic fusion transcripts and simulates paired-end reads from
### them with wgsim. For every simulation run and every requested dataset type
### the script writes, below <outputfolder>/<type>/:
###   data/sim_XXXXX/myrefNNNNN.fa            one FASTA file per fused transcript
###   data/sim_XXXXX/GeneFusions.RData        genes, junctions, junction sequence,
###                                           coverage and transcript ids
###   reads/sim_XXXXX/fusions.reads.{1,2}.fq  reads simulated from the fusions
###   reads/sim_XXXXX/total.reads.{1,2}.fq    fusion + background reads (only when
###                                           background read files are given)
### Dataset types:
###   BE  junctions drawn at random between position 100 and (length - 100)
###   IE  junctions drawn from the cumulative exon lengths of each transcript
###
### The last command-line argument is a comma-separated list of 17 fields, in
### the order parsed below. seqtk and wgsim are invoked through the shell.

vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))

readlength <- as.numeric(split.vars[1])
outputfolder <- split.vars[2]
ericscriptfolder <- split.vars[3]
verbose <- as.numeric(split.vars[4])
ins.size <- as.numeric(split.vars[5])
sd.inssize <- as.numeric(split.vars[6])
ngenefusion <- as.numeric(split.vars[7])
min.coverage <- as.numeric(split.vars[8])
max.coverage <- as.numeric(split.vars[9])
nsims <- as.numeric(split.vars[10])
BE.data <- as.numeric(split.vars[11])
IE.data <- as.numeric(split.vars[12])
background.data_1 <- as.character(split.vars[13])
background.data_2 <- as.character(split.vars[14])
nreads.background <- as.numeric(split.vars[15])
dbfolder <- as.character(split.vars[16])
refid <- as.character(split.vars[17])

## The objects used further down (EnsemblGene.Structures, sequences, GeneNames)
## are not created in this script, so they are expected to come from this file.
genes.file <- file.path(dbfolder, "data", refid, "EnsemblGene.Transcripts.RData")
mysyndata <- file.exists(genes.file)
if (mysyndata) {
  cat("[EricScript simulator] Load genes data ...")
  load(genes.file)
} else {
  cat(paste("[EricScript simulator] You need to download", refid, "data before running EricScript Simulator. Exit.\n"))
  unlink(outputfolder, recursive = TRUE)
  quit()
}

## Background reads are mixed in only when both file names are longer than two
## characters.
## NOTE(review): presumably the caller passes a short placeholder when no
## background data is requested -- confirm against ericscript.pl.
flag.background <- 0
if (isTRUE(nchar(background.data_1) > 2 && nchar(background.data_2) > 2)) {
  flag.background <- 1
}

dataset <- c()
if (BE.data == 1) {
  dataset <- c(dataset, "BE")
}
if (IE.data == 1) {
  dataset <- c(dataset, "IE")
}

## Split one sequence string into FASTA lines of at most `step` characters.
## Returns a character vector with one element per output line.
formatfasta <- function(myfasta, step = 50) {
  totalchar <- nchar(myfasta)
  if (totalchar <= step) {
    return(substr(myfasta, 1, totalchar))
  }
  starts <- seq(1, totalchar, by = step)
  ## The final chunk may be shorter than `step`, down to a single character;
  ## it must still be emitted or the sequence loses its last base.
  substring(myfasta, starts, pmin(starts + step - 1, totalchar))
}

## Quote a path for the shell. "~" is expanded first because the shell does
## not expand it inside quotes.
shell.path <- function(path) {
  shQuote(path.expand(path))
}

## Draw one element of x. Unlike sample(x, 1), a length-one numeric x is not
## reinterpreted as the range 1:x.
pick.one <- function(x) {
  x[sample.int(length(x), 1)]
}

## evaluate n.backgound reads

TranscriptNames <- as.character(EnsemblGene.Structures$EnsemblGene)
acceptable.chrs <- c(seq(1, 22), "X", "Y")
mycoverage <- seq(min.coverage, max.coverage, length.out = ngenefusion)
minlength <- ins.size + 2 * sd.inssize
if (refid == "homo_sapiens") {
  ix.geneok <- which(EnsemblGene.Structures$Chromosome %in% acceptable.chrs)
} else {
  ix.geneok <- seq_along(EnsemblGene.Structures$Chromosome)
}
genenameok <- as.character(EnsemblGene.Structures$EnsemblGene)[ix.geneok]
## Keep transcripts that are long enough, sit on an accepted chromosome and
## have a gene name.
ix.goodseq <- which((nchar(sequences) > 2 * minlength) & (TranscriptNames %in% genenameok) & !is.na(GeneNames))
sequences <- sequences[ix.goodseq]
GeneNames <- GeneNames[ix.goodseq]
TranscriptNames <- TranscriptNames[ix.goodseq]

## Zero-padded five-digit labels for simulation runs and fusions.
formatted.count <- sprintf("%05d", seq_len(nsims))
formatted.count.fusions <- sprintf("%05d", seq_len(ngenefusion))
sim.tags <- paste("sim", formatted.count, sep = "_")

for (ds in dataset) {
  dir.create(file.path(outputfolder, ds))
  for (subdir in c("data", "reads")) {
    dir.create(file.path(outputfolder, ds, subdir))
    for (tag in sim.tags) {
      dir.create(file.path(outputfolder, ds, subdir, tag))
    }
  }
}
cat(" done.\n")

## Exon table used for the IE junctions. Columns are addressed by position
## (1 = identifier, 3 = strand, 7 / 8 = comma-separated exon starts / ends),
## exactly as in the original code.
Gene.Table <- EnsemblGene.Structures
genename.table <- as.character(Gene.Table[, 1])

## Cumulative exon lengths of one transcript; exon order is reversed unless
## the strand is "+".
exon.ends <- function(transcript) {
  ix.genename.table <- which(genename.table == transcript)
  start.exons <- as.numeric(unlist(strsplit(as.character(Gene.Table[ix.genename.table, 7]), ",")))
  end.exons <- as.numeric(unlist(strsplit(as.character(Gene.Table[ix.genename.table, 8]), ",")))
  strand <- as.character(Gene.Table[ix.genename.table, 3])
  if (strand == "+") {
    cumsum(end.exons - start.exons)
  } else {
    cumsum(rev(end.exons - start.exons))
  }
}

## BE junctions: one random position per transcript, between position 100 and
## (length - 100).
random.junctions <- function(seq1, seq2) {
  n <- length(seq1)
  junction1 <- numeric(n)
  junction2 <- numeric(n)
  for (i in seq_len(n)) {
    junction1[i] <- pick.one(seq.int(100, nchar(seq1[i]) - 100))
    junction2[i] <- pick.one(seq.int(100, nchar(seq2[i]) - 100))
  }
  list(junction1 = junction1, junction2 = junction2)
}

## IE junctions: one cumulative exon length per transcript. For the second
## transcript the last value (the full length) is excluded.
exon.junctions <- function(transcripts1, transcripts2) {
  n <- length(transcripts1)
  junction1 <- numeric(n)
  junction2 <- numeric(n)
  for (i in seq_len(n)) {
    tmp <- exon.ends(transcripts1[i])
    junction1[i] <- if (length(tmp) > 1) sample(tmp, 1) else tmp
    tmp <- exon.ends(transcripts2[i])
    tmp <- tmp[-length(tmp)]
    ## NOTE(review): with exactly one remaining boundary the original code
    ## also falls back to 1 instead of using that boundary -- kept as is,
    ## confirm whether this is intended.
    junction2[i] <- if (length(tmp) > 1) sample(tmp, 1) else 1
  }
  list(junction1 = junction1, junction2 = junction2)
}

background.data <- c(background.data_1, background.data_2)
background.reads <- file.path(outputfolder, c("background.reads.1.fq", "background.reads.2.fq"))
wgsim.log <- file.path(outputfolder, "wgsim.log")

## The selection loop below needs 2 * ngenefusion transcripts that all belong
## to different genes; without this check it would never terminate.
if (length(unique(GeneNames)) < 2 * ngenefusion) {
  stop("[EricScript simulator] Not enough distinct genes to build ", ngenefusion, " fusions.")
}

for (jj in seq_len(nsims)) {
  cat("[EricScript simulator] Generating synthetic dataset", formatted.count[jj], "...")

  if (flag.background == 1) {
    ## Both mate files are subsampled with the same seed.
    myrandomseed <- round(runif(1, 1, 1000))
    for (k in 1:2) {
      system(paste("seqtk sample -s", myrandomseed, " ", shell.path(background.data[k]), " ",
                   format(nreads.background, scientific = FALSE), " > ", shell.path(background.reads[k]), sep = ""))
    }
  }

  ## Draw transcript pairs until all 2 * ngenefusion transcripts are distinct
  ## and belong to distinct genes.
  repeat {
    trans1 <- sample(TranscriptNames, ngenefusion)
    trans2 <- sample(TranscriptNames, ngenefusion)
    ix.gene1 <- match(trans1, TranscriptNames)
    ix.gene2 <- match(trans2, TranscriptNames)
    ix.gene12 <- c(ix.gene1, ix.gene2)
    if (length(unique(ix.gene12)) == 2 * ngenefusion &&
        length(unique(GeneNames[ix.gene12])) == 2 * ngenefusion) {
      break
    }
  }
  gene1 <- GeneNames[ix.gene1]
  gene2 <- GeneNames[ix.gene2]
  sequence1 <- sequences[ix.gene1]
  sequence2 <- sequences[ix.gene2]

  ## BE is processed before IE, as in the original script.
  for (ds in intersect(c("BE", "IE"), dataset)) {

    if (ds == "BE") {
      junctions <- random.junctions(sequence1, sequence2)
    } else {
      junctions <- exon.junctions(trans1, trans2)
    }
    junction1 <- junctions$junction1
    junction2 <- junctions$junction2

    data.dir <- file.path(outputfolder, ds, "data", sim.tags[jj])
    reads.dir <- file.path(outputfolder, ds, "reads", sim.tags[jj])

    ## Fused transcript: head of transcript 1 up to its junction, followed by
    ## transcript 2 from just after its junction. The "50bp" variant keeps
    ## only 50 bases on either side of the junction.
    sequence.fusions <- paste(substr(sequence1, 1, junction1),
                              substr(sequence2, junction2 + 1, nchar(sequence2)), sep = "")
    sequence.fusions.50bp <- paste(substr(sequence1, junction1 - 49, junction1),
                                   substr(sequence2, junction2 + 1, junction2 + 50), sep = "")
    id.fusions <- paste(">", gene1, "----", gene2, sep = "")
    ref.files <- file.path(data.dir, paste("myref", formatted.count.fusions, ".fa", sep = ""))
    for (i in seq_len(ngenefusion)) {
      cat(c(id.fusions[i], formatfasta(sequence.fusions[i])), file = ref.files[i], sep = "\n")
    }

    GeneFusions <- list(gene1 = gene1, gene2 = gene2,
                        junction1 = junction1, junction2 = junction2,
                        junctionseq = sequence.fusions.50bp, coverage = mycoverage,
                        trans1 = trans1, trans2 = trans2)
    save(GeneFusions, file = file.path(data.dir, "GeneFusions.RData"))

    out.reads <- file.path(data.dir, c("out.reads.1.fq", "out.reads.2.fq"))
    fusion.reads <- file.path(reads.dir, c("fusions.reads.1.fq", "fusions.reads.2.fq"))
    ## Start from empty read files; wgsim output is appended fusion by fusion.
    file.create(fusion.reads)

    for (i in seq_len(ngenefusion)) {
      ## Number of read pairs needed to reach the requested coverage.
      mynreads <- round(mycoverage[i] * nchar(sequence.fusions[i]) / (2 * readlength))
      ## format() keeps the count out of scientific notation ("1e+05").
      wgsim.cmd <- paste("wgsim -d ", ins.size, " -r 0.0001 -R 0.001 -s ", sd.inssize,
                         " -N ", format(mynreads, scientific = FALSE),
                         " -1 ", readlength, " -2 ", readlength, " ",
                         shell.path(ref.files[i]), " ",
                         shell.path(out.reads[1]), " ", shell.path(out.reads[2]), sep = "")
      if (verbose == 0) {
        ## Quiet mode: wgsim output goes to a scratch log that is removed at
        ## the end of the run.
        wgsim.cmd <- paste(wgsim.cmd, " 2>> ", shell.path(wgsim.log), " 1>> ", shell.path(wgsim.log), sep = "")
      }
      system(wgsim.cmd)
      file.append(fusion.reads, out.reads)
    }

    if (flag.background == 1) {
      total.reads <- file.path(reads.dir, c("total.reads.1.fq", "total.reads.2.fq"))
      file.copy(fusion.reads, total.reads, overwrite = TRUE)
      file.append(total.reads, background.reads)
    }
    unlink(out.reads)
  }

  if (flag.background == 1) {
    unlink(background.reads)
  }
  ## The log only exists in quiet mode; unlink() ignores a missing file.
  unlink(wgsim.log)
  cat(" done. \n")
}

--------------------------------------------------------------------------------
/lib/R/UpdateDB.R:
--------------------------------------------------------------------------------
## Rebuild every installed database when the update flag is set.
## The last command-line argument is "<ericscriptfolder>,<dbfolder>".
vars.tmp <- commandArgs()
vars <- vars.tmp[length(vars.tmp)]
split.vars <- unlist(strsplit(vars, ","))
ericscriptfolder <- split.vars[1]
dbfolder <- split.vars[2]

## Installed databases are the entries of <dbfolder>/data other than
## "_resources". setdiff() also copes with "_resources" being absent, where
## negative indexing with an empty which() would drop every entry.
mydblist <- setdiff(list.files(file.path(dbfolder, "data")), "_resources")
if (length(mydblist) > 0) {
  flag <- scan(file.path(ericscriptfolder, "lib", "data", "_resources", ".flag.updatedb"), what = "numeric", quiet = T)
  if (flag == 0) {
    cat("[EricScript] Nothing to update. Exit.\n", sep = "")
  } else {
    cat("[EricScript] Found a new release of Ensembl Gene. Updating database for ", toString(mydblist), ".\n", sep = "")
    for (refid in mydblist) {
      ## BuildSeq.sh reads the database folder as its third argument.
      ## NOTE(review): it also reads an Ensembl version as fourth argument,
      ## which is still not supplied here -- confirm the intended value.
      system(paste("sh", file.path(ericscriptfolder, "lib", "bash", "BuildSeq.sh"), ericscriptfolder, refid, dbfolder))
    }
  }
} else {
  cat("[EricScript] No database was found in ", file.path(dbfolder, "data"), ". Please run ericscript.pl --downdb to download your databases.\n", sep = "")
}

-------------------------------------------------------------------------------- /lib/bash/BuildSeq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ericscriptfolder=$1 3 | refid=$2 4 | dbfolder=$3 5 | ensversion=$4 6 | myrandomn=$RANDOM 7 | tmpfolder=$dbfolder/".tmp_"$myrandomn 8 | mkdir $tmpfolder 9 | printf "[EricScript] Downloading $refid data. 
This process may take from few minutes to few hours depending on the selected genome ..." 10 | R --slave --args $ericscriptfolder,$refid,$tmpfolder,$ensversion < $ericscriptfolder/lib/R/DownloadDB.R 11 | flagrefid=`cat $tmpfolder/.refid.flag` 12 | if [ $flagrefid -eq 1 ] 13 | then 14 | bedtools sort -i $tmpfolder/exonstartend.txt | bedtools merge -c 4 -o collapse -i - | cut -d ',' -f1 - | awk '{print $4"\t"($2-1)"\t"$3"\t"$1}' - > $tmpfolder/exonstartend.mrg.txt 15 | seqtk subseq $tmpfolder/seq.fa.gz $tmpfolder/exonstartend.mrg.txt > $tmpfolder/subseq.fa 16 | printf "done.\n" 17 | printf "[EricScript] Creating database for $refid ..." 18 | R --slave --args $ericscriptfolder,$refid,$dbfolder,$tmpfolder < $ericscriptfolder/lib/R/BuildExonUnionModel.R 19 | R --slave --args $ericscriptfolder,$refid,$dbfolder,$tmpfolder < $ericscriptfolder/lib/R/ConvertTxt2R.R 20 | R --slave --args $refid,$dbfolder,$tmpfolder < $ericscriptfolder/lib/R/CreateDataEricTheSimulator.R 21 | if [ $refid == "homo_sapiens" ] 22 | then 23 | seqtk subseq -l 50 $tmpfolder/seq.fa.gz $tmpfolder/chrlist > $dbfolder/data/$refid/allseq.fa 24 | else 25 | gunzip -c -d $tmpfolder/seq.fa.gz > $dbfolder/data/$refid/allseq.fa 26 | fi 27 | printf "done.\n" 28 | printf "[EricScript] Building reference indexes with BWA for transcriptome and genome ..." 29 | bwa index $dbfolder/data/$refid/allseq.fa 1>> $tmpfolder/.tmp.log 2>> $tmpfolder/.tmp.log 30 | bwa index $dbfolder/data/$refid/EnsemblGene.Reference.fa 1>> $tmpfolder/.tmp.log 2>> $tmpfolder/.tmp.log 31 | printf "done.\n" 32 | fi 33 | printf "[EricScript] Removing temporary files ..." 
34 | rm -r $tmpfolder 35 | printf "done.\n" 36 | -------------------------------------------------------------------------------- /lib/bash/Ftp2Ensembl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ericscriptfolder=$1 4 | ensversion=$2 5 | if [ $ensversion -eq 0 ]; then 6 | fasta_path="current_fasta/" 7 | else 8 | fasta_path="release-"$ensversion"/fasta/" 9 | fi 10 | ftp -vi ftp.ensembl.org >> ~/.ericscript.log 2>&1 < $outputfolder/aln/"$samplename"_1.sai 2>> $outputfolder/out/.ericscript.log 36 | bwa aln -R 5 -t $nthreads $myref $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename"_2.sai 2>> $outputfolder/out/.ericscript.log 37 | fi 38 | if [ $MAPQ -gt 0 ]; then 39 | if [ $bwa_aln -eq 1 ]; then 40 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log 41 | else 42 | bwa mem -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log 43 | fi 44 | else 45 | if [ $bwa_aln -eq 1 ]; then 46 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 47 | else 48 | bwa mem -Y -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 49 | fi 50 | cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam 51 | fi 52 | else 53 | printf "[EricScript] Aligning with bwa ...\n" 54 | if [ $bwa_aln -eq 1 ]; 
then 55 | bwa aln -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed > $outputfolder/aln/"$samplename"_1.sai 56 | bwa aln -t $nthreads $myref $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename"_2.sai 57 | fi 58 | if [ $MAPQ -gt 0 ]; then 59 | if [ $bwa_aln -eq 1 ]; then 60 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam 61 | else 62 | bwa mem -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed > $outputfolder/aln/"$samplename".sam 63 | fi 64 | else 65 | if [ $bwa_aln -eq 1 ]; then 66 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam 67 | else 68 | bwa mem -Y -t $nthreads $myref $outputfolder/aln/$samplename.1.fq.trimmed $outputfolder/aln/$samplename.2.fq.trimmed | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam 69 | fi 70 | fi 71 | fi 72 | else 73 | if [ $ntrim -ge $readlength ]; then 74 | printf "[EricScript] Selected trimming value is greater equal to read length. Reads will not be trimmed.\n" 75 | fi 76 | if [ $verbose -eq 0 ]; then 77 | printf "[EricScript] Aligning with bwa ..." 
78 | if [ $bwa_aln -eq 1 ]; then 79 | bwa aln -R 5 -t $nthreads $myref $reads_1 > $outputfolder/aln/"$samplename"_1.sai 2>> $outputfolder/out/.ericscript.log 80 | bwa aln -R 5 -t $nthreads $myref $reads_2 > $outputfolder/aln/"$samplename"_2.sai 2>> $outputfolder/out/.ericscript.log 81 | fi 82 | if [ $MAPQ -gt 0 ]; then 83 | if [ $bwa_aln -eq 1 ]; then 84 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log 85 | else 86 | bwa mem -t $nthreads $myref $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam 2>> $outputfolder/out/.ericscript.log 87 | fi 88 | else 89 | if [ $bwa_aln -eq 1 ]; then 90 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 91 | else 92 | bwa mem -Y -t $nthreads $myref $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 93 | fi 94 | cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam 95 | fi 96 | else 97 | printf "[EricScript] Aligning with bwa ...\n" 98 | if [ $bwa_aln -eq 1 ]; then 99 | bwa aln -t $nthreads $myref $reads_1 > $outputfolder/aln/"$samplename"_1.sai 100 | bwa aln -t $nthreads $myref $reads_2 > $outputfolder/aln/"$samplename"_2.sai 101 | fi 102 | if [ $MAPQ -gt 0 ]; then 103 | if [ $bwa_aln -eq 1 ]; then 104 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam 105 | else 106 | bwa mem -t $nthreads $myref $reads_1 $reads_2 > $outputfolder/aln/"$samplename".sam 107 | fi 108 | else 109 | if [ $bwa_aln -eq 1 ]; then 110 | bwa sampe -P -c 0.001 $myref $outputfolder/aln/"$samplename"_1.sai $outputfolder/aln/"$samplename"_2.sai $reads_1 $reads_2 | 
$ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam 111 | else 112 | bwa mem -Y -t $nthreads $myref $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/"$samplename".sam 113 | fi 114 | fi 115 | fi 116 | fi 117 | printf "done. \n" 118 | R --slave --args $outputfolder,$bwa_aln < $ericscriptfolder/lib/R/ExtractInsertSize.R 119 | printf "[EricScript] Extracting discordant alignments ... " 120 | grep -v '^@' $outputfolder/aln/"$samplename".sam | awk -v mapq="$MAPQ" '(($7!="=") && ($7!="*") && ($5>=mapq)) { print }' | cut -f2,3,4,5,7,8 > $outputfolder/out/"$samplename".filtered.out 121 | R --slave --args $samplename,$outputfolder,$ericscriptfolder,$minreads,$MAPQ,$refid,$dbfolder < $ericscriptfolder/lib/R/MakeAdjacencyMatrix.R 122 | myflag=`cat $outputfolder/out/.ericscript.flag` 123 | if [ $myflag -eq 0 ]; then 124 | printf "done. \n" 125 | printf "[EricScript] No chimeric transcripts found! Writing results ..." 126 | R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/MakeEmptyResults.R 127 | printf "done. \n" 128 | exit 1 129 | fi 130 | printf "done. \n" 131 | printf "[EricScript] Building exon junction reference ... " 132 | R --slave --args $samplename,$outputfolder,$ericscriptfolder,$readlength,$refid,$dbfolder < $ericscriptfolder/lib/R/BuildFasta.R 133 | printf "done. \n" 134 | ## Aligning to putative junction reference 135 | if [ $verbose -eq 0 ]; then 136 | printf "[EricScript] Aligning to exon junction reference ... 
" 137 | bwa index $mynewref 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 138 | if [ $bwa_aln -eq 1 ]; then 139 | bwa aln -t $nthreads $mynewref $reads_1 > $outputfolder/aln/"$samplename"_1.remap.sai 2>> $outputfolder/out/.ericscript.log 140 | bwa aln -t $nthreads $mynewref $reads_2 > $outputfolder/aln/"$samplename"_2.remap.sai 2>> $outputfolder/out/.ericscript.log 141 | fi 142 | if [ $MAPQ -gt 0 ]; then 143 | if [ $bwa_aln -eq 1 ]; then 144 | bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam 2>> $outputfolder/out/.ericscript.log 145 | else 146 | bwa mem -t $nthreads $mynewref $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam 2>> $outputfolder/out/.ericscript.log 147 | fi 148 | else 149 | if [ $bwa_aln -eq 1 ]; then 150 | bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 151 | else 152 | bwa mem -Y -t $nthreads $mynewref $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 153 | fi 154 | cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.sam 155 | fi 156 | samtools view -@ $nthreads -bS -o $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sam 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 157 | samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sorted 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 158 | samtools index $outputfolder/aln/$samplename.remap.sorted.bam 1>> $outputfolder/out/.ericscript.log 159 | else 160 | printf "[EricScript] Aligning to exon junction reference ... 
\n" 161 | bwa index $mynewref 162 | if [ $bwa_aln -eq 1 ]; then 163 | bwa aln -t $nthreads $mynewref $reads_1 > $outputfolder/aln/"$samplename"_1.remap.sai 164 | bwa aln -t $nthreads $mynewref $reads_2 > $outputfolder/aln/"$samplename"_2.remap.sai 165 | fi 166 | if [ $MAPQ -gt 0 ]; then 167 | if [ $bwa_aln -eq 1 ]; then 168 | bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam 169 | else 170 | bwa mem -t $nthreads $mynewref $reads_1 $reads_2 > $outputfolder/aln/$samplename.remap.sam 171 | fi 172 | else 173 | if [ $bwa_aln -eq 1 ]; then 174 | bwa sampe -P $mynewref $outputfolder/aln/"$samplename"_1.remap.sai $outputfolder/aln/"$samplename"_2.remap.sai $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.sam 175 | else 176 | bwa mem -Y -t $nthreads $mynewref $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.sam 177 | fi 178 | fi 179 | samtools view -@ $nthreads -bS -o $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sam 180 | samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.bam $outputfolder/aln/$samplename.remap.sorted 181 | samtools index $outputfolder/aln/$samplename.remap.sorted.bam 182 | fi 183 | printf "done. \n" 184 | ## Recalibrating junctions 185 | printf "[EricScript] Recalibrating junctions ... " 186 | R --slave --args $samplename,$outputfolder,$readlength,$verbose < $ericscriptfolder/lib/R/RecalibrateJunctions.R 187 | cat $outputfolder/out/$samplename.EricScript.junctions.recalibrated.fa $myref > $mynewref_recal 188 | printf "done. \n" 189 | ## Aligning not properly mapped reads 190 | if [ $verbose -eq 0 ]; then 191 | printf "[EricScript] Aligning to recalibrated junction reference ... 
" 192 | bwa index $mynewref_recal 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 193 | if [ $bwa_aln -eq 1 ]; then 194 | bwa aln -R 5 -t $nthreads $mynewref_recal $reads_1 > $outputfolder/aln/"$samplename"_1.remap.recal.sai 2>> $outputfolder/out/.ericscript.log 195 | bwa aln -R 5 -t $nthreads $mynewref_recal $reads_2 > $outputfolder/aln/"$samplename"_2.remap.recal.sai 2>> $outputfolder/out/.ericscript.log 196 | bwa sampe -P $mynewref_recal $outputfolder/aln/"$samplename"_1.remap.recal.sai $outputfolder/aln/"$samplename"_2.remap.recal.sai $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 197 | else 198 | bwa mem -Y -t $nthreads $mynewref_recal $reads_1 $reads_2 > $outputfolder/aln/tmp.sam 2>> $outputfolder/out/.ericscript.log 199 | fi 200 | cat $outputfolder/aln/tmp.sam | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.recal.sam 201 | samtools view -@ $nthreads -bt $mynewref_recal -o $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sam 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 202 | samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sorted 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 203 | samtools rmdup $outputfolder/aln/$samplename.remap.recal.sorted.bam $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam 1>> $outputfolder/out/.ericscript.log 2>> $outputfolder/out/.ericscript.log 204 | samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam 1>> $outputfolder/out/.ericscript.log 205 | samtools view -@ $nthreads -b -h -q 1 $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam 206 | samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam 207 | else 208 | printf "[EricScript] Aligning to recalibrated 
junction reference ... \n" 209 | bwa index $mynewref_recal 210 | if [ $bwa_aln -eq 1 ]; then 211 | bwa aln -R 5 -t $nthreads $mynewref_recal $reads_1 > $outputfolder/aln/"$samplename"_1.remap.recal.sai 212 | bwa aln -R 5 -t $nthreads $mynewref_recal $reads_2 > $outputfolder/aln/"$samplename"_2.remap.recal.sai 213 | bwa sampe -P $mynewref_recal $outputfolder/aln/"$samplename"_1.remap.recal.sai $outputfolder/aln/"$samplename"_2.remap.recal.sai $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.recal.sam 214 | else 215 | bwa mem -Y -t $nthreads $mynewref_recal $reads_1 $reads_2 | $ericscriptfolder/lib/perl/xa2multi.pl > $outputfolder/aln/$samplename.remap.recal.sam 216 | fi 217 | samtools view -@ $nthreads -bt $mynewref_recal -o $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sam 218 | samtools sort -@ $nthreads $outputfolder/aln/$samplename.remap.recal.bam $outputfolder/aln/$samplename.remap.recal.sorted 219 | samtools rmdup $outputfolder/aln/$samplename.remap.recal.sorted.bam $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam 220 | samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam 221 | samtools view -@ $nthreads -b -h -q 1 $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam 222 | samtools index $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam 223 | fi 224 | printf "done. \n" 225 | samtools idxstats $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.q1.bam > $outputfolder/out/$samplename.stats 226 | rm $outputfolder/aln/*.sam 227 | ## Estimating spanning reads 228 | printf "[EricScript] Scoring candidate fusions ..." 229 | R --slave --args $samplename,$outputfolder,$readlength < $ericscriptfolder/lib/R/EstimateSpanningReads.R 230 | myflag=`cat $outputfolder/out/.ericscript.flag` 231 | if [ $myflag -eq 0 ]; then 232 | printf "done. 
\n" 233 | printf "[EricScript] No chimeric transcripts found! Writing results ..." 234 | R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/MakeEmptyResults.R 235 | printf "done. \n" 236 | exit 1 237 | fi 238 | printf "done. \n" 239 | printf "[EricScript] Filtering candidate fusions ..." 240 | if [ $verbose -eq 0 ]; then 241 | samtools mpileup -A -f $mynewref_recal -l $outputfolder/out/$samplename.intervals $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/out/$samplename.remap.recal.sorted.rmdup.pileup 2>> $outputfolder/out/.ericscript.log 242 | else 243 | samtools mpileup -A -f $mynewref_recal -l $outputfolder/out/$samplename.intervals $outputfolder/aln/$samplename.remap.recal.sorted.rmdup.bam > $outputfolder/out/$samplename.remap.recal.sorted.rmdup.pileup 244 | fi 245 | cut -f1,2,3 $outputfolder/out/$samplename.remap.recal.sorted.rmdup.pileup | grep -e '[0-9]----[a-z | A-Z]' - > $outputfolder/out/$samplename.intervals.pileup 246 | R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/BuildNeighbourhoodSequences.R 247 | if [ $verbose -eq 0 ]; then 248 | blat $myref $outputfolder/out/.link $outputfolder/out/$samplename.checkselfhomology.blat -out=blast8 1>> $outputfolder/out/.ericscript.log 249 | else 250 | blat $myref $outputfolder/out/.link $outputfolder/out/$samplename.checkselfhomology.blat -out=blast8 251 | fi 252 | R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/CheckSelfHomology.R 253 | myflag=`cat $outputfolder/out/.ericscript.flag` 254 | if [ $myflag -eq 0 ]; then 255 | printf "done. \n" 256 | printf "[EricScript] No chimeric transcripts found! Writing results ..." 257 | R --slave --args $samplename,$outputfolder < $ericscriptfolder/lib/R/MakeEmptyResults.R 258 | printf "done. \n" 259 | exit 1 260 | fi 261 | printf "done. \n" 262 | ## Writing results 263 | if [ $verbose -eq 0 ]; then 264 | printf "[EricScript] Writing results ... 
" 265 | else 266 | printf "[EricScript] Writing results ... \n" 267 | fi 268 | R --slave --args $samplename,$outputfolder,$ericscriptfolder,$readlength,$verbose,$refid,$dbfolder < $ericscriptfolder/lib/R/MakeResults.R 269 | printf "done. \n" 270 | rm $outputfolder/out/*fai 271 | if [ $removetemp -eq 1 ]; then 272 | printf "[EricScript] Removing temporary files ... " 273 | rm -r $outputfolder/aln 274 | rm -r $outputfolder/out 275 | printf "done. \n" 276 | fi 277 | printf "[EricScript] Open $outputfolder/$samplename.results* to view the results of EricScript analysis.\n" 278 | -------------------------------------------------------------------------------- /lib/data/_resources/BlackList.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/ericscript/edd6f8b5cd80f9c828f5b8f54a46ae4b2e6648d5/lib/data/_resources/BlackList.RData -------------------------------------------------------------------------------- /lib/data/_resources/DataModel.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/ericscript/edd6f8b5cd80f9c828f5b8f54a46ae4b2e6648d5/lib/data/_resources/DataModel.RData -------------------------------------------------------------------------------- /lib/demo/myreads_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/ericscript/edd6f8b5cd80f9c828f5b8f54a46ae4b2e6648d5/lib/demo/myreads_1.fq.gz -------------------------------------------------------------------------------- /lib/demo/myreads_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/databio/ericscript/edd6f8b5cd80f9c828f5b8f54a46ae4b2e6648d5/lib/demo/myreads_2.fq.gz -------------------------------------------------------------------------------- /lib/perl/trimfq.pl: 
--------------------------------------------------------------------------------
#!/usr/bin/perl
use warnings;
use strict;

# trimfq.pl - cut every second line of a text file down to a fixed length.
#
# Usage: trimfq.pl <infile> <ntrim> <outfile>
#
#   infile   file to read, line by line
#   ntrim    number of leading characters kept on each even-numbered line
#   outfile  file to write (created, or truncated if it exists)
#
# Even-numbered lines (2, 4, 6, ...) are reduced to their first <ntrim>
# characters; odd-numbered lines are copied through unchanged.  The code only
# looks at line parity.  NOTE(review): judging by the script name the input is
# presumably 4-line FASTQ, where this trims the sequence and quality lines and
# leaves the header and '+' lines alone -- confirm against the caller.

@ARGV >= 3
    or die "Usage: $0 <infile> <ntrim> <outfile>\n";
my ($listin, $ntrim, $outfile) = @ARGV;

# Three-argument open with lexical handles: file names are taken literally
# (no mode or pipe characters are interpreted) and both opens are checked.
open my $in,  '<', $listin  or die "Cannot open '$listin' for reading: $!\n";
open my $out, '>', $outfile or die "Cannot open '$outfile' for writing: $!\n";

my $count = 1;
while (my $line = <$in>) {
    if ($count++ % 2 == 0) {
        # Strip the line terminator before cutting; otherwise a line shorter
        # than $ntrim keeps its own newline and a second one is appended,
        # inserting a blank line into the output.
        chomp $line;
        print {$out} substr($line, 0, $ntrim), "\n";
    }
    else {
        print {$out} $line;
    }
}

close $in;
# Buffered write errors (e.g. disk full) only surface at close time.
close $out or die "Cannot finish writing '$outfile': $!\n";
--------------------------------------------------------------------------------
/lib/perl/xa2multi.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w

use strict;
use warnings;

# xa2multi.pl - stream filter for SAM text.
#
# Reads lines from the files named on the command line, or from STDIN, and
# writes them to STDOUT.  A line without an XA:Z tag is copied unchanged.  A
# line with one is printed unchanged and then followed by one generated line
# for every "chr,(+|-)pos,CIGAR,NM;" entry found in the tag.
#
# Each generated line reuses the read name of the original line, and its
# sequence and quality strings, which are reverse-complemented / reversed
# when the entry's strand differs from the original line's strand (flag bit
# 0x10).  MAPQ is written as 0.
#
# FIXME: TLEN/ISIZE is not calculated (always written as 0), and columns 7-8
# (mate reference and position) are copied verbatim from the original line.

while (my $record = <>) {
    if ($record =~ /\tXA:Z:(\S+)/) {
        my $alt_hits = $1;
        print $record;

        my @fields = split /\t/, $record;
        my $flag   = $fields[1];

        while ($alt_hits =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) {
            # Copy the captures right away so later code cannot clobber them.
            my ($chr, $signed_pos, $cigar, $edit_dist) = ($1, $2, $3, $4);
            my $alt_is_reverse = $signed_pos < 0;

            my $seq   = $fields[9];
            my $phred = $fields[10];
            # If the alternative alignment has the other orientation than the
            # primary one, emit the reverse complement of the sequence and
            # the reversed quality string.
            if ((($flag & 0x10) > 0) xor $alt_is_reverse) {
                $seq = reverse $seq;
                $seq =~ tr/ACGTacgt/TGCAtgca/;
                $phred = reverse $phred;
            }

            # Keep only the flag bits selected by the 0x6e9 mask and set the
            # strand bit (0x10) from the sign of the alternative position.
            my $alt_flag = ($flag & 0x6e9) | ($alt_is_reverse ? 0x10 : 0);

            print(join("\t",
                       $fields[0], $alt_flag, $chr, abs($signed_pos), 0,
                       $cigar, @fields[6 .. 7], 0, $seq, $phred,
                       "NM:i:$edit_dist"),
                  "\n");
        }
    }
    else {
        print $record;
    }
}

--------------------------------------------------------------------------------