├── DESCRIPTION
├── LICENSE.md
├── NAMESPACE
├── R
    ├── .Rhistory
    ├── TMBpred.R
    ├── ecTMB.R
    ├── procMaf.R
    ├── stats.R
    └── utils.R
├── README.md
├── inst
    ├── nf
    │   ├── bed2vcf.changes.pl
    │   ├── create.exome.Rdata.R
    │   ├── ecTMB_refbuild.nf
    │   ├── nextflow.config
    │   ├── process.maf.output.pl
    │   └── process.vep.output.pl
    ├── perl
    │   └── Sequence_Retrieve.pl
    └── stan
    │   ├── NB_bs.stan
    │   ├── NB_bs2.stan
    │   ├── NB_bs3.stan
    │   ├── Poisson_bs.stan
    │   ├── Poisson_bs2.stan
    │   ├── Poisson_bs3.stan
    │   ├── ZIP.stan
    │   ├── ZIPmix.stan
    │   ├── ZIPmix_bs.stan
    │   ├── ZIPmix_bs2.stan
    │   └── ZIPmix_bs3.stan
└── man
    ├── CM.Rd
    ├── CalTMB.Rd
    ├── ConvertExtraType.Rd
    ├── ExGMM.Rd
    ├── Get_incon_mut.Rd
    ├── MutSet.Rd
    ├── OverLap.Rd
    ├── assignClass.Rd
    ├── calMut.Rd
    ├── checkMutC.Rd
    ├── count_Mut.Rd
    ├── fit_model.Rd
    ├── getBgMRtri.Rd
    ├── getEnsemblID.Rd
    ├── getGeneBgProb.Rd
    ├── getGeneSymbol.Rd
    ├── getTMBs.Rd
    ├── get_exomeGene.Rd
    ├── get_genewise_dispersion_mle_mu.Rd
    ├── get_glen.Rd
    ├── get_mu_hat_mle.Rd
    ├── get_mu_phi.Rd
    ├── get_muhat.Rd
    ├── get_zip_pairparam_mle.Rd
    ├── getmu_post_optimize.Rd
    ├── getsigma.Rd
    ├── getsubData.Rd
    ├── getzero_p.Rd
    ├── loadfile.Rd
    ├── mafCleanup.Rd
    ├── maf_dnp_converter.Rd
    ├── pred_TMB.Rd
    ├── readData.Rd
    ├── remove_outliers.Rd
    ├── retrieve_context.Rd
    └── zipMLE.Rd


/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: ecTMB
 2 | Type: Package
 3 | Title: Estimation and Classification of TMB
 4 | Version: 0.1.1
 5 | Author: Lijing Yao, Fu Yao
 6 | Maintainer: The package maintainer <lijing.yao@roche.com>
 7 | Description: ecTMB is a powerful and flexible statistical framework for TMB 
 8 |     estimation and classification. It uses an explicit background mutation model for 
 9 |     more robust and consistent TMB prediction. The background mutation model 
10 |     takes account of unknown as well as known mutational heterogeneous factors, 
11 |     including tri-nucleotide context, sample mutational burden, gene expression 
12 |     level and replication timing by utilization of a Bayesian framework. The discovery
13 |     of three TMB-based subtypes, including one novel subtype TMB-extreme, enables 
14 |     ecTMB to classify samples to biologically and clinically relevant TMB subtypes.
15 | Imports: ggplot2,
16 |         limma,
17 |         reshape2,
18 |         dplyr,
19 |         R6,
20 |         MASS,
21 |         GenomicRanges,
22 |         data.table,
23 |         parallel,
24 |         mixtools
25 | License: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International license
26 | Encoding: UTF-8
27 | LazyData: true
28 | RoxygenNote: 6.1.1.9000
29 | Suggests: maftools
30 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 | ecTMB (c) 2019 by Roche Sequencing Solutions, Inc. All rights reserved. 
  2 | 
  3 | ecTMB is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. You should have received a copy of the license along with this work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/.
  4 | 
  5 | Attribution-NonCommercial-ShareAlike 4.0 International
  6 | 
  7 | =======================================================================
  8 | 
  9 | Creative Commons Corporation ("Creative Commons") is not a law firm and
 10 | does not provide legal services or legal advice. Distribution of
 11 | Creative Commons public licenses does not create a lawyer-client or
 12 | other relationship. Creative Commons makes its licenses and related
 13 | information available on an "as-is" basis. Creative Commons gives no
 14 | warranties regarding its licenses, any material licensed under their
 15 | terms and conditions, or any related information. Creative Commons
 16 | disclaims all liability for damages resulting from their use to the
 17 | fullest extent possible.
 18 | 
 19 | Using Creative Commons Public Licenses
 20 | 
 21 | Creative Commons public licenses provide a standard set of terms and
 22 | conditions that creators and other rights holders may use to share
 23 | original works of authorship and other material subject to copyright
 24 | and certain other rights specified in the public license below. The
 25 | following considerations are for informational purposes only, are not
 26 | exhaustive, and do not form part of our licenses.
 27 | 
 28 |      Considerations for licensors: Our public licenses are
 29 |      intended for use by those authorized to give the public
 30 |      permission to use material in ways otherwise restricted by
 31 |      copyright and certain other rights. Our licenses are
 32 |      irrevocable. Licensors should read and understand the terms
 33 |      and conditions of the license they choose before applying it.
 34 |      Licensors should also secure all rights necessary before
 35 |      applying our licenses so that the public can reuse the
 36 |      material as expected. Licensors should clearly mark any
 37 |      material not subject to the license. This includes other CC-
 38 |      licensed material, or material used under an exception or
 39 |      limitation to copyright. More considerations for licensors:
 40 |   wiki.creativecommons.org/Considerations_for_licensors
 41 | 
 42 |      Considerations for the public: By using one of our public
 43 |      licenses, a licensor grants the public permission to use the
 44 |      licensed material under specified terms and conditions. If
 45 |      the licensor's permission is not necessary for any reason--for
 46 |      example, because of any applicable exception or limitation to
 47 |      copyright--then that use is not regulated by the license. Our
 48 |      licenses grant only permissions under copyright and certain
 49 |      other rights that a licensor has authority to grant. Use of
 50 |      the licensed material may still be restricted for other
 51 |      reasons, including because others have copyright or other
 52 |      rights in the material. A licensor may make special requests,
 53 |      such as asking that all changes be marked or described.
 54 |      Although not required by our licenses, you are encouraged to
 55 |      respect those requests where reasonable. More considerations
 56 |      for the public: 
 57 |   wiki.creativecommons.org/Considerations_for_licensees
 58 | 
 59 | =======================================================================
 60 | 
 61 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
 62 | Public License
 63 | 
 64 | By exercising the Licensed Rights (defined below), You accept and agree
 65 | to be bound by the terms and conditions of this Creative Commons
 66 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
 67 | ("Public License"). To the extent this Public License may be
 68 | interpreted as a contract, You are granted the Licensed Rights in
 69 | consideration of Your acceptance of these terms and conditions, and the
 70 | Licensor grants You such rights in consideration of benefits the
 71 | Licensor receives from making the Licensed Material available under
 72 | these terms and conditions.
 73 | 
 74 | 
 75 | Section 1 -- Definitions.
 76 | 
 77 |   a. Adapted Material means material subject to Copyright and Similar
 78 |      Rights that is derived from or based upon the Licensed Material
 79 |      and in which the Licensed Material is translated, altered,
 80 |      arranged, transformed, or otherwise modified in a manner requiring
 81 |      permission under the Copyright and Similar Rights held by the
 82 |      Licensor. For purposes of this Public License, where the Licensed
 83 |      Material is a musical work, performance, or sound recording,
 84 |      Adapted Material is always produced where the Licensed Material is
 85 |      synched in timed relation with a moving image.
 86 | 
 87 |   b. Adapter's License means the license You apply to Your Copyright
 88 |      and Similar Rights in Your contributions to Adapted Material in
 89 |      accordance with the terms and conditions of this Public License.
 90 | 
 91 |   c. BY-NC-SA Compatible License means a license listed at
 92 |      creativecommons.org/compatiblelicenses, approved by Creative
 93 |      Commons as essentially the equivalent of this Public License.
 94 | 
 95 |   d. Copyright and Similar Rights means copyright and/or similar rights
 96 |      closely related to copyright including, without limitation,
 97 |      performance, broadcast, sound recording, and Sui Generis Database
 98 |      Rights, without regard to how the rights are labeled or
 99 |      categorized. For purposes of this Public License, the rights
100 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
101 |      Rights.
102 | 
103 |   e. Effective Technological Measures means those measures that, in the
104 |      absence of proper authority, may not be circumvented under laws
105 |      fulfilling obligations under Article 11 of the WIPO Copyright
106 |      Treaty adopted on December 20, 1996, and/or similar international
107 |      agreements.
108 | 
109 |   f. Exceptions and Limitations means fair use, fair dealing, and/or
110 |      any other exception or limitation to Copyright and Similar Rights
111 |      that applies to Your use of the Licensed Material.
112 | 
113 |   g. License Elements means the license attributes listed in the name
114 |      of a Creative Commons Public License. The License Elements of this
115 |      Public License are Attribution, NonCommercial, and ShareAlike.
116 | 
117 |   h. Licensed Material means the artistic or literary work, database,
118 |      or other material to which the Licensor applied this Public
119 |      License.
120 | 
121 |   i. Licensed Rights means the rights granted to You subject to the
122 |      terms and conditions of this Public License, which are limited to
123 |      all Copyright and Similar Rights that apply to Your use of the
124 |      Licensed Material and that the Licensor has authority to license.
125 | 
126 |   j. Licensor means the individual(s) or entity(ies) granting rights
127 |      under this Public License.
128 | 
129 |   k. NonCommercial means not primarily intended for or directed towards
130 |      commercial advantage or monetary compensation. For purposes of
131 |      this Public License, the exchange of the Licensed Material for
132 |      other material subject to Copyright and Similar Rights by digital
133 |      file-sharing or similar means is NonCommercial provided there is
134 |      no payment of monetary compensation in connection with the
135 |      exchange.
136 | 
137 |   l. Share means to provide material to the public by any means or
138 |      process that requires permission under the Licensed Rights, such
139 |      as reproduction, public display, public performance, distribution,
140 |      dissemination, communication, or importation, and to make material
141 |      available to the public including in ways that members of the
142 |      public may access the material from a place and at a time
143 |      individually chosen by them.
144 | 
145 |   m. Sui Generis Database Rights means rights other than copyright
146 |      resulting from Directive 96/9/EC of the European Parliament and of
147 |      the Council of 11 March 1996 on the legal protection of databases,
148 |      as amended and/or succeeded, as well as other essentially
149 |      equivalent rights anywhere in the world.
150 | 
151 |   n. You means the individual or entity exercising the Licensed Rights
152 |      under this Public License. Your has a corresponding meaning.
153 | 
154 | 
155 | Section 2 -- Scope.
156 | 
157 |   a. License grant.
158 | 
159 |        1. Subject to the terms and conditions of this Public License,
160 |           the Licensor hereby grants You a worldwide, royalty-free,
161 |           non-sublicensable, non-exclusive, irrevocable license to
162 |           exercise the Licensed Rights in the Licensed Material to:
163 | 
164 |             a. reproduce and Share the Licensed Material, in whole or
165 |                in part, for NonCommercial purposes only; and
166 | 
167 |             b. produce, reproduce, and Share Adapted Material for
168 |                NonCommercial purposes only.
169 | 
170 |        2. Exceptions and Limitations. For the avoidance of doubt, where
171 |           Exceptions and Limitations apply to Your use, this Public
172 |           License does not apply, and You do not need to comply with
173 |           its terms and conditions.
174 | 
175 |        3. Term. The term of this Public License is specified in Section
176 |           6(a).
177 | 
178 |        4. Media and formats; technical modifications allowed. The
179 |           Licensor authorizes You to exercise the Licensed Rights in
180 |           all media and formats whether now known or hereafter created,
181 |           and to make technical modifications necessary to do so. The
182 |           Licensor waives and/or agrees not to assert any right or
183 |           authority to forbid You from making technical modifications
184 |           necessary to exercise the Licensed Rights, including
185 |           technical modifications necessary to circumvent Effective
186 |           Technological Measures. For purposes of this Public License,
187 |           simply making modifications authorized by this Section 2(a)
188 |           (4) never produces Adapted Material.
189 | 
190 |        5. Downstream recipients.
191 | 
192 |             a. Offer from the Licensor -- Licensed Material. Every
193 |                recipient of the Licensed Material automatically
194 |                receives an offer from the Licensor to exercise the
195 |                Licensed Rights under the terms and conditions of this
196 |                Public License.
197 | 
198 |             b. Additional offer from the Licensor -- Adapted Material.
199 |                Every recipient of Adapted Material from You
200 |                automatically receives an offer from the Licensor to
201 |                exercise the Licensed Rights in the Adapted Material
202 |                under the conditions of the Adapter's License You apply.
203 | 
204 |             c. No downstream restrictions. You may not offer or impose
205 |                any additional or different terms or conditions on, or
206 |                apply any Effective Technological Measures to, the
207 |                Licensed Material if doing so restricts exercise of the
208 |                Licensed Rights by any recipient of the Licensed
209 |                Material.
210 | 
211 |        6. No endorsement. Nothing in this Public License constitutes or
212 |           may be construed as permission to assert or imply that You
213 |           are, or that Your use of the Licensed Material is, connected
214 |           with, or sponsored, endorsed, or granted official status by,
215 |           the Licensor or others designated to receive attribution as
216 |           provided in Section 3(a)(1)(A)(i).
217 | 
218 |   b. Other rights.
219 | 
220 |        1. Moral rights, such as the right of integrity, are not
221 |           licensed under this Public License, nor are publicity,
222 |           privacy, and/or other similar personality rights; however, to
223 |           the extent possible, the Licensor waives and/or agrees not to
224 |           assert any such rights held by the Licensor to the limited
225 |           extent necessary to allow You to exercise the Licensed
226 |           Rights, but not otherwise.
227 | 
228 |        2. Patent and trademark rights are not licensed under this
229 |           Public License.
230 | 
231 |        3. To the extent possible, the Licensor waives any right to
232 |           collect royalties from You for the exercise of the Licensed
233 |           Rights, whether directly or through a collecting society
234 |           under any voluntary or waivable statutory or compulsory
235 |           licensing scheme. In all other cases the Licensor expressly
236 |           reserves any right to collect such royalties, including when
237 |           the Licensed Material is used other than for NonCommercial
238 |           purposes.
239 | 
240 | 
241 | Section 3 -- License Conditions.
242 | 
243 | Your exercise of the Licensed Rights is expressly made subject to the
244 | following conditions.
245 | 
246 |   a. Attribution.
247 | 
248 |        1. If You Share the Licensed Material (including in modified
249 |           form), You must:
250 | 
251 |             a. retain the following if it is supplied by the Licensor
252 |                with the Licensed Material:
253 | 
254 |                  i. identification of the creator(s) of the Licensed
255 |                     Material and any others designated to receive
256 |                     attribution, in any reasonable manner requested by
257 |                     the Licensor (including by pseudonym if
258 |                     designated);
259 | 
260 |                 ii. a copyright notice;
261 | 
262 |                iii. a notice that refers to this Public License;
263 | 
264 |                 iv. a notice that refers to the disclaimer of
265 |                     warranties;
266 | 
267 |                  v. a URI or hyperlink to the Licensed Material to the
268 |                     extent reasonably practicable;
269 | 
270 |             b. indicate if You modified the Licensed Material and
271 |                retain an indication of any previous modifications; and
272 | 
273 |             c. indicate the Licensed Material is licensed under this
274 |                Public License, and include the text of, or the URI or
275 |                hyperlink to, this Public License.
276 | 
277 |        2. You may satisfy the conditions in Section 3(a)(1) in any
278 |           reasonable manner based on the medium, means, and context in
279 |           which You Share the Licensed Material. For example, it may be
280 |           reasonable to satisfy the conditions by providing a URI or
281 |           hyperlink to a resource that includes the required
282 |           information.
283 |        3. If requested by the Licensor, You must remove any of the
284 |           information required by Section 3(a)(1)(A) to the extent
285 |           reasonably practicable.
286 | 
287 |   b. ShareAlike.
288 | 
289 |      In addition to the conditions in Section 3(a), if You Share
290 |      Adapted Material You produce, the following conditions also apply.
291 | 
292 |        1. The Adapter's License You apply must be a Creative Commons
293 |           license with the same License Elements, this version or
294 |           later, or a BY-NC-SA Compatible License.
295 | 
296 |        2. You must include the text of, or the URI or hyperlink to, the
297 |           Adapter's License You apply. You may satisfy this condition
298 |           in any reasonable manner based on the medium, means, and
299 |           context in which You Share Adapted Material.
300 | 
301 |        3. You may not offer or impose any additional or different terms
302 |           or conditions on, or apply any Effective Technological
303 |           Measures to, Adapted Material that restrict exercise of the
304 |           rights granted under the Adapter's License You apply.
305 | 
306 | 
307 | Section 4 -- Sui Generis Database Rights.
308 | 
309 | Where the Licensed Rights include Sui Generis Database Rights that
310 | apply to Your use of the Licensed Material:
311 | 
312 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
313 |      to extract, reuse, reproduce, and Share all or a substantial
314 |      portion of the contents of the database for NonCommercial purposes
315 |      only;
316 | 
317 |   b. if You include all or a substantial portion of the database
318 |      contents in a database in which You have Sui Generis Database
319 |      Rights, then the database in which You have Sui Generis Database
320 |      Rights (but not its individual contents) is Adapted Material,
321 |      including for purposes of Section 3(b); and
322 | 
323 |   c. You must comply with the conditions in Section 3(a) if You Share
324 |      all or a substantial portion of the contents of the database.
325 | 
326 | For the avoidance of doubt, this Section 4 supplements and does not
327 | replace Your obligations under this Public License where the Licensed
328 | Rights include other Copyright and Similar Rights.
329 | 
330 | 
331 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
332 | 
333 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
334 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
335 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
336 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
337 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
338 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
339 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
340 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
341 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
342 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
343 | 
344 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
345 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
346 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
347 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
348 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
349 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
350 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
351 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
352 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
353 | 
354 |   c. The disclaimer of warranties and limitation of liability provided
355 |      above shall be interpreted in a manner that, to the extent
356 |      possible, most closely approximates an absolute disclaimer and
357 |      waiver of all liability.
358 | 
359 | 
360 | Section 6 -- Term and Termination.
361 | 
362 |   a. This Public License applies for the term of the Copyright and
363 |      Similar Rights licensed here. However, if You fail to comply with
364 |      this Public License, then Your rights under this Public License
365 |      terminate automatically.
366 | 
367 |   b. Where Your right to use the Licensed Material has terminated under
368 |      Section 6(a), it reinstates:
369 | 
370 |        1. automatically as of the date the violation is cured, provided
371 |           it is cured within 30 days of Your discovery of the
372 |           violation; or
373 | 
374 |        2. upon express reinstatement by the Licensor.
375 | 
376 |      For the avoidance of doubt, this Section 6(b) does not affect any
377 |      right the Licensor may have to seek remedies for Your violations
378 |      of this Public License.
379 | 
380 |   c. For the avoidance of doubt, the Licensor may also offer the
381 |      Licensed Material under separate terms or conditions or stop
382 |      distributing the Licensed Material at any time; however, doing so
383 |      will not terminate this Public License.
384 | 
385 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
386 |      License.
387 | 
388 | 
389 | Section 7 -- Other Terms and Conditions.
390 | 
391 |   a. The Licensor shall not be bound by any additional or different
392 |      terms or conditions communicated by You unless expressly agreed.
393 | 
394 |   b. Any arrangements, understandings, or agreements regarding the
395 |      Licensed Material not stated herein are separate from and
396 |      independent of the terms and conditions of this Public License.
397 | 
398 | 
399 | Section 8 -- Interpretation.
400 | 
401 |   a. For the avoidance of doubt, this Public License does not, and
402 |      shall not be interpreted to, reduce, limit, restrict, or impose
403 |      conditions on any use of the Licensed Material that could lawfully
404 |      be made without permission under this Public License.
405 | 
406 |   b. To the extent possible, if any provision of this Public License is
407 |      deemed unenforceable, it shall be automatically reformed to the
408 |      minimum extent necessary to make it enforceable. If the provision
409 |      cannot be reformed, it shall be severed from this Public License
410 |      without affecting the enforceability of the remaining terms and
411 |      conditions.
412 | 
413 |   c. No term or condition of this Public License will be waived and no
414 |      failure to comply consented to unless expressly agreed to by the
415 |      Licensor.
416 | 
417 |   d. Nothing in this Public License constitutes or may be interpreted
418 |      as a limitation upon, or waiver of, any privileges and immunities
419 |      that apply to the Licensor or You, including from the legal
420 |      processes of any jurisdiction or authority.
421 | 
422 | =======================================================================
423 | 
424 | Creative Commons is not a party to its public
425 | licenses. Notwithstanding, Creative Commons may elect to apply one of
426 | its public licenses to material it publishes and in those instances
427 | will be considered the “Licensor.” The text of the Creative Commons
428 | public licenses is dedicated to the public domain under the CC0 Public
429 | Domain Dedication. Except for the limited purpose of indicating that
430 | material is shared under a Creative Commons public license or as
431 | otherwise permitted by the Creative Commons policies published at
432 | creativecommons.org/policies, Creative Commons does not authorize the
433 | use of the trademark "Creative Commons" or any other trademark or logo
434 | of Creative Commons without its prior written consent including,
435 | without limitation, in connection with any unauthorized modifications
436 | to any of its public licenses or any other arrangements,
437 | understandings, or agreements concerning use of licensed material. For
438 | the avoidance of doubt, this paragraph does not form part of the
439 | public licenses.
440 | 
441 | Creative Commons may be contacted at creativecommons.org.
442 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(CM)
 4 | export(CalTMB)
 5 | export(ConvertExtraType)
 6 | export(ExGMM)
 7 | export(Get_incon_mut)
 8 | export(OverLap)
 9 | export(assignClass)
10 | export(calMut)
11 | export(count_Mut)
12 | export(fit_model)
13 | export(getBgMRtri)
14 | export(getEnsemblID)
15 | export(getGeneBgProb)
16 | export(getGeneSymbol)
17 | export(getTMBs)
18 | export(get_exomeGene)
19 | export(get_glen)
20 | export(get_zip_pairparam_mle)
21 | export(getsubData)
22 | export(getzero_p)
23 | export(loadfile)
24 | export(mafCleanup)
25 | export(maf_dnp_converter)
26 | export(pred_TMB)
27 | export(readData)
28 | export(remove_outliers)
29 | export(retrieve_context)
30 | export(zipMLE)
31 | exportClasses(MutSet)
32 | import(R6)
33 | importFrom(GenomicRanges,findOverlaps)
34 | importFrom(GenomicRanges,makeGRangesFromDataFrame)
35 | importFrom(MASS,glm.nb)
36 | importFrom(data.table,data.table)
37 | importFrom(data.table,setkey)
38 | importFrom(dplyr,"%>%")
39 | importFrom(dplyr,group_by)
40 | importFrom(dplyr,summarize)
41 | importFrom(limma,alias2Symbol)
42 | importFrom(mixtools,normalmixEM)
43 | importFrom(parallel,mclapply)
44 | 


--------------------------------------------------------------------------------
/R/.Rhistory:
--------------------------------------------------------------------------------
1 | devtools::install_github("r-lib/devtools")
2 | 


--------------------------------------------------------------------------------
/R/TMBpred.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # estimate the Gaussian mixture #
  3 | #' ExGMM
  4 | #' @description Fit a Gaussian mixture (up to three components) to the data,
  5 | #'   falling back to a single Gaussian when only one cluster is supported.
  6 | #' @param x Numeric vector to fit. It is thresholded at log(9) and log(50),
  7 | #'   so it is presumably a log-scale mutation burden (TODO confirm).
  8 | #' @param msi Optional MSI status per element of x; "MSI-H" entries seed
  9 | #'   cluster 2. If NULL, the fixed thresholds seed the clusters.
 10 | #' @param single A boolean. If TRUE, only one class will be identified.
 11 | #' @importFrom mixtools normalmixEM
 12 | #' @importFrom dplyr %>% group_by summarize
 13 | #' @return The mixtools::normalmixEM fit, or, for a single class, a list with
 14 | #'   x, mu and sigma (both computed after outlier removal) and lambda = 1.
 15 | #' @export
 16 | #' @examples
 17 | #' \dontrun{
 18 | #' ExGMM(Data)
 19 | #' }
 20 | ExGMM = function(x, msi = NULL, single = FALSE){
 21 |   if(!single){
 22 |     # Seed cluster labels: fixed thresholds without msi, MSI-H based with msi.
 23 |     if(is.null(msi)){
 24 |       cluster = ifelse(x > log(50), 3, ifelse(x > log(9), 2, 1))
 25 |     }else{
 26 |       cluster = ifelse(msi %in% "MSI-H", 2, ifelse(x > mean(x[msi %in% "MSI-H"]), 3, 1))
 27 |     }
 28 |     df = data.frame(x = x, cluster = cluster)
 29 |     # Collapse sparsely populated (< 10 samples) upper clusters downwards.
 30 |     if(sum(df$cluster > 1) < 10) df$cluster[df$cluster > 1] = 1
 31 |     if(sum(df$cluster > 2) < 10) df$cluster[df$cluster > 2] = 2
 32 |     # k must always be set: it used to be assigned only when a collapse happened.
 33 |     k = length(unique(df$cluster))
 34 |     # length(x) replaces dplyr::n(), which is not imported by this package.
 35 |     init = df %>%
 36 |       group_by(cluster) %>%
 37 |       summarize(mu = mean(x), variance = var(x), std = sd(x), size = length(x))
 38 |     if(k == 1){
 39 |       single = TRUE
 40 |     }else{
 41 |       mixmdl = normalmixEM(df$x, mu = init$mu, sigma = init$std, k = k, arbvar = TRUE, arbmean = TRUE, epsilon = 1e-03)
 42 |       plot(mixmdl, which = 2)
 43 |       lines(density(df$x), lty = 2, lwd = 2)
 44 |     }
 45 |   }else{
 46 |     df = data.frame(x = x)
 47 |   }
 48 |   if(single){
 49 |     # One class: mean/sd after dropping values flagged by remove_outliers().
 50 |     kept   = remove_outliers(df$x)
 51 |     kept   = kept[!is.na(kept)]
 52 |     mixmdl = list(x = df$x, mu = mean(kept), sigma = sd(kept), lambda = 1)
 53 |   }
 54 |   return(mixmdl)
 55 | }
 56 | 
 57 | 
 58 | # mask outliers with NA #
 59 | #' remove_outliers
 60 | #' @description Replace values lying more than 1.5 * IQR below the first
 61 | #'   quartile or above the third quartile with NA.
 62 | #' @param x A numeric vector.
 63 | #' @param na.rm Passed to quantile() and IQR(); if TRUE, NAs are ignored there.
 64 | #' @param ... Extra arguments passed to quantile() only.
 65 | #' @return A copy of x, same length, with outliers set to NA.
 66 | #' @export
 67 | #' @examples
 68 | #' \dontrun{
 69 | #' remove_outliers(x)
 70 | #' }
 71 | remove_outliers <- function(x, na.rm = TRUE, ...) {
 72 |   quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
 73 |   fence     <- 1.5 * IQR(x, na.rm = na.rm)
 74 |   lower     <- quartiles[1] - fence
 75 |   upper     <- quartiles[2] + fence
 76 |   x[which(x < lower | x > upper)] <- NA
 77 |   x
 78 | }
 79 | 
 80 | # predict TMB #
 81 | #' pred_TMB
 82 | #' @description fit the data distribution to a mixture of Gaussian mixture
 83 | #' @param x test mutSet object
 84 | #' @param params A list contain result from fit
 85 | #' @param WES test mutSet object which contain whole exome data which will be used to generate the truth.
 86 | #' @param prior A list contain prior parameters. prior = list(mu = prior_bs$mu, sigma = prior_bs$sigma, lambda = prior_bs$lambda)
 87 | #' @param mut.nonsil A boolean. If TRUE, nonsilent mutations will be used for prediction
 88 | #' @param span The extension of low and high limit. High = high * span and low = low/span
 89 | #' @param low The lower limit.
 90 | #' @param high The higher limit.
 91 | #' @param gid_nonsil_p A vector of gid for passenger genes
 92 | #' @param method A string which specify the method to predict the TMB
 93 | #' @param cores A number which specifiy the nubmer of core to use for parallel computing.
 94 | #' @param bs Either all or nonsil.
 95 | #' @return A data.frame contain the result.
 96 | #' @export
 97 | #' @examples
 98 | #' \dontrun{
 99 | #' pred_TMB(Data)
100 | #' }
101 | #'
102 | pred_TMB = function(x, params, WES = NULL, prior = NULL,
103 |                     mut.nonsil = FALSE, gid_nonsil_p = NULL, method = "MLE", cores = 10, span = 1, low = 1, high = 10^4,
104 |                     bs = "nonsil"){
105 |   MR_p               = params$MRtriProb$MR_p     # relative mutation frequency for each tri-nucleotide context #
106 |   o_scale            = params$o_scale
107 |   sampleN            = x$samples$SampleID
108 | 
109 |   ### use panel genes for prediction
110 |   gene_bg_p          = getGeneBgProb(x$exomeGene, MR_p)
111 |   gid                = x$gid
112 |   selGene            = gid
113 |   if(is.null(gid_nonsil_p) & mut.nonsil){
114 |     gid_nonsil_p     = selGene
115 |     cat("Since gid_nonsil_p is NULL, %s percentage of genes' nonsilent mutationw will be used.\n\n")
116 |   }
117 |   if( mut.nonsil ){
118 |     offset_nonsil    =  params$geneMu[selGene] * as.numeric(gene_bg_p[J(selGene,1),]$prob)
119 |     offset_nonsil[!selGene %in% gid_nonsil_p] = 0
120 |     offset_sil       = params$geneMu[selGene] * as.numeric(gene_bg_p[J(selGene,0),]$prob)
121 |     offset           = offset_nonsil + offset_sil
122 |     sil_mut_matrix   = unname(count_Mut(x$silent(), selGene, sampleN))
123 |     nonsil_mut_matrix= unname(count_Mut(x$nonsilent(), selGene, sampleN))
124 |     nonsil_mut_matrix[!selGene %in% gid_nonsil_p, ] = 0
125 |     y                = nonsil_mut_matrix + sil_mut_matrix
126 |   }else{
127 |     offset             = params$geneMu[selGene] * as.numeric(gene_bg_p[J(selGene,0),]$prob)
128 |     sil_mut_matrix     = unname(count_Mut(x$silent(), selGene, sampleN))
129 |     y                  = sil_mut_matrix
130 |   }
131 | 
132 |   rownames(y)          = selGene
133 |   colnames(y)          = sampleN
134 |   offsetS              = offset/exp(o_scale) ### rescale 0~1
135 | 
136 | 
137 | 
138 |   # cat(sprintf("In panel, %s patients contain 0 silent (or + nonsilent) mutations.\n", sum(colSums(y) == 0)))
139 |   if(method == "MLE"){
140 |     pred_bs_panel      = getTMBs(y, offsetS,  phi = params$geneDisp[selGene] ,
141 |                                  cores, zero_p = params$geneZero_p[selGene],
142 |                                  prior = prior, span = span, low = low, high = high)
143 |   }else if( method == "MCMC"){
144 |     # TODO
145 |     # mcmc_panel         = Instan(y, offsetS, prior = prior, cores = cores,
146 |     #                             zero_p = params$geneZero_p[selGene],
147 |     #                             geneDisp  = params$geneDisp[selGene],
148 |     #                             span = span, low = low, high = high)
149 |     # pred_bs_panel      = mcmc_panel[sampleN, "mean"]
150 |     # mcmc_panel         = mcmc_panel[sampleN, !colnames(mcmc_panel) %in% "mean"]
151 |     # colnames(mcmc_panel) = paste0("panel_", colnames(mcmc_panel))
152 |   }else{
153 |     stop(sprintf("Prediction method should either MLE or MCMC. %s was provided", method))
154 |   }
155 |   # mut_panel_used     = colSums(y)
156 | 
157 |   if(bs == "all" ){
158 |     count            = CalTMB(x, sampleN = sampleN, type = "all")
159 |   }else if( bs == "nonsil" ){
160 |     count            = CalTMB(x, sampleN = sampleN, type = "nonsil")
161 |   }
162 | 
163 |   out                = data.frame(sample = sampleN,
164 |                                   ecTMB_panel_TMB = pred_bs_panel,
165 |                                   count_panel_TMB = count$count)
166 | 
167 | 
168 | 
169 |   ## ground truth
170 |   if(!is.null(WES)){
171 |     if(bs == "all" ){
172 |       WES_TMB            = CalTMB(WES, sampleN = sampleN, type = "all")
173 |     }else if( bs == "nonsil" ){
174 |       WES_TMB            = CalTMB(WES, sampleN = sampleN, type = "nonsil")
175 |     }
176 |     out$WES_TMB          = WES_TMB$count
177 |   }
178 |   # if(method == "MCMC"){
179 |   #   out              = cbind(out, mcmc_panel)
180 |   #   out              = cbind(out, mcmc_wes)
181 |   # }
182 |   # cat(sprintf("Columns %s contain NA.\n", paste(colnames(out)[colSums(is.na(out)) > 0], collapse = ",")))
183 |   # out[is.na(out)]    = 0
184 |   return(out)
185 | }
186 | 
187 | 
188 | # predict TMB stat function #
189 | #' getTMBs
190 | #' @description Estimate the TMB of each sample by maximum likelihood, or by maximum a posteriori if a prior is given.
191 | #' @param y A matrix for observed mutation count (genes in rows, samples in columns)
192 | #' @param offset offset: the expected mutation count of each gene is TMB * offset
193 | #' @param phi gene dispersion. A single value is used for all genes. 0 for all genes means Poisson.
194 | #' @param zero_p gene zero fraction. A single value is used for all genes. Non-zero values require phi = 0.
195 | #' @param prior prior parameters: list(mu, sigma, lambda) of a Gaussian mixture on log(TMB), sigma being a standard deviation. If it is NULL, no prior will be used.
196 | #' @param span The extension of low and high limit. High = high * span and low = low/span
197 | #' @param low The lower limit.
198 | #' @param high The higher limit.
199 | #' @param cores A number which specify the number of cores to use for parallel computing.
200 | #' @return A numeric vector with the estimated TMB of each sample (column of y).
201 | #' @export
202 | #' @examples
203 | #' \dontrun{
204 | #' getTMBs(Data)
205 | #' }
206 | #'
207 | getTMBs = function(y, offset, phi, cores, zero_p = 0, prior = NULL, span = 10, low = 10^(-4), high = 10^4){
208 |   ngene = nrow(y)
209 |   ## a single gene level parameter is shared by all genes
210 |   if (length(phi) == 1) {
211 |     phi = rep(phi, ngene)
212 |   }
213 | 
214 |   if (length(zero_p) == 1) {
215 |     zero_p = rep(zero_p, ngene)
216 |   }
217 |   ## there is no zero-inflated negative binomial likelihood below: fail early with a clear message
218 |   if (any(zero_p != 0) && !all(phi == 0)) {
219 |     stop("Zero-inflated negative binomial is not supported: phi must be 0 if zero_p is not 0.")
220 |   }
221 | 
222 |   ## estimate of bs (the TMB) for the counts x of one sample
223 |   getbs = function(x, offset, disp, zero_p, prior, span = 100, low = 10^(-4), high = 10^4){
224 |     ## genes without any mutation in this sample
225 |     ind0 = x == 0
226 | 
227 |     ## Negative log-likelihood of one sample. With mu = bs * offset, the log-likelihood of a gene is
228 |     ##   zero-inflated Poisson (any zero_p != 0):
229 |     ##     x == 0 : log( zero_p + (1 - zero_p) * exp(-mu) )
230 |     ##     x  > 0 : log(1 - zero_p) - mu + x * log(mu) - lgamma(x + 1)
231 |     ##   Poisson (all disp == 0):
232 |     ##     x * log(mu) - mu - lgamma(x + 1)
233 |     ##   negative binomial with size 1/disp (otherwise):
234 |     ##     lgamma(x + 1/disp) - lgamma(1/disp) - lgamma(x + 1)
235 |     ##       - 1/disp * log(1 + mu * disp) + x * ( log(mu) - log(1/disp + mu) )
236 |     ## NOTE(review): a mix of zero and non-zero disp gives 1/disp = Inf in the NB branch - confirm callers never pass it
237 |     obj2 = function(bs){     ## no prior
238 |       if(any(zero_p != 0)){
239 |         # zero-inflated Poisson (disp is 0 for all genes, checked in getTMBs)
240 |         return(
241 |           -sum( ( log( zero_p + ( 1 - zero_p) * exp(-bs * offset) ) )[ind0] ) - # zero section disp.g is zero_p
242 |             sum( ( log(1-zero_p) - (bs * offset)  + x*log(bs * offset) -lgamma(x+1) )[!ind0] )
243 |         )
244 |       }else{
245 |         if(all(disp == 0)){
246 |           ## regular Poisson
247 |           return(  - sum( x*log(bs * offset)-offset*bs -lgamma(x+1)) )
248 |         }else{
249 |           ## regular NB
250 |           return ( -sum( lgamma(x + 1/disp) - lgamma(1/disp) -lgamma(x+1) -
251 |                                  1/disp*log(1+bs*disp*offset) + x*( log(bs*offset) - log(1/disp+bs*offset) ) ) )
252 |         }
253 |       }
254 |     }
255 | 
256 |     ## Negative log-posterior. The objective is minimized, so the log-prior is SUBTRACTED from the
257 |     ## negative log-likelihood, for every likelihood family.
258 |     obj = function(bs){
259 |       ## Pr(bs): Gaussian mixture on log(bs); prior$sigma is a standard deviation
260 |       p_bs       = 0
261 |       for (i in 1:length(prior$mu)){
262 |         p_bs     = p_bs + prior$lambda[i] * dnorm(log(bs), mean = prior$mu[i], sd = prior$sigma[i])
263 |       }
264 |       if(p_bs == 0) {
265 |         lp_bs     = -10^(200)   ## the density underflowed: huge finite penalty instead of log(0) = -Inf
266 |       }else{
267 |         lp_bs     = log(p_bs)
268 |       }
269 |       return( obj2(bs) - lp_bs )
270 |     }
271 | 
272 |     ## optimize() searches the minimum within [low/span, high * span]
273 |     if(is.null(prior)){
274 |       return(optimize(obj2, interval=c(low/span, high * span))$minimum) ## no prior
275 |     }else{
276 |       return(optimize(obj, interval=c(low/span, high * span))$minimum)  ## with prior
277 |     }
278 |   }
279 | 
280 |   ## one optimization per sample (column of y), distributed over the cores
281 |   x <- lapply(apply(y, 2, FUN=list), unlist)
282 |   bs <- unlist(mclapply(x, function(x) getbs(x, offset = offset, disp = phi, zero_p = zero_p, prior = prior,span = span, low = low, high = high) ,mc.cores=cores))
283 |   return(bs)
284 | }
285 | 
286 | 
287 | # Classify the group of TMB #
288 | #' assignClass
289 | #' @description Assign each predicted TMB to the most likely component of a Gaussian mixture defined on log(TMB).
290 | #' @param x predicted TMB
291 | #' @param prior prior parameters: list(mu, sigma, lambda) of the components, sigma being a standard deviation.
292 | #' @param type If 'low_high', class 2 and 3 will be grouped to high.
293 | #' If 'exact', exact class (low, high or extreme) will be reported
294 | #' @param add1 A boolean. If prior was defined with log( x + 1), then it should be TRUE. Currently not used: log(x) is always applied.
295 | #' @return A list with pred, the vector of class, and prob, the matrix of class probability (one row per TMB).
296 | #' @export
297 | #' @examples
298 | #' \dontrun{
299 | #' assignClass(Data)
300 | #' }
301 | #'
302 | 
303 | assignClass = function(x, prior, type = "exact", add1 = TRUE){
304 |   if(!type %in% c("low_high", "exact")){
305 |     stop("Parameter type can only be either low_high or exact.\n")
306 |   }
307 |   ncomp     = length(prior$mu)
308 |   cla       = function(x, prior){
309 |     lx      = log(x)
310 |     ## log densities: for extreme x the plain densities of all components underflow to 0
311 |     lpr     = dnorm(lx, mean = prior$mu, sd = prior$sigma, log = TRUE)
312 |     ## for sure low: left of the first mean (both on the log scale) although the
313 |     ## second component, e.g. a wider one, is at least as likely
314 |     if(ncomp > 1 && lx < prior$mu[1] && lpr[1] <= lpr[2]){
315 |       return(list(pred = 1L, prob = c(1, rep(0, ncomp - 1))))
316 |     }
317 |     MAXs    = which(lpr == max(lpr))
318 |     if(length(MAXs) > 1){
319 |       ## tie: take the component with the largest weight (the first one if still tied)
320 |       MAXs  = MAXs[which.max(prior$lambda[MAXs])]
321 |     }
322 |     ## densities relative to the largest one; they are normalized to sum to 1 below
323 |     return(list(pred = MAXs, prob = exp(lpr - max(lpr))))
324 |   }
325 |   out           = lapply(x, cla, prior = prior)
326 |   report        = list(pred = unlist(lapply(out, function(x){x$pred})),
327 |                     prob = do.call(rbind, lapply(out, function(x){x$prob/sum(x$prob)})))
328 |   if(type == "low_high"){
329 |     report$pred = ifelse(report$pred > 1, "high", "low")
330 |     if(ncol(report$prob) > 2){
331 |       ## drop = FALSE keeps a matrix when there is only one TMB value
332 |       report$prob = cbind(report$prob[,1], rowSums(report$prob[, 2:ncol(report$prob), drop = FALSE]))
333 |     }
334 |   }else{
335 |     report$pred = ifelse(report$pred == 1, "low", ifelse(report$pred == 2, "high", "extreme"))
336 |   }
337 |   return(report)
338 | }
339 | 
340 | 
341 | 
342 | # predict number of mutation #
343 | #' CM
344 | #' @description Predict the number of silent and nonsilent mutation of each gene and sample, next to the observed number.
345 | #' @param Data mutSet
346 | #' @param mu mu of the gene
347 | #' @param o_scale o_scale: the predictions are divided by exp(o_scale)
348 | #' @param MRtriProb MRtriProb: a list whose MR_p element holds the relative mutation frequencies
349 | #' @param gids gene IDs
350 | #' @param zero_p zero_p: zero fraction of the zero inflated model; 0 means no zero inflation
351 | #' @return A long format table (one row per gene and sample) with observed and predicted counts, gene length and gene background probabilities.
352 | #' @export
353 | #' @examples
354 | #' \dontrun{
355 | #' CM(Data)
356 | #' }
357 | #'
358 | CM = function(Data,  mu,  o_scale, MRtriProb, gids, zero_p = 0){
359 |   gene.mu              = mu
360 |   ## work on a clone restricted to the selected genes; the mut table of Data is not modified
361 |   subData              = Data$clone()
362 |   subData$mut          = Data$mut[Data$mut$Ensembl_gene_id %in% gids, ]
363 |   ## all per sample results follow the order of the sample table
364 |   sampleN              = as.character(Data$samples$SampleID)
365 | 
366 |   MR_p               = MRtriProb$MR_p     # relative mutation frequency for each tri-nucleotide context #
367 |   ### get gene.prob
368 |   geneProb           = getGeneBgProb(Data$exomeGene, MR_p, gid = gids)
369 |   # count number of mutation per patient#
370 |   mutPerP            = count_Mut(subData$mut)
371 |   ## divided by the length returned by get_glen and scaled by 10^6
372 |   mutPerP$count      = mutPerP$count/get_glen(subData$exomeGene, selGid = names(subData$gid)) * 1000000
373 |   ### get gene length
374 |   geneLen            = get_glen(Data$exomeGene, selGid = gids, byGene= TRUE)
375 | 
376 | 
377 |   ## setkey() and the J() lookups below need a data.table. data.table() converts a plain data.frame
378 |   ## and copies an existing data.table, so the object returned by getGeneBgProb is not re-keyed in place.
379 |   geneProb           = data.table(geneProb)
380 |   setkey(geneProb, gid, consequence)
381 | 
382 |   ## mutation rate of the samples as a one row matrix, in the order of sampleN
383 |   rateP                = t(as.matrix(mutPerP[match(sampleN, mutPerP$Tumor_Sample_Barcode),"count"]))
384 | 
385 |   ## get mutation for non-silent/silent mutation ##
386 |   ## per gene factor; (1 - zero_p) is the part of the zero inflated model and equals 1 if zero_p is 0
387 |   ## NOTE(review): assumes mu (and zero_p, if it is a vector) are ordered like gids - confirm with callers
388 |   geneScale            = (1-zero_p) * gene.mu
389 |   ## nonsilent: background probability plus gene length times the "indel" entry of MR_p
390 |   probNonsil           = geneProb[J(gids,1),]$prob + geneLen[gids]*MRtriProb$MR_p["indel"]
391 |   probSil              = geneProb[J(gids,0),]$prob
392 |   ## gene x sample matrix: outer product of the gene probability and the sample rate
393 |   mut_pre_nonsil       = as.matrix(geneScale * (probNonsil %*% rateP))/exp(o_scale)
394 |   mut_pre_sil          = as.matrix(geneScale * (probSil %*% rateP))/exp(o_scale)
395 | 
396 |   rownames(mut_pre_nonsil) = rownames(mut_pre_sil) = gids
397 |   colnames(mut_pre_nonsil) = colnames(mut_pre_sil) = sampleN
398 | 
399 |   ## get observed mutation count for non-silent and silent mutation ##
400 |   mut_obs_sil          = count_Mut(subData$silent(), gid = gids, sampleN = sampleN)
401 |   mut_obs_nonsil       = count_Mut(subData$nonsilent(), gid = gids, sampleN = sampleN)
402 | 
403 |   ## long format: one row per gene and sample
404 |   out                  = melt(mut_obs_sil)
405 |   colnames(out)        = c("gid", "sample","mut_obs_sil")
406 |   out$mut_obs_nonsil   = melt(mut_obs_nonsil)$value
407 |   out$mut_pre_sil      = melt(mut_pre_sil)$value
408 |   out$mut_pre_nonsil   = melt(mut_pre_nonsil)$value
409 |   out$geneLen          = geneLen[as.character(out$gid)]
410 |   out$geneProb0        = geneProb[J(as.character(out$gid),0),]$prob
411 |   out$geneProb1        = geneProb[J(as.character(out$gid),1),]$prob
412 |   return(out)
413 | }
414 | 


--------------------------------------------------------------------------------
/R/ecTMB.R:
--------------------------------------------------------------------------------
  1 | 
  2 | "_PACKAGE"
  3 | 
  4 | #' MutSet class
  5 | #'
  6 | #' @docType class
  7 | #' @import R6
  8 | #' @return Object of class \code{MutSet}
  9 | #' @format An \code{\link{R6Class}} object.
 10 | #' @examples
 11 | #' \dontrun{
 12 | #' MutSet$new()
 13 | #' }
 14 | #' @slot mut table from MAF
 15 | #' @slot gid list of gene id for analysis
 16 | #' @slot exome table contains all possible mutations
 17 | #' @slot exomeGene table contains all possible mutations for each gene.
 18 | #' @slot covar table contains covariates for each gene
 19 | #' @slot mutContext The context of 96 mutations
 20 | #' @slot samples data.frame store sample info. If the BED column is specified,
 21 | #' the background process will be using the regions limited to bed file.
 22 | #' @slot incosistantAnno Variants with inconsistent gene annotation from
 23 | #' GSMuta reference data .
 24 | #' \describe{
 25 | #'   \item{\code{new()}}{Create a MutSet }
 26 | #'   \item{\code{nonsilent()}}{Get nonsilent mutations table (rows of mut with Sil == 1)}
 27 | #'   \item{\code{silent()}}{Get silent mutations table (rows of mut with Sil == 2)}
 28 | #'   \item{\code{nc()}}{Get noncoding mutations table (rows of mut with Sil == 3)}
 29 | #' }
 30 | #'
 31 | #' @name MutSet
 32 | #' @docType class
 33 | #' @exportClass MutSet
 34 | #'
 35 | MutSet = R6::R6Class(
 36 |   classname = "MutSet",
 37 |   portable  = FALSE,   # the methods below read the fields mut and gid without self$
 38 |   public    = list(
 39 |     mut        = NULL,
 40 |     gid        = NULL,
 41 |     exome      = NULL,
 42 |     exomeGene  = NULL,
 43 |     covar      = NULL,
 44 |     mutContext = NULL,
 45 |     incosistantAnno = NULL,
 46 |     samples    = NULL,
 47 | 
 48 |     nonsilent = function(gid) {   # the gid argument is not used
 49 |       isNonsilent = mut$Sil == 1
 50 |       mut[isNonsilent, ]
 51 |     },
 52 | 
 53 |     silent = function() {
 54 |       isSilent = mut$Sil == 2
 55 |       mut[isSilent, ]
 56 |     },
 57 | 
 58 |     nc = function() {
 59 |       isNoncoding = mut$Sil == 3
 60 |       mut[isNoncoding, ]
 61 |     },
 62 |     get_nonsil_passengers = function(fraction){
 63 |       # per gene, the number of distinct samples with a nonsilent mutation; genes sorted by that number
 64 |       perGene = aggregate(Tumor_Sample_Barcode ~ Ensembl_gene_id, data = mut[mut$Sil == 1,], FUN = function(s) length(unique(s)))
 65 |       ranked  = perGene[order(perGene$Tumor_Sample_Barcode), 1]
 66 |       top     = ranked[round(length(ranked) * fraction):length(ranked)]
 67 |       return(gid[!gid %in% top])   # genes of gid outside the upper end of the ranking
 68 |     }
 69 |   )
 70 | )
 71 | 
 72 | 
 73 | #' read and load maf file and necessary reference file
 74 | #' @param mutf Path to maf file. Detail see \code{mafCleanup}.
 75 | #' @param exomef Path to exome file
 76 | #' @param covarf Path to covariable file
 77 | #' @param mutContextf Path to mutContext file.
 78 | #' @param samplef Path to sample file or data.frame. The column name to specify bed file must be BED
 79 | #' @param ref Path to reference genome
 80 | #' @param includeincon Currently not used: variants with inconsistent annotation are always moved to incosistantAnno.
 81 | #' @export
 82 | #' @return a MutSet
 83 | readData = function(mutf, exomef, covarf, mutContextf, ref, samplef= NULL, includeincon = FALSE){
 84 |   # load mutation context file
 85 |   mutContext            = read.table(mutContextf)
 86 |   colnames(mutContext)  = c("triMut", "rev_triMut", "triMut_code", "rev_triMut_code", "tag")
 87 | 
 88 |   # load covariant file
 89 |   covar                 = read.table(covarf, header=T, row.names=2, sep="\t")
 90 |   if(all(!grepl('chr',covar$Chromosome))) covar$Chromosome = paste0("chr", covar$Chromosome)  ## fix encounter old covar file.
 91 |   
 92 |   # load exome file
 93 |   ## format of exome file
 94 |   ## pos <\t> tri-nucleotide<\t>A <\t> C <\t> G <\t> T <\t> gene <\t> amino_acid_pos/protein_length
 95 |   exome                = loadfile(exomef)
 96 |   colnames(exome)      = c("Chromosome","pos", "seq_code","A", "C", "G", "T","gid", "aa_pos");
 97 | 
 98 |   # get valid gid: genes of the covariate table on chr1-22, X, Y that are also in the exome file
 99 |   gid                  = intersect(rownames(covar)[covar$Chromosome %in% paste0("chr",c(1:22,"X", "Y"))],exome$gid)
100 | 
101 |   # read in maf file
102 |   mafTable             = mafCleanup(mutf, gid = gid)
103 |   mafTable             = retrieve_context(mafTable, ref)
104 |   mafTable$tag         = mutContext[match(mafTable[,"Context"],mutContext[,"triMut_code"]),"tag"]
105 |   # remove inconsistent annotation
106 |   inconMut             = Get_incon_mut(mafTable, exome)
107 |   cat(sprintf("Number of inconsistant annotation mutation: %s out of total %s mutation \n",sum(inconMut), length(inconMut)))
108 |   # if(includeincon){
109 |   #   cat("include the inconsistant mutations")
110 |   # }
111 |   incosistantAnno      = mafTable[inconMut, ]
112 |   mafTable             = mafTable[!inconMut, ]
113 | 
114 |   # load sample file
115 |   if(is.null(samplef)){
116 |     samples            = data.frame(SampleID = unique(mafTable$Tumor_Sample_Barcode),
117 |                                     stringsAsFactors = FALSE)
118 |   }else if(is.character(samplef)){
119 |     samples            = read.delim(samplef, stringsAsFactors = FALSE)
120 |   }else if("data.frame" %in% class(samplef)){
121 |     samples            = samplef
122 |   }else{
123 |     stop("samplef should be NULL, a path to a sample file or a data.frame.")
124 |   }
125 | 
126 |   ## check all samples contain at least one mutation.
127 |   samples              = checkMutC(samples, mafTable)
128 | 
129 |   # load exomeGene file
130 |   ## format of exome.gene #################
131 |   ## EnsembleGeneID <\t> tri-nucleotide+change <\t> consequence <\t> count
132 |   # consequence coding
133 |   #  0 - silent
134 |   #  1 - miss-sense
135 |   #  2 - nonsense
136 |   #  3 - nonstop
137 |   #  4 - TSS
138 |   #  5 - splice
139 | 
140 |   if("BED" %in% colnames(samples)){
141 |     cat("Bed file is specified for samples\n")
142 |     if(length(unique(samples$BED)) == 1){
143 |       cat("\tAll samples have the same bed regions\n")
144 |       # NOTE(review): attaches GenomicRanges at run time; presumably needed by get_exomeGene/OverLap - confirm
145 |       require(GenomicRanges)
146 |       exomeGene = get_exomeGene(exome, Bed = unique(samples$BED), mutContext = mutContext)
147 |       gid       = intersect(gid, exomeGene$gid)  ## need to update gid to bed file.
148 |       MutBed    = data.frame(Chromosome = mafTable[, "Chromosome"],
149 |                              Start = mafTable[, "Start_Position"],
150 |                              End = mafTable[,"End_Position"])
151 |       subMut    = OverLap(MutBed,  regions= unique(samples$BED))
152 |       cat(sprintf("Reduce number of mutation from: %s to %s \n",
153 |                   nrow(MutBed), length(subMut)))
154 |       mafTable  = mafTable[subMut,]
155 |       ## (the samples are checked again once the bed filtering is done, after this if/else)
156 |     }else{
157 |       cat("\tSamples' bed file are different. exomeGene for each sample will be generated\n")
158 |       exomeGene = lapply(samples$BED, function(x){get_exomeGene(exome, Bed = x, mutContext = mutContext)})
159 |       gids      = lapply(exomeGene, function(x){intersect(gid, x$gid)})
160 |       names(exomeGene)  = names(gids) = samples$SampleID
161 |       gid       = gids
162 |       tmp       = lapply(split(mafTable, mafTable$Tumor_Sample_Barcode),
163 |                          function(x){MutBed    = data.frame(Chromosome = x[, "Chromosome"],
164 |                                                         Start = x[, "Start_Position"],
165 |                                                         End = x[,"End_Position"])
166 |                                      ## keep the mutations inside the bed of this very sample, like its exomeGene above
167 |                                      bed       = samples$BED[match(x[1, "Tumor_Sample_Barcode"], samples$SampleID)]
168 |                                      if(is.na(bed)) bed = unique(samples$BED)  ## sample not in the sample table: as before
169 |                                      subMut    = OverLap(MutBed, regions= bed)
170 |                                      out       = x[subMut,]
171 |                                      return(out)})
172 |       cat(sprintf("Reduce number of mutation from: %s to %s \n", nrow(mafTable), nrow(do.call(rbind, tmp))))
173 |       mafTable  = do.call(rbind, tmp)
174 |     }
175 |   }else{
176 |     cat("Bed file is not specified for samples. exomeGene for whole exome will be generated.\n")
177 |     exomeGene = get_exomeGene(exome,  mutContext = mutContext)
178 |   }
179 | 
180 |   ## the bed filtering may have removed mutations: check the samples again
181 |   samples   = checkMutC(samples, mafTable)
182 | 
183 |   # generate mutSet
184 |   sset                 = MutSet$new()
185 |   sset$mut             = mafTable
186 |   sset$gid             = gid
187 |   sset$exome           = exome
188 |   sset$exomeGene       = exomeGene
189 |   sset$covar           = covar
190 |   sset$mutContext      = mutContext
191 |   sset$samples         = samples
192 |   sset$incosistantAnno = incosistantAnno
193 | 
194 |   cat(sprintf(paste0("Total gene analyzed: ",length(gid))),"\n")
195 | 
196 |   return(sset)
197 | }
198 | 
199 | #' calMut
200 | #' provide observed and predicted mutation rate
201 | #' @param Data MutSet class. Detail see \code{MutSet}.
202 | #' @param params output from fit_model. Detail see \code{fit_model}
203 | #' @param sampleN Sample names. Default: all samples of Data.
204 | #' @param gids gene IDs. Default: the genes of Data (restricted to the bed file, if given).
205 | #' @param bed path to bed file. If given, mutations, exomeGene and genes are restricted to its regions.
206 | #' @importFrom  data.table setkey
207 | #' @export
208 | #' @return a summary of predicted and expected mutation rate.
209 | calMut = function(Data, params, sampleN = NULL, gids = NULL, bed = NULL){
210 |   subData                      = Data$clone()
211 |   if(is.null(sampleN)) sampleN = unique(subData$samples$SampleID)
212 |   if(!is.null(bed)){
213 |     ## restrict the clone to the bed regions: possible mutations, genes and observed mutations
214 |     exomeGene = get_exomeGene(subData$exome, Bed = bed, mutContext = subData$mutContext)
215 |     gid       = intersect(subData$gid, exomeGene$gid)  ## need to update gid to bed file.
216 |     MutBed    = data.frame(Chromosome = subData$mut[, "Chromosome"],
217 |                            Start = subData$mut[, "Start_Position"],
218 |                            End = subData$mut[,"End_Position"])
219 |     subMut    = OverLap(MutBed,  regions= bed)
220 |     subData$mut  = subData$mut[subMut,]
221 |     subData$exomeGene = exomeGene
222 |     subData$gid  = gid
223 |   }
224 |   if(is.null(gids))    gids = subData$gid
225 | 
226 |   ### get parameters from modeling
227 |   ## named gene level parameters are aligned to gids, like the rows of the matrices built below
228 |   gene.mu              = params[["geneMu"]]
229 |   if(!is.null(names(gene.mu))) gene.mu = gene.mu[gids]
230 |   o_scale              = params[["o_scale"]]
231 |   MRtriProb            = params[['MRtriProb']]
232 |   ## NOTE(review): pred_TMB reads params$geneZero_p while geneZeroP is read here - confirm the name used by fit_model
233 |   zero_p               = params[["geneZeroP"]]
234 |   if(!is.null(names(zero_p))) zero_p = zero_p[gids]
235 |   if(!is.null(bed)){ ## if bed file provided, parameter need to recalculated.
236 |     MR_p               = MRtriProb$MR_p     # relative mutation frequency for each tri-nucleotide context #
237 |     ### get gene.prob
238 |     geneProb           = getGeneBgProb(subData$exomeGene, MR_p)
239 |     # count number of mutation per patient#
240 |     mutPerP            = count_Mut(subData$mut)
241 |     mutPerP$count      = mutPerP$count/get_glen(subData$exomeGene, selGid = names(subData$gid)) * 1000000
242 |     ### get gene length
243 |     geneLen            = get_glen(subData$exomeGene, byGene= TRUE)
244 |   }else{
245 |     ### get gene.prob
246 |     geneProb           = params[["geneProb"]]
247 |     ### get mutation per patient
248 |     mutPerP            = params[["mutPerP"]]
249 |     ### get gene length
250 |     geneLen            = params[["geneLen"]]
251 |   }
252 | 
253 |   ## setkey() and the J() lookups below need a data.table. data.table() converts a plain data.frame
254 |   ## and copies an existing data.table, so that params$geneProb is not re-keyed in place.
255 |   geneProb           = data.table(geneProb)
256 |   setkey(geneProb, gid, consequence)
257 | 
258 |   ## mutation rate of the samples as a one row matrix, in the order of sampleN
259 |   rateP                = t(as.matrix(mutPerP[match(sampleN, mutPerP$Tumor_Sample_Barcode),"count"]))
260 |   ## per gene factor; with a zero inflated model only the fraction (1 - zero_p) follows the count model
261 |   geneScale            = if(is.null(zero_p)) gene.mu else (1-zero_p) * gene.mu
262 |   ## nonsilent: background probability plus gene length times the "indel" entry of MR_p
263 |   probNonsil           = geneProb[J(gids,1),]$prob + geneLen[gids]*MRtriProb$MR_p["indel"]
264 |   probSil              = geneProb[J(gids,0),]$prob
265 | 
266 |   ## get mutation for non-silent/silent mutation ##
267 |   mut_pre_nonsil       = as.matrix(geneScale * (probNonsil %*% rateP))/exp(o_scale)
268 |   mut_pre_sil          = as.matrix(geneScale * (probSil %*% rateP))/exp(o_scale)
269 | 
270 |   ## the rows were built from gids, so they are named by gids
271 |   rownames(mut_pre_nonsil) = rownames(mut_pre_sil) = gids
272 |   colnames(mut_pre_nonsil) = colnames(mut_pre_sil) = sampleN
273 | 
274 |   ## get observed mutation count for non-silent and silent mutation ##
275 |   ## taken from subData, so that the bed restriction also applies to the observed counts
276 |   mut_obs_sil          = count_Mut(subData$silent(), gid = gids, sampleN = sampleN)
277 |   mut_obs_nonsil       = count_Mut(subData$nonsilent(), gid = gids, sampleN = sampleN)
278 | 
279 |   ## long format: one row per gene and sample
280 |   out                  = melt(mut_obs_sil)
281 |   colnames(out)        = c("gid", "sample","mut_obs_sil")
282 |   out$mut_obs_nonsil   = melt(mut_obs_nonsil)$value
283 |   out$mut_pre_sil      = melt(mut_pre_sil)$value
284 |   out$mut_pre_nonsil   = melt(mut_pre_nonsil)$value
285 |   out$geneLen          = geneLen[as.character(out$gid)]
286 |   out$geneMu           = gene.mu[as.character(out$gid)]
287 |   out$geneProb0        = geneProb[J(as.character(out$gid),0),]$prob
288 |   out$geneProb1        = geneProb[J(as.character(out$gid),1),]$prob
289 |   return(out)
290 | }
291 | 
292 | 


--------------------------------------------------------------------------------
/R/procMaf.R:
--------------------------------------------------------------------------------
  1 | #' maf.cleanup
  2 | #' @param maf Input maf file. It can be a path to either rda or tsv file or data.frame
  3 | #' @param extraCols Default is NULL which no extra column will be included. If "all", all columns
  4 | #' will be output. If an array, only specificed columns will be reported.
  5 | #' @param keepNoncoding A boolen. If TRUE, the noncoding variants will be keeped. Default is FALSE,
  6 | #' which noncoding variants will be removed.
  7 | #' @param nonSilTypes To specify what variant classification should be considered as nonsilent variants.
  8 | #' Default is NULL which the default list of classification will be used.
  9 | #' @param SilTypes To specify what variant classification should be considered as silent variants.
 10 | #' Default is NULL which the default list of classification will be used.
 11 | #' @param ncTypes To specify what variant classification should be considered as variants in noncoding regions.
 12 | #' Default is NULL which the default list of classification will be used.
 13 | #' @param save A boolen. If TRUE, the file will be saved, FALSE, the file will not be saved
 14 | #' @param fn string, the output file name
 15 | #' @param gid Gene ID list.
 16 | #' @return cleaned up maf file.
 17 | #' @export
 18 | #' @examples
 19 | #' \dontrun{
 20 | #' maf.cleanup(x, output)
 21 | #' }
 22 | #'
 23 | mafCleanup = function(maf, extraCols = NULL, keepNoncoding = FALSE,
 24 |                       save = FALSE, fn = "./output.tsv",
 25 |                       nonSilTypes = NULL, SilTypes = NULL, ncTypes = NULL,
 26 |                       gid = NULL){
 27 | 
 28 |   ## variant classification category--------
 29 |   if(is.null(nonSilTypes)){
 30 |     nonSilTypes = c("Missense_Mutation","Nonsense_Mutation",
 31 |                     "Frame_Shift_Del","Frame_Shift_Ins",
 32 |                     "In_Frame_Ins","In_Frame_Del")
 33 |   }
 34 | 
 35 |   if(is.null(SilTypes)){
 36 |     SilTypes = c( "Silent")
 37 |   }
 38 | 
 39 |   if(is.null(ncTypes)){
 40 |     ncTypes = c("3'Flank",  "3'UTR", "5'Flank",  "5'UTR", "IGR", "Intron", "Splice_Region", "RNA", "Nonstop_Mutation",
 41 |                   "Translation_Start_Site","De_novo_Start_InFrame", "De_novo_Start_OutOfFrame", "Splice_Site")
 42 |   }
 43 | 
 44 |   ## load file  ---------
 45 |   if(length(class(maf)) ==1 && class(maf) == "character"){
 46 |     if(grepl(".rda$", maf)){
 47 |       mafTable = loadfile(maf)
 48 |     }else{
 49 |       mafTable = read.delim(maf, header=TRUE, sep="\t", stringsAsFactors=FALSE, skip=1)
 50 |     }
 51 |   }else{
 52 |     mafTable   = maf
 53 |   }
 54 | 
 55 |   mafTable     = as.matrix(apply(mafTable,2,as.character))
 56 |   mafTable     = mafTable[!is.na(mafTable[,"Chromosome"]),] #### remove positions without chromosome info.
 57 |   typeInd      = which(colnames(mafTable)=="Variant_Type")
 58 |   classInd     = which(colnames(mafTable)=="Variant_Classification")
 59 | 
 60 |   ## sanity check -----
 61 |   ### Silent variant should not be INDEL
 62 | #  if(any( mafTable[,classInd] %in% c("Silent") & mafTable[,typeInd] %in% c("INS", "DEL") ) ){
 63 | #    stop("Maf file contain variants which are silent but belong to INDEL in type. Please double check maf file.")
 64 | #  }
 65 | 
 66 |   ### "De_novo_Start_InFrame", "De_novo_Start_OutOfFrame" are not longer allowed. Just a warning
 67 |   if(any( mafTable[,classInd] %in% c("De_novo_Start_InFrame", "De_novo_Start_OutOfFrame")  ) ){
 68 |     warning("Variant_Classification field contain De_novo_Start_InFrame or De_novo_Start_OutOfFrame, which is no longer allowed. It will be treat as In_frame and frame_shift respectively")
 69 |   }
 70 | 
 71 |   ### check extra type
 72 |   if(any( !mafTable[,typeInd] %in% c("SNP", "DNP", "TNP", "ONP", "INS", "DEL") ) ){
 73 |     warning("There are extra type beside SNP, DNP, TNP, ONP, INS, DEL in variant_type field. It will be convert to either of above categories")
 74 |     runConvExtra = TRUE
 75 |   }else{
 76 |     runConvExtra = FALSE
 77 |   }
 78 | 
 79 |   ### check if variants are all on 1-22 and X, Y chromosome. Others will be removed
 80 |   if(any( !(mafTable[,"Chromosome"] %in% c(1:24,"X","Y") | mafTable[,"Chromosome"] %in% paste0("chr",c(1:24,"X","Y"))) ) ){
 81 |     warning("There are variants on genome contig other than 1-22 and X, Y. These variants will be removed.")
 82 |     mafTable      = mafTable[(mafTable[,"Chromosome"] %in% c(1:24,"X","Y") | mafTable[,"Chromosome"] %in% paste0("chr",c(1:24,"X","Y"))), ]
 83 |   }
 84 | 
 85 |   ### check if nonsilent mutation doesn't have gene info
 86 |   if(any( mafTable[,classInd] %in% nonSilTypes & is.na(mafTable[,"Gene"]) ) ){
 87 |     warning("There are variants are nonsilent variants but don't have gene info. These variants will be removed.")
 88 |     mafTable      = mafTable[!(mafTable[,classInd] %in% nonSilTypes & is.na(mafTable[,"Gene"])), ]
 89 |   }
 90 | 
 91 |   ## simplify maf type for indel -------
 92 |   FrameShiftTypes = c("Frame_Shift_Del", "Frame_Shift_Ins","Nonsense_Mutation",
 93 |                       "Splice_Site","Nonstop_Mutation","Translation_Start_Site", "De_novo_Start_OutOfFrame")
 94 |   InFrameType     = c("In_Frame_Del", "In_Frame_Ins","Missense_Mutation", "De_novo_Start_InFrame")
 95 | 
 96 |   FSrows          = mafTable[,typeInd] %in% c("INS", "DEL") & mafTable[, classInd] %in% FrameShiftTypes
 97 |   IFrows          = mafTable[,typeInd] %in% c("INS", "DEL") & mafTable[, classInd] %in% InFrameType
 98 | 
 99 |   mafTable[FSrows, typeInd] = "Frame_shift"
100 |   mafTable[IFrows, typeInd] = "In_frame"
101 | 
102 |   ## deal with extra type
103 |   if(runConvExtra){
104 |     mafTable      = ConvertExtraType(mafTable)
105 |   }
106 | 
107 |   ## format output
108 |   if( !is.null(extraCols) && extraCols == "all"){
109 |     defaultCols   = c("Ensembl_gene_id", "Gene", "Chromosome", "Start_Position","End_Position",
110 |                       "Variant_Type","Reference_Allele", "Tumor_Seq_Allele1",
111 |                       "Tumor_Seq_Allele2", "Tumor_Sample_Barcode")
112 |     extraCols     = colnames(mafTable)[!colnames(mafTable) %in% defaultCols]
113 |   }
114 |   maffinal        = cbind(Ensembl_gene_id=as.character(mafTable[,"Gene"]),
115 |                           mafTable[,c("Chromosome","Start_Position","End_Position",
116 |                                       "Variant_Type","Reference_Allele",
117 |                                       "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
118 |                                       "Tumor_Sample_Barcode")],
119 |                           Protein_position = mafTable[,"Protein_position"],
120 |                           Variant_Classification = mafTable[,"Variant_Classification"],
121 |                           mafTable[,extraCols])
122 |   maffinal        = as.data.frame(maf_dnp_converter(maffinal), stringsAsFactors = FALSE)
123 |   maffinal$Sil    = ifelse(maffinal[, "Variant_Classification"] %in% nonSilTypes, 1,
124 |                            ifelse(maffinal[,"Variant_Classification"] %in% SilTypes, 2, 3))
125 |   maffinal$Sil[maffinal$Variant_Type %in% c("Frame_shift","In_frame")] = 1
126 | 
127 |   if(!keepNoncoding){
128 |     maffinal      = maffinal[maffinal$Sil != 3, ]
129 |   }
130 | 
131 |   if(!is.null(gid)){
132 |     maffinal = maffinal[maffinal[,"Ensembl_gene_id"] %in% gid,]
133 |   }
134 | 
135 |   maffinal$Start_Position = as.numeric(maffinal$Start_Position)
136 | 
137 |   if(save) write.table(maffinal ,file = output,row.names=F,col.names=T,quote=F,sep="\t")
138 | 
139 |   return(maffinal)
140 | }
141 | 
142 | 
143 | #' ConvertExtraType
144 | #' @description Re-label rows whose Variant_Type is not a recognised category.
145 | #' @param maf maf matrix
146 | #' @return maf matrix with converted extra type
147 | #' @export
148 | #' @examples
149 | #' \dontrun{
150 | #' ConvertExtraType(maf)
151 | #' }
152 | #'
153 | ConvertExtraType = function(maf){
154 |   knownTypes = c("SNP", "DNP", "TNP", "ONP", "Frame_shift", "In_frame", "INS", "DEL")
155 |   alleleCols = c("Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2")
156 |   # Loop over the row indices themselves; 1:length(x) would visit c(1, 0) when x is empty.
157 |   for (idx in which(!(maf[, "Variant_Type"] %in% knownTypes))){
158 |     alleles = maf[idx, alleleCols]
159 |     maxchar = max(nchar(alleles))
160 |     if ("-" %in% alleles){
161 |       newType = if (maxchar %% 3 == 0) "In_frame" else "Frame_shift"
162 |     } else {
163 |       newType = if (maxchar > 3) "ONP" else if (maxchar > 2) "TNP" else if (maxchar > 1) "DNP" else "SNP"
164 |     }
165 |     maf[idx, "Variant_Type"] = newType
166 |   }
167 |   return(maf)
168 | }
169 | 
170 | 
171 | #' maf_dnp_converter
172 | #' @description Split multi-nucleotide substitutions (DNP, TNP, ONP) into one row per
173 | #' base and append a "Mut" column holding the mutated allele of every row. Bases at
174 | #' which the reference and both tumor alleles agree are not reported. The result is
175 | #' ordered SNP, DNP, TNP, ONP, then In_frame / Frame_shift rows; rows with any other
176 | #' Variant_Type are not part of the result. The Variant_Type of split rows is kept.
177 | #' @param mutab maf matrix. Besides the named columns Variant_Type, Reference_Allele,
178 | #' Tumor_Seq_Allele1 and Tumor_Seq_Allele2, columns are addressed by position:
179 | #' column 3 (start position), column 4 (end position) and columns 6 to 8 (reference
180 | #' allele, tumor allele 1, tumor allele 2).
181 | #' @return character matrix with the converted rows and an extra last column "Mut"
182 | #' @export
183 | #' @examples
184 | #' \dontrun{
185 | #' maf_dnp_converter(maf)
186 | #' }
187 | #'
188 | maf_dnp_converter = function(mutab){
189 |   mutab      = as.matrix(mutab)
190 |   vtype      = mutab[, "Variant_Type"]
191 |   posCols    = c(3, 4)       # start and end position
192 |   alleleCols = c(6, 7, 8)    # reference allele, tumor allele 1, tumor allele 2
193 | 
194 |   # All rows of one Variant_Type. drop = FALSE keeps a single hit as a 1-row matrix,
195 |   # so no special-casing of "exactly one row" is needed further down.
196 |   rowsOfType = function(type){
197 |     mutab[vtype == type, , drop = FALSE]
198 |   }
199 | 
200 |   # Expand each row of `rows` (alleles spanning `width` bases) into `width` rows that
201 |   # each describe a single base.
202 |   splitRows = function(rows, width){
203 |     n   = nrow(rows)
204 |     out = matrix("", nrow = n * width, ncol = ncol(rows),
205 |                  dimnames = list(NULL, colnames(mutab)))
206 | 
207 |     # every column that is neither a position nor an allele is simply repeated
208 |     for (j in setdiff(seq_len(ncol(rows)), c(posCols, alleleCols))){
209 |       out[, j] = rep(rows[, j], each = width)
210 |     }
211 | 
212 |     # consecutive positions start, start + 1, ..., start + width - 1; start and end
213 |     # coincide for a single base. format() avoids as.character() turning e.g.
214 |     # 100000 into "1e+05".
215 |     pos      = rep(as.numeric(rows[, 3]), each = width) + rep(seq_len(width) - 1, times = n)
216 |     posLabel = format(pos, scientific = FALSE, trim = TRUE)
217 |     posLabel[is.na(pos)] = NA
218 |     out[, 3] = posLabel
219 |     out[, 4] = posLabel
220 | 
221 |     # one character of every allele string per output row
222 |     for (j in alleleCols){
223 |       out[, j] = unlist(strsplit(rows[, j], ""))
224 |     }
225 | 
226 |     # drop bases at which reference and both tumor alleles are identical
227 |     unchanged = out[, 6] == out[, 7] & out[, 7] == out[, 8]
228 |     out[!unchanged, , drop = FALSE]
229 |   }
230 | 
231 |   mutab.snp   = rowsOfType("SNP")
232 |   # Select indels by the Variant_Type column for both labels (column 4, used before
233 |   # for the In_frame test, is the end position, so In_frame rows were never matched).
234 |   mutab.indel = mutab[vtype == "In_frame" | vtype == "Frame_shift", , drop = FALSE]
235 | 
236 |   mutab.dnp = mutab.tnp = mutab.onp = NULL
237 |   if (sum(vtype == "DNP") > 0){
238 |     mutab.dnp = splitRows(rowsOfType("DNP"), 2)
239 |   }
240 |   if (sum(vtype == "TNP") > 0){
241 |     mutab.tnp = splitRows(rowsOfType("TNP"), 3)
242 |   }
243 |   if (sum(vtype == "ONP") > 0){
244 |     # ONP alleles have no fixed length: group the rows by the length of their longest
245 |     # allele and expand every group with its own width.
246 |     onpRows = rowsOfType("ONP")
247 |     maxchar = pmax(nchar(onpRows[, "Reference_Allele"]),
248 |                    nchar(onpRows[, "Tumor_Seq_Allele1"]),
249 |                    nchar(onpRows[, "Tumor_Seq_Allele2"]))
250 |     for (width in unique(maxchar)){
251 |       group     = onpRows[maxchar == width, , drop = FALSE]
252 |       mutab.onp = rbind(mutab.onp, splitRows(group, width))
253 |     }
254 |   }
255 | 
256 |   mutab = rbind(mutab.snp, mutab.dnp, mutab.tnp, mutab.onp, mutab.indel)
257 | 
258 |   # Mutated allele: the reference unless a tumor allele differs from it; when both
259 |   # differ, Tumor_Seq_Allele2 wins because it is applied last.
260 |   ref = mutab[, "Reference_Allele"]      ### reference nucleotide
261 |   mut = ref
262 |   for (alleleCol in c("Tumor_Seq_Allele1", "Tumor_Seq_Allele2")){
263 |     differs      = mutab[, alleleCol] != ref
264 |     mut[differs] = mutab[differs, alleleCol]
265 |   }
266 | 
267 |   return(cbind(mutab, Mut = mut))     ### mutated nucleotide
268 | }
292 | 
293 | #' retrieve_context
294 | #' @param mutab maf table with a Chromosome column
295 | #' @param ref genome reference path.
296 | #' @return table read from the output of Sequence_Retrieve.pl, with "chr"-prefixed Chromosome
297 | #' @export
298 | #' @examples
299 | #' \dontrun{
300 | #' retrieve_context(mutab, ref)
301 | #' }
302 | #'
303 | retrieve_context = function(mutab, ref){
304 |   # Prefix only names lacking "chr", so mixed input never becomes "chrchr1".
305 |   addChrPrefix = function(chrom){
306 |     noPrefix = !grepl("chr", chrom)
307 |     if(any(noPrefix)) chrom = replace(as.character(chrom), noPrefix, paste0("chr", chrom[noPrefix]))
308 |     chrom
309 |   }
310 |   mutab$Chromosome = addChrPrefix(mutab$Chromosome)
311 |   code       = file.path(system.file( "perl", package="ecTMB"), "Sequence_Retrieve.pl")
312 |   muttmpfile = tempfile(c("mutab"), tmpdir = "./", fileext = ".tsv" )
313 |   outtmpfile = sub("\\.tsv$", "out.tsv", muttmpfile)
314 |   # remove both temporary files on every exit path instead of shelling out to rm
315 |   on.exit(unlink(c(muttmpfile, outtmpfile)), add = TRUE)
316 |   write.table(mutab, file = muttmpfile, quote = F, sep = "\t", row.names = FALSE)
317 |   # quote every path so blanks or shell metacharacters cannot split the command
318 |   cmd = sprintf("perl %s %s %s %s", shQuote(code), shQuote(muttmpfile),
319 |                 shQuote(path.expand(ref)), shQuote(outtmpfile))
320 |   if(system(cmd) != 0) warning("Sequence_Retrieve.pl returned a non-zero exit status")
321 |   out = read.delim(outtmpfile, stringsAsFactors = FALSE)
322 |   out[,"Chromosome"] = addChrPrefix(out[,"Chromosome"])
323 |   return(out)
324 | }
325 | 
326 | 
327 | #' Get_incon_mut
328 | #' Flag variants whose gene annotation is inconsistent with the
329 | #' GSMuta reference data
330 | #' @param mutab maf table with Start_Position and Ensembl_gene_id columns
331 | #' @param exome exome file.
332 | #' @return Logical vector, TRUE where the keyed lookup on (position, gene id)
333 | #' in the reference data yields a missing seq_code
334 | #' @importFrom data.table setkey data.table
335 | #' @export
336 | #' @examples
337 | #' \dontrun{
338 | #' Get_incon_mut(mutab,exome)
339 | #' }
340 | #'
341 | Get_incon_mut = function(mutab,exome){
342 |   refTab   = unique(data.table(exome[,c("pos", "gid", "seq_code")]))
343 |   setkey(refTab, pos, gid)
344 |   matched  = refTab[J(as.integer(mutab$Start_Position),mutab$Ensembl_gene_id),]
345 |   return(is.na(matched$seq_code))
346 | }
347 | 


--------------------------------------------------------------------------------
/R/stats.R:
--------------------------------------------------------------------------------
  1 | #' Per-row mean of the offset-normalised data (mu for each gene)
  2 | #' @param y input data
  3 | #' @param offset offset; it is exponentiated before y is divided by it
  4 | #' @return muhat
  5 | get_muhat <- function(y, offset){
  6 |   normalised <- y/exp(offset)
  7 |   apply(normalised, 1, mean)
  8 | }
  9 | 
 10 | 
 11 | #' Standard deviation of log(mu) - log(mu_glm)
 12 | #' @param mu mu
 13 | #' @param mu_glm glm generated mu
 14 | #' @return sigma
 15 | getsigma <- function(mu, mu_glm){
 16 |   sd(log(mu)-log(mu_glm))
 17 | }
 18 | 
 19 | #' Inverse logit: map zero_p_zip from the logit scale to a probability
 20 | #' @param zero_p_zip zero_p_zip
 21 | #' @return zero_p; plogis() stays at 1 for large inputs where exp(x)/(1+exp(x)) is NaN
 22 | #' @export
 23 | getzero_p <- function(zero_p_zip){
 24 |   return(stats::plogis(zero_p_zip))
 25 | }
 26 | 
 27 | 
 28 | #' MLE to estimate genewise dispersion ###
 29 | #' @description For every row of y a single parameter is estimated by minimising
 30 | #' the negative log-likelihood with optimize(), holding the expected counts
 31 | #' exp(offset)*mu fixed.
 32 | #' @param y input y
 33 | #' @param mu mu
 34 | #' @param offset offset
 35 | #' @param cores number of threads
 36 | #' @param ZI A boolean. Default is FALSE, in which case the negative binomial
 37 | #' dispersion is estimated. If TRUE, the zero probability of a zero-inflated
 38 | #' Poisson likelihood is estimated instead.
 39 | #' @return vector with one estimate per row of y
 40 | get_genewise_dispersion_mle_mu <- function(y, mu, offset, cores, ZI = FALSE){
 41 |   expected    = exp(offset)*mu
 42 | 
 43 |   # negative log-likelihood of one row as a function of the parameter `par`
 44 |   negLogLik   = function(par, obs, fitted){
 45 |     if(!ZI){
 46 |       # regular NB, par is phi
 47 |       return( -sum( lgamma(obs + 1/par) - lgamma(1/par) -lgamma(obs+1) -
 48 |                       1/par*log(1+fitted*par) + obs*( log(fitted) - log(1/par+fitted) ) ))
 49 |     }
 50 |     # zero inflated, par is zero_p
 51 |     isZero    = obs == 0
 52 |     return(-sum( log(par + (1-par) * exp(-fitted[isZero]))) -
 53 |              sum(log(1-par) -fitted[!isZero] + obs[!isZero] * log(fitted[!isZero]) - lgamma(obs[!isZero] + 1)))
 54 |   }
 55 | 
 56 |   # a row of cbind(y, expected): first half observed counts, second half expected counts
 57 |   fitRow      = function(rowData){
 58 |     n         = length(rowData)
 59 |     bounds    = if(ZI) c(0, 1) else c(10^(-20), 10^20)
 60 |     fit       = optimize(negLogLik, interval = bounds,
 61 |                          obs = rowData[1:(n/2)], fitted = rowData[(n/2+1):n])
 62 |     return(fit$minimum)
 63 |   }
 64 | 
 65 |   # split the bound matrix into a list holding one numeric vector per row
 66 |   rowList     = lapply(apply(cbind(y, expected), 1, FUN=list), unlist)
 67 |   dispersion  = unlist(mclapply(rowList, fitRow ,mc.cores=cores))
 68 |   return(dispersion)
 69 | }
 70 | 
 71 | 
 72 | #' getmu_post_optimize
 73 | #' @description  get optimized mu: for every row of y, minimise the negative
 74 | #' log-likelihood of the counts plus the penalty (log(mu)-log(mu2))^2/(2*sigma^2).
 75 | #' The likelihood is Poisson (phi and zero_p both 0), negative binomial (phi not 0)
 76 | #' or zero-inflated Poisson (zero_p not 0 and phi 0).
 77 | #' @param y The input data
 78 | #' @param offset offset of input data; it is exponentiated before use
 79 | #' @param mu1 mu_hat
 80 | #' @param mu2 mu_hat_glm
 81 | #' @param phi dispersion parameter when negative binomial model is used. Otherwise,
 82 | #' it will be 0.
 83 | #' @param sigma sigma of the penalty term
 84 | #' @param span The extention: the search interval is (1e-10, max(mu1, mu2)*span)
 85 | #' @param cores number of thread
 86 | #' @param zero_p probability of zero portion if zero-inflated model is used. Otherwise,
 87 | #' it will be 0
 88 | #' @return A vector with one optimized mu per row of y, named like mu2
 89 | #' @examples
 90 | #' \dontrun{
 91 | #' getmu_post_optimize(y, offset, mu1, mu2, phi, sigma, span, cores)
 92 | #' }
 93 | #'
 94 | getmu_post_optimize = function(y, offset, mu1, mu2, phi, sigma, span,cores, zero_p = 0){
 95 |   ngene <- nrow(y)
 96 |   if (length(phi)==1){
 97 |     phi <- rep(phi, ngene)
 98 |   }
 99 |   if (length(zero_p)==1){
100 |     zero_p <- rep(zero_p, ngene)
101 |   }
102 | 
103 |   # one row per gene: counts, offsets, then mu1, mu2, phi and zero_p
104 |   tmp <- cbind(y, offset, mu1, mu2, phi, zero_p)
105 |   getmu <- function(data, sigma, span){
106 |     n <- length(data)
107 |     zero_p <- data[n]
108 |     disp <- data[n-1]
109 |     mu2 <- data[n-2]
110 |     mu1 <- data[n-3]
111 |     y <- data[1:((n-4)/2)]
112 |     ind0 = y == 0
113 |     offset <- exp(data[((n-4)/2+1):(n-4)])
114 |     if(zero_p != 0 && disp != 0){
115 |       # this combination has no likelihood branch below; without this check the
116 |       # objective returned NULL and optimize() failed with an unrelated message
117 |       stop("getmu_post_optimize: zero_p != 0 together with phi != 0 is not supported")
118 |     }
119 |     obj <- function(mu){
120 |       penalty <- (log(mu)-log(mu2))^2/(2 * sigma^2) ## prior section
121 |       if(zero_p != 0){
122 |         # zero-inflated Poisson
123 |         return( penalty -
124 |                   sum(log(zero_p + ( 1 - zero_p) * exp((-mu * offset)[ind0]))) - # zero section
125 |                   sum(log(1-zero_p) - (mu * offset)[!ind0]  + y[!ind0]*log((mu * offset)[!ind0]) -lgamma(y[!ind0]+1)))
126 |       }
127 |       if(disp == 0){
128 |         ## regular Poisson
129 |         return(penalty - sum( y*log(mu * offset)-offset*mu -lgamma(y+1)) )
130 |       }
131 |       ## regular NB
132 |       return (penalty -sum( lgamma(y + 1/disp) - lgamma(1/disp) -lgamma(y+1) -
133 |                                1/disp*log(1+mu*disp*offset) + y*( log(mu*offset) - log(1/disp+mu*offset) ) ) )
134 |     }
135 |     return(optimize(obj, interval=c(10^(-10), max(mu1, mu2)*span))$minimum)
136 |   }
137 | 
138 |   x <- lapply(apply(tmp, 1, FUN=list), unlist)
139 |   mu <- unlist(mclapply(x, function(x) getmu(x, sigma,span) ,mc.cores=cores))
140 |   names(mu) <- names(mu2)
141 |   return(mu)
142 | }
143 | 
144 | #' get_mu_hat_mle
145 | #' @description  from phi estimate mu: for every row of y the value of mu that
146 | #' minimises the negative log-likelihood is searched with optimize(), disp being
147 | #' held fixed
148 | #' @param y input y
149 | #' @param disp disp; used as the zero probability when ZI is TRUE
150 | #' @param offset offset
151 | #' @param cores number of threads
152 | #' @param ZI A boolean. Default is FALSE which Negative binomial model will be used.
153 | #' If TRUE, zero-inflated model will be used.
154 | #' @return mu
155 | get_mu_hat_mle <- function(y, disp, offset,cores, ZI = FALSE){
156 |   # negative log-likelihood of one row at the candidate value x
157 |   negLogLik <- function(x, obs, logOffset, par){
158 |     mu <- exp(logOffset)*x
159 |     if(!ZI){
160 |       return(-sum( lgamma(obs + 1/par) - lgamma(1/par) -lgamma(obs+1) - 1/par*log(1+mu*par) + obs*( log(mu) - log(1/par+mu) ) ))
161 |     }
162 |     isZero <- obs == 0
163 |     -sum(log(par + ( 1 - par) * exp(-mu[isZero]))) - # zero section, par is zero_p
164 |       sum((log(1-par) - mu[!isZero] ) + obs[!isZero]*log(mu[!isZero] ) -lgamma(obs[!isZero]+1))
165 |   }
166 | 
167 |   # a row of cbind(y, offset, disp): counts, then offsets, then disp as last element
168 |   fitRow <- function(rowData){
169 |     nPair <- length(rowData)-1
170 |     best <- optimize(negLogLik, interval=c(10^(-20), 10^20),
171 |                      obs       = rowData[1:(nPair/2)],
172 |                      logOffset = rowData[(nPair/2+1):nPair],
173 |                      par       = rowData[(nPair+1)])
174 |     return(best$minimum)
175 |   }
176 | 
177 |   rowList <- lapply(apply(cbind(y,offset,disp), 1, FUN=list), unlist)
178 |   mu <- unlist(mclapply(rowList, fitRow ,mc.cores=cores))
179 | 
180 |   return(mu)
181 | 
182 | }
183 | 
184 | ####### apply NB for mu & phi #####
185 | #' get_mu_phi
186 | #' @description apply NB for mu & phi: glm.nb(y ~ exp + offset(offset)) is fitted
187 | #' to every row of y, where exp is a column of zeros
188 | #' @param y input y
189 | #' @param offset offset
190 | #' @return mu phi: per row of y, exp() of the first coefficient and 1/theta
191 | get_mu_phi = function(y, offset){
192 |   nsam <- ncol(y)
193 |   fitRow <- function(rowData){
194 |     rowFrame <- data.frame(y=rowData[1:nsam], exp=rep(0,nsam), offset=rowData[(nsam+1):length(rowData)])
195 |     nbFit <- glm.nb(y~exp+offset(offset),data=rowFrame)
196 |     c(exp(nbFit$coefficients[1]), 1/nbFit$theta)
197 |   }
198 |   perRow <- apply(cbind(y,offset),1,fitRow)
199 |   return(perRow)
200 | }
201 | 
202 | 
203 | #' Calculate mu and zero_p by MLE for ZIP model #
204 | #' @param y observed mutation
205 | #' @param offset offset. y is divided by it before the fit.
206 | #' @param cores number of cores. Default is 1, so the call shown below runs as written.
207 | #' @importFrom parallel mclapply
208 | #' @return a List with the c(lambda, pi) parameter vector of every row of y
209 | #' @export
210 | #' @examples
211 | #' \dontrun{
212 | #' get_zip_pairparam_mle(Data, offset)
213 | #' }
214 | #'
215 | get_zip_pairparam_mle = function(y, offset, cores = 1){
216 |   scaled     = y/offset
217 |   rowList    = lapply(apply(scaled, 1, FUN=list), unlist)
218 |   pairparams = mclapply(rowList,  function(x){ zipMLE(x)$param }, mc.cores = cores)
219 |   return(pairparams)
220 | }
221 | 
222 | #' Calculate mu and zero_p by MLE for ZIP model #
223 | #' @param x observed mutation
224 | #' @param offset offset. x is divided by it before the fit.
225 | #' @param tol tol of convergency
226 | #' @return a List of parameters: iters, loglik and param = c(lambda, pi)
227 | #' @export
228 | #' @examples
229 | #' \dontrun{
230 | #' zipMLE(Data, offset)
231 | #' }
232 | #'
233 | zipMLE = function (x, offset = 1, tol = 1e-09)
234 | {
235 |   if(all(x == 0)){
236 |     return(list(iters = 0, loglik = 0, param = c(lambda = 0, pi = 1)))
237 |   }
238 |   # Formerly `x  < - x/offset`: a comparison whose result was discarded, so offset was ignored.
239 |   x  <- x/offset
240 |   no <- sum(x == 0)
241 |   n <- length(x)
242 |   prop <- no/n
243 |   n1 <- n - no
244 |   x1 <- x[x > 0]
245 |   sx <- sum(x1)
246 |   m <- sx/n
247 |   s <- (sum(x1^2) - m * sx)/(n - 1)
248 | 
249 |   # one Newton-Raphson step towards the root of fx(l) = m - m*exp(-l) - l + prop*l
250 |   newtonStep <- function(l){
251 |     fx <- m - m * exp(-l) - l + prop * l
252 |     der <- m * exp(-l) - 1 + prop
253 |     l - fx/der
254 |   }
255 |   l1 <- s/m + m - 1
256 |   l2 <- newtonStep(l1)
257 |   i <- 2
258 |   # a non-finite iterate (a single observation makes s NaN) ends the search at once
259 |   while (is.finite(l2) && abs(l2 - l1) > tol) {
260 |     i <- i + 1
261 |     l1 <- l2
262 |     l2 <- newtonStep(l1)
263 |     if(i > 10000){
264 |       cat("zipMLE not converge \n.")
265 |       break
266 |     }
267 |   }
268 | 
269 |   if(i > 10000 || !is.finite(l2)){
270 |     # no usable root: fall back to the observed zero fraction and the mean of x
271 |     p <- prop
272 |     l2 <- m
273 |   }else{
274 |     p <- 1 - m/l2
275 |   }
276 |   # Poisson part over the positive observations only; the zeros enter through the first term
277 |   loglik <- no * log(p + (1 - p) * exp(-l2)) + n1 * log(1 - p) + sum( x1*log(l2)- l2 -lgamma(x1+1))
278 |   list(iters = i, loglik = loglik, param = c(lambda = l2, pi = p))
279 | }
280 | 
281 | #' Calculate the relative mutation frequency for each tri-nucleotide context #
282 | #' @param Data mutSet object
283 | #' @param fraction fraction of gene used for nonsilent mutation.
284 | #' @return A list with MR_p (relative frequency of the 96 contexts plus "indel"),
285 | #' r, ref and count (mutation count of the 96 contexts plus "indel")
286 | #' @export
287 | #' @examples
288 | #' \dontrun{
289 | #' getBgMRtri(Data)
290 | #' }
291 | #'
292 | getBgMRtri = function(Data, fraction = 0.6){
293 | 
294 |   if(inherits(Data$gid, "list")){
295 |     cat("\tSamples' bed file are different. Common list of gene will be used to predict sampling bias 'r'.\n")
296 |     # the per-sample gene lists live in Data$gid; a bare `gid` is not defined yet at this point
297 |     gid              = Reduce(intersect, Data$gid)
298 |     gid_nonsil_p     = gid [ gid %in% Data$get_nonsil_passengers(fraction)]
299 |     ## need to be tested ###
300 |   }else{
301 |     gid              = Data$gid
302 |     gid_nonsil_p     = Data$get_nonsil_passengers(fraction)
303 |   }
304 | 
305 |   sil_all            = count_Mut(Data$exomeGene[Data$exomeGene$consequence == 0,], gid = gid, byContext = TRUE)
306 |   nonsil_p           = count_Mut(Data$exomeGene[Data$exomeGene$consequence != 0,], gid = gid_nonsil_p, byContext = TRUE)
307 |   sil_mut            = count_Mut(Data$silent(), gid = gid, byContext = TRUE)
308 |   nonsil_mut_p       = count_Mut(Data$nonsilent(), gid = gid_nonsil_p, byContext = TRUE)
309 | 
310 |   valid_ind          = which(sil_all >= 1000 & sil_mut > 0)       # select tri-nucleotide context which has enough incidence
311 |   r                  = mean( (nonsil_mut_p[valid_ind]/ nonsil_p[valid_ind]) * (sil_all[valid_ind] /sil_mut[valid_ind]))      ######### relative to silent mutations
312 |   p_all              = (nonsil_mut_p + sil_mut ) / ( sil_all + r * nonsil_p )  # part of phat
313 |   count_all          = (nonsil_mut_p + sil_mut )
314 | 
315 |   ref                =  p_all[1]  # reference cate is type 1
316 |   # frequency of each of the 96 contexts relative to the reference category
317 |   p                  = as.numeric(p_all[1:96])/as.numeric(ref)
318 | 
319 |   ## calculate p_{indel}, which will be used later for calculation of p_{frameshift} and p_{inframe}
320 |   mut                = Data$nonsilent()
321 |   mut                = mut[mut$Ensembl_gene_id %in% gid_nonsil_p,]
322 |   if(sum(  mut[,"Variant_Type"]=="In_frame" |  mut[,"Variant_Type"]=="Frame_shift"  )>0){
323 |     mutab_indel      = mut[ mut$Variant_Type=="In_frame" | mut$Variant_Type=="Frame_shift",]
324 |     pos              = get_glen(Data$exomeGene, selGid = gid_nonsil_p)
325 |     ind              = nrow(mutab_indel)/pos/r/ref  #### p_{indel}
326 |     indel_c          = nrow(mutab_indel)
327 |   }else{
328 |     ind              = 0
329 |     indel_c          = 0
330 |   }
331 |   p                  = c(p,ind)
332 |   names(p)           = c(1:96,"indel")
333 |   count_all          = c(count_all, indel_c)
334 |   names(count_all)   = c(1:96,"indel")
335 |   return(list(MR_p = p, r = r, ref = ref, count = count_all))
336 | }
337 | 
338 | 
339 | # calculate number of mutation per patient  #
340 | #' CalTMB
341 | #' @description calculate number of mutation per patient
342 | #' @param x mutSet object
343 | #' @param type Type of mutation to be included. It can be 'all' 'nonsil' 'sil' 'indel' 'snp' 'frameshift'
344 | #' @param sampleN Default is NULL. If it is specified, only subset of samples will be reported.
345 | #' @return A data.frame contain the mutation count
346 | #' @export
347 | #' @examples
348 | #' \dontrun{
349 | #' CalTMB(Data)
350 | #' }
351 | #'
352 | CalTMB = function(x, type = "all", sampleN = NULL){
353 |   if(is.null(sampleN)){
354 |     sampleN   = as.character(x$samples$SampleID)
355 |   }
356 | 
357 |   # nonsilent mutations whose `column` value is one of `values`
358 |   nonsilWith  = function(column, values){
359 |     mut       = x$nonsilent()
360 |     mut[mut[[column]] %in% values,]
361 |   }
362 |   # only the branch selected by `type` is evaluated
363 |   selected    = switch(type,
364 |     all        = x$mut,
365 |     sil        = x$silent(),
366 |     nonsil     = x$nonsilent(),
367 |     indel      = nonsilWith("Variant_Classification", c("Frame_Shift_Del", "Frame_Shift_Ins")),
368 |     snp        = nonsilWith("Variant_Type", c("SNP")),
369 |     frameshift = nonsilWith("Variant_Type", c("Frame_shift")),
370 |     # the message names every type handled above, not just the first three
371 |     stop("type must be either all, sil, nonsil, indel, snp or frameshift"))
372 |   out            = count_Mut(selected, sampleN = sampleN)
373 |   # counts are divided by get_glen(...) and multiplied by 1e6
374 |   out$count      = out$count/get_glen(x$exomeGene, selGid = names(x$gid)) * 1000000
375 |   out            = out[sampleN, ]
376 |   return(out)
377 | }
378 | 
379 | 
380 | 
381 | # fit background model with negative-binomial regression #
382 | #' fit_model
383 | #' @description fit background model with negative-binomial regression
384 | #' @param Data mutSet object
385 | #' @param MRtriProb output from getBgMRtri
386 | #' @param method Either NB or Poisson
387 | #' @param cores Number of parallel core to be used
388 | #' @param bs.type Either 'all' or nonsil. When all is specified, all mutation per Mb
389 | #' will be used as sample specific background mutation (TMB). When nonsil is specified,
390 | #' non-synonymous mutation per Mb will be used as sample specific background mutation (TMB).
391 | #' @param mut.nonsil A boolean. If TRUE, non-synonymous mutation will be used for background mutation
392 | #' modeling during training.
393 | #' @param nonsil.fraction Default is 0.6. The percentage of genes whose non-synonymous mutation
394 | #' will be used.
395 | #' @importFrom MASS glm.nb
396 | #' @importFrom parallel mclapply
397 | #' @importFrom data.table data.table setkey
398 | #' @return A vector mutation count for 96 trinucleotide context
399 | #' @export
400 | #' @examples
401 | #' \dontrun{
402 | #' fit_model(Data, MRtriProb)
403 | #' }
404 | #'
405 | fit_model = function(Data, MRtriProb, method = "NB", cores = 1,  bs.type = "nonsil", mut.nonsil = FALSE, nonsil.fraction = 0.6){
406 |   MR_p               = MRtriProb$MR_p     # relative mutation frequency for each tri-nucleotide context #
407 |   gene_bg_p          = getGeneBgProb(Data$exomeGene, MR_p)
408 |   sampleN            = Data$samples$SampleID
409 |   geneLen            = get_glen(Data$exomeGene, selGid = Data$gid, byGene = TRUE)
410 | 
411 | 
412 |   # count number of mutation per patient for offset calculation
413 |   if(bs.type == "all"){
414 |     mutPerP          = CalTMB(Data, sampleN = as.character(Data$samples$SampleID))
415 |   }else if(bs.type == "nonsil"){
416 |     mutPerP          = CalTMB(Data, sampleN = as.character(Data$samples$SampleID), type = "nonsil")
417 |   }
418 |   # for all genes with silent mutations
419 |   gid                = Data$gid
420 |   if(method %in% c("NB", "Poisson")){  ## original method
421 |     selGene          = gid[ gid %in%  Data$silent()[,"Ensembl_gene_id"]]
422 |   }else{
423 |     selGene          = gid           ## new method
424 |   }
425 | 
426 |   ## calculate offset and observed mutation
427 |   if(mut.nonsil){
428 |     gid_nonsil_p     = Data$get_nonsil_passengers(nonsil.fraction)
429 |     offset_nonsil    = as.matrix(as.numeric(gene_bg_p[J(selGene,1),]$prob + geneLen[selGene]*MR_p["indel"])) %*% t(as.matrix(as.numeric(mutPerP[sampleN,2])))
430 |     offset_nonsil[!selGene %in% gid_nonsil_p,] = 0         ## drivers nonsilent offset will be 0
431 |     offset_sil       = as.matrix(as.numeric(gene_bg_p[J(selGene,0),]$prob)) %*% t(as.matrix(as.numeric(mutPerP[sampleN,2])))
432 |     offset           = log(offset_nonsil + offset_sil)
433 |     sil_mut_matrix   = unname(count_Mut(Data$silent(), selGene, sampleN))
434 |     nonsil_mut_matrix= unname(count_Mut(Data$nonsilent(), selGene, sampleN))
435 |     nonsil_mut_matrix[!selGene %in% gid_nonsil_p, ] = 0   ## drivers nonsilent mutation count will be 0
436 |     y                = nonsil_mut_matrix + sil_mut_matrix
437 |     if(any(is.na(offset)| is.infinite(offset))) offset[is.na(offset) | is.infinite(offset)] = min(offset[!is.infinite(offset)], na.rm = T) - log(2)   ## some gene don't have silent/nonsilent mutation. Set it as 2 fold lower than min
438 |   }else{
439 |     offset             = log(as.matrix(as.numeric(gene_bg_p[J(selGene,0),]$prob)) %*% t(as.matrix(as.numeric(mutPerP[sampleN,2]))))
440 |     if(any(is.na(offset)| is.infinite(offset))) offset[is.na(offset) | is.infinite(offset)] = min(offset[!is.infinite(offset)], na.rm = T) - log(2)    ## some gene don't have silent mutation.
441 |     sil_mut_matrix     = unname(count_Mut(Data$silent(), selGene, sampleN))
442 |     y                 = sil_mut_matrix
443 |   }
444 | 
445 |   # process covar
446 |   covar              = as.matrix(Data$covar,stringsAsFactors=FALSE)
447 |   which              = rownames(covar)[!is.na(covar[,"expr"]) & !is.na(covar[,"reptime"]) & !is.na(covar[,"hic"])]
448 |   covar[is.na(covar[,"expr"]),"expr"] = mean(as.numeric(covar[,"expr"]),na.rm=T)
449 |   covar[is.na(covar[,"reptime"]),"reptime"] = mean(as.numeric(covar[,"reptime"]),na.rm=T)
450 |   covar[is.na(covar[,"hic"]),"hic"] = mean(as.numeric(covar[,"hic"]),na.rm=T)
451 |   covar_m           = prcomp(apply(covar[,c("expr", "reptime", "hic")],2,as.numeric),scale=T)
452 |   covar[,3:5]       = covar_m$x
453 |   m                 = data.frame(epc = as.numeric(gene_bg_p[J(selGene,0),]$prob),
454 |                                  exp = as.numeric(covar[selGene,"expr"]),
455 |                                  rep = as.numeric(covar[selGene,"reptime"]),
456 |                                  hic = as.numeric(covar[selGene,"hic"]),
457 |                                  or = factor(covar[selGene,"or"], levels =1:2))
458 |   if(length(unique(m$or)) == 1){
459 |     cat("There are no olfactory gene. Olfactory variable will be removed in the model.\n")
460 |     design            = model.matrix(~ exp + rep + hic, m) ### design
461 |   }else{
462 |     design            = model.matrix(~ exp + rep + hic + or, m) ### design
463 |   }
464 |   # fit model
465 |   o_scale           = max(offset)
466 |   offsetS           = offset-o_scale ### rescale 0~1
467 |   ysum              = apply(y, 1, sum)
468 |   offset_sum        = rowSums(exp(offsetS))
469 |   cat(sprintf("Gene total: %s; %s genes have 0 mutation detected synonymous/non-synonymous in training cohort.\n", length(ysum), sum(ysum == 0)))
470 | 
471 |   if(length(unique(m$or)) == 1){
472 |     dataA             = data.frame(ysum=ysum, exp=design[,"exp"], rep=design[,"rep"], hic=design[,"hic"], offset_sum=offset_sum)
473 |   }else{
474 |     dataA             = data.frame(ysum=ysum, exp=design[,"exp"], rep=design[,"rep"], hic=design[,"hic"], or2=design[,"or2"], offset_sum=offset_sum)
475 |   }
476 |   rownames(dataA)   = selGene
477 |   #  selGene           = selGene[selGene %in% which]     ## update selected gene to gene contain covar info
478 |   dataA_sel         = dataA[selGene[selGene %in% which], ]
479 | 
480 |   ######### (strategy 1 : negative binomial) ### get all_betas from negative binomial model, needs optimization .....
481 |   if (method == "NB"){
482 |     cat(sprintf("Method: %s \n", method))
483 |     if(length(unique(m$or)) == 1){
484 |       fit             = glm.nb(ysum ~ exp + rep + hic + offset(log(offset_sum)), data = dataA_sel,control=glm.control(maxit=100))
485 |     }else{
486 |       fit             = glm.nb(ysum ~ exp + rep + hic + or2 + offset(log(offset_sum)), data = dataA_sel,control=glm.control(maxit=100))
487 |     }
488 |     ###get mu_hat_glm
489 |     mu_glm          = as.numeric(exp(design%*%as.matrix(fit$coefficients)))
490 |     phi_glm         = fit$theta
491 | 
492 |     ### NB: joint estimation of mu_hat and phi_hat
493 |     cat("NB: joint estimation of mu_hat and phi_hat\n")
494 |     mu_hat          = get_muhat(y=y, offset=offsetS)
495 |     phi_hat_mle     = get_genewise_dispersion_mle_mu(y=y, mu=mu_hat, offset=offsetS, cores=cores)
496 | 
497 |     mu_hat_pre      = rep(0,length(mu_hat))
498 |     while(sum(abs(mu_hat-mu_hat_pre)) > 1){
499 |       ###get phi, dispersion
500 |       mu_hat_pre    = mu_hat
501 |       mu_hat        = get_mu_hat_mle(y=y, disp=phi_hat_mle, offset=offsetS, cores=cores)
502 |       phi_hat_mle   = get_genewise_dispersion_mle_mu(y=y, mu=mu_hat, offset=offsetS, cores=cores)
503 |       print(sum(abs(mu_hat-mu_hat_pre)))
504 |     }
505 | 
506 |     ###get sigma
507 |     cat("Estimate sigma and mu_post.\n")
508 |     sigma          = getsigma(mu=mu_hat, mu_glm=mu_glm)
509 | 
510 |     mu_post        = getmu_post_optimize(y=y, offset=offsetS, mu1=mu_hat, mu2=mu_glm, phi=phi_hat_mle, sigma = sigma, span=100, cores=cores)
511 |     mu_post_pre    =  rep(0, length(mu_hat))
512 |     while(sum(abs(mu_post-mu_post_pre)) > 1){
513 |       mu_post_pre  = mu_post
514 |       phi_hat_mle  = get_genewise_dispersion_mle_mu(y=y, mu=mu_post, offset=offsetS, cores=cores)
515 |       mu_post      = getmu_post_optimize(y=y, offset=offsetS, mu1=mu_hat, mu2=mu_glm, phi=phi_hat_mle, sigma = sigma, span=100, cores=cores)
516 |       print(sum(abs(mu_post-mu_post_pre)))
517 |     }
518 |     phi_hat        = phi_hat_mle
519 |   }
520 | 
521 | 
522 |   ########## (strategy 2: poisson with log-linear) #### get all_betas from Poisson model
523 |   if (method == "Poisson"){
524 |     cat(sprintf("Method: %s \n", method))
525 | 
526 |     if(length(unique(m$or)) == 1){
527 |       fit           = glm(ysum ~ exp + rep + hic  + offset(log(offset_sum)), data = dataA_sel,control=glm.control(maxit=100),family="poisson")
528 |     }else{
529 |       fit           = glm(ysum ~ exp + rep + hic + or2 + offset(log(offset_sum)), data = dataA_sel,control=glm.control(maxit=100),family="poisson")
530 |     }
531 | 
532 |     ###get mu_hat_glm
533 |     mu_glm          = as.numeric(exp(design%*%as.matrix(fit$coefficients)))
534 | 
535 |     ### mu_hat for Poisson regression
536 |     mu_hat          = rowSums(y)/rowSums(exp(offsetS))
537 | 
538 |     ### get sigma
539 |     sigma           = getsigma(mu=mu_hat, mu_glm=mu_glm)
540 | 
541 |     ### get optimized mu
542 |     mu_post        = getmu_post_optimize(y=y, offset=offsetS, mu1=mu_hat, mu2=mu_glm, phi=0, sigma = sigma, span=100, cores=cores)
543 |     phi_hat        = rep(0, length(selGene))
544 |   }
545 | 
546 |   ########## (strategy 3: zero-inflated poisson with log-linear) #### get all_betas from Poisson model
547 |   if (method == "ZIP"){
548 |     cat(sprintf("Method: %s \n", method))
549 |     if(length(unique(m$or)) == 1){
550 |       fit           = zeroinfl(ysum ~ exp + rep + hic, offset = log(offset_sum), data = dataA_sel, control=zeroinfl.control(maxit=1000),dist="poisson")
551 |     }else{
552 |       fit           = zeroinfl(ysum ~ exp + rep + hic + or2, offset = log(offset_sum), data = dataA_sel, control=zeroinfl.control(maxit=1000),dist="poisson")
553 |     }
554 |     mu_zip          = as.numeric(exp(design%*%as.matrix(fit$coefficients$count)))
555 |     zero_p_zip      = getzero_p(as.numeric(design%*%as.matrix(fit$coefficients$zero)))
556 |     names(zero_p_zip) = names(mu_zip) =rownames(design)
557 | 
558 |     ## get mu_hat and zero_p_hat
559 |     cat("ZIP: joint estimation of mu_hat and zero_p_hat\n")
560 |     pairparams     = get_zip_pairparam_mle(y, offset = exp(offsetS), cores = cores)
561 |     mu_hat         = unlist(lapply(pairparams, function(x){x['lambda']}))
562 |     zero_p_hat     = unlist(lapply(pairparams, function(x){x["pi"]}))
563 |     names(mu_hat)  = sub(".lambda", "", names(mu_hat))
564 |     names(zero_p_hat)  = sub(".pi", "", names(zero_p_hat))
565 | 
566 |     ### post mu ###################
567 |     ###get sigma
568 |     cat("Estimate sigma and mu_post.\n")
569 |     sigma          = getsigma(mu=mu_hat[mu_hat > 0], mu_glm=mu_zip[mu_hat > 0])
570 |     mu_post        = getmu_post_optimize(y=y, offset=offsetS, mu1=mu_hat, mu2=mu_zip, phi=0, sigma = sigma, span=1000, cores=cores, zero_p = zero_p_zip)
571 |   }
572 | 
573 | 
574 | 
575 |   if(method != "Poisson" & method != "NB" & method != "ZIP") stop("method should be either Poisson, NB, ZIP or ZINB.\n")
576 | 
577 | 
578 |   ### gene factor calculation ###
579 |   if(!method %in% c("ZIP", "ZINB")){
580 |     if(length(unique(m$or)) == 1){
581 |       base           = exp(fit$coefficients[1] + fit$coefficients[2] * as.numeric(covar[gid,3]) +
582 |                              fit$coefficients[3] * as.numeric(covar[gid,4]) +
583 |                              fit$coefficients[4] * as.numeric(covar[gid,5]))
584 |     }else{
585 |       base           = exp(fit$coefficients[1] + fit$coefficients[2] * as.numeric(covar[gid,3]) +
586 |                              fit$coefficients[3] * as.numeric(covar[gid,4]) +
587 |                              fit$coefficients[4] * as.numeric(covar[gid,5]) +
588 |                              fit$coefficients[5] * (as.numeric(as.character(covar[gid,6])) - 1))
589 |     }
590 | 
591 |     names(base)    = gid
592 | 
593 |     ### assign correction factor based on base ############
594 |     missGene       = gid[gid %in% selGene == FALSE]
595 |     or             = order(base[selGene])
596 |     selAdj         = data.table(data.frame(disp = phi_hat[or], p.base = (base[selGene])[or]))
597 |     selAdj$bin     = ceiling(1:nrow(selAdj)/50)
598 |     dispAvg        = selAdj[, {dispAvg = median(disp); list(dispAvg=dispAvg)}, by='bin']
599 |     setkey(dispAvg,bin)
600 | 
601 |     missData       = base[missGene]
602 |     missData[is.na(missData)] = 0
603 |     range          = findInterval(missData, selAdj$p.base)
604 |     range[range < 1] =1
605 |     range[range > nrow(selAdj)] = nrow(selAdj)
606 |     missDisp       = dispAvg[J(selAdj[range,]$bin),]$dispAvg
607 | 
608 |     ### adjustment: dispersion, beta ###
609 |     gene.phi       = rep(0,length(gid))
610 |     names(gene.phi) = gid
611 |     gene.mu        = gene.phi
612 | 
613 |     gene.phi[selGene] = phi_hat
614 |     gene.phi[missGene] = missDisp
615 | 
616 |     gene.mu[selGene] = mu_post
617 | 
618 |     offset         = log(as.matrix(as.numeric(gene_bg_p[J(missGene,0),]$prob)) %*% t(as.matrix(as.numeric(mutPerP[sampleN,2]))))
619 |     offsetS        = offset - o_scale ### rescale
620 |     y              = unname(count_Mut(Data$silent(), missGene, sampleN))
621 |     mu_post_miss   = getmu_post_optimize(y=y, offset=offsetS, mu1=rep(0,length(missGene)), mu2=base[missGene], phi=missDisp, sigma = sigma, span=100, cores=cores)
622 |     #gene.mu[miss.gene] = base[miss.gene]
623 |     mu_post_miss[which(is.na(rowSums(offsetS)))] = base[missGene[which(is.na(rowSums(offsetS)))]]
624 |     gene.mu[missGene] = mu_post_miss
625 |     gene.zeroP     = rep(0, length(gene.mu))
626 |     names(gene.zeroP)= names(gene.mu)
627 |   }else{
628 |     gene.mu        = mu_post
629 |     gene.phi       = rep(0, length(gene.mu))
630 |     names(gene.phi)= names(gene.mu)
631 |     gene.zeroP     = zero_p_zip
632 |   }
633 |   return(list( geneMu = gene.mu, geneProb = gene_bg_p,
634 |                mutPerP = mutPerP , geneDisp = gene.phi,
635 |                o_scale = o_scale, MRtriProb = MRtriProb,
636 |                geneLen = geneLen, geneZero_p = gene.zeroP,
637 |                trainData = dataA_sel,
638 |                otherparams = list(mu_glm = ifelse(method == "ZIP", mu_zip, mu_glm),
639 |                                   disp_glm = ifelse(method == "ZIP", zero_p_zip,
640 |                                                     ifelse(method == "NB", rep(phi_glm, length(gene.mu)) ,rep(0, length(gene.mu)))))
641 |                ))
642 | }
643 | 
644 | 
645 | 
646 | 
647 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' loadfile
  3 | #' @param x file path
  4 | #' @return content of loaded file
  5 | #' @export
  6 | #' @examples
  7 | #' \dontrun{
  8 | #' loadfile(x)
  9 | #' }
 10 | #'
 11 | 
 12 | loadfile = function(x){
 13 |   # Load into a scratch environment so nothing is written to the caller's workspace.
 14 |   scratch             =  new.env()
 15 |   stored              =  load(x, envir = scratch)   # names of the objects found in the file
 16 |   # get() receives `stored` as it is; a single object name is expected.
 17 |   return(get(stored, envir = scratch))
 18 | }
 19 | 
 20 | #' OverLap
 21 | #' @param Bed target positions: a data.frame containing the columns "Chromosome", "Start" and "End"
 22 | #' @param regions path to a bed file (file name ending in "bed") or a GRanges object
 23 | #' @param base0 A boolean indicating whether positions are 0-based (currently not used by the function).
 24 | #' @return a vector of row numbers of Bed that overlap with regions.
 25 | #' @importFrom GenomicRanges makeGRangesFromDataFrame findOverlaps
 26 | #' @export
 27 | #' @examples
 28 | #' \dontrun{
 29 | #'   exomebed        = data.frame(Chromosome = exome[, "Chromosome"],
 30 | #'   Start = exome[, "pos"],
 31 | #'   End = exome[,"pos"])
 32 | #'   OverLap(exomebed, regions)
 33 | #' }
 34 | #'
 35 | OverLap = function(Bed, regions, base0 = FALSE){
 36 |   ## NOTE: `base0` is kept for backward compatibility only; starts are always read as 1-based below.
 37 |   if(is.character(regions)){
 38 |     ## only a single path to a bed file can be converted to ranges; stop early with a clear message otherwise
 39 |     if(length(regions) != 1 || !grepl(".bed$", regions)) stop("regions should be either GRanges or path to bed/vcf file")
 40 |     ## peek at the first line from R itself instead of calling `head` through a shell (portable, no quoting issues)
 41 |     header       = readLines(regions, n = 1)
 42 |     if(length(header) == 1 && grepl("Start", header)){
 43 |       df         = read.delim(regions, stringsAsFactors = F)
 44 |     }else{
 45 |       ## no header line: the first three columns are taken as chromosome, start and end
 46 |       df         = read.delim(regions, stringsAsFactors = F, header = FALSE)[,1:3]
 47 |       colnames(df) = c("Chromosome", "Start", "End")
 48 |     }
 49 |     regions      = makeGRangesFromDataFrame(df,
 50 |                                             keep.extra.columns=FALSE,
 51 |                                             ignore.strand=TRUE,
 52 |                                             start.field="Start",
 53 |                                             end.field=c("End"),
 54 |                                             strand.field="strand",
 55 |                                             starts.in.df.are.0based=FALSE)
 56 |   }else if(!inherits(regions, "GRanges")){
 57 |     stop("regions should be either GRanges or path to bed/vcf file")
 58 |   }
 59 | 
 60 |   ## harmonise chromosome names: prefix Bed with "chr" when the regions use it and Bed does not
 61 |   if(any(grepl("chr", as.character(seqnames(regions)))) && ! any(grepl("chr", Bed$Chromosome))){
 62 |     Bed$Chromosome      = paste0("chr",Bed$Chromosome)
 63 |   }
 64 | 
 65 |   exomeGR               = makeGRangesFromDataFrame(Bed, keep.extra.columns=FALSE, ignore.strand=TRUE,
 66 |                                                    start.field="Start", end.field=c("End"),
 67 |                                                    starts.in.df.are.0based=FALSE)
 68 | 
 69 |   over          = suppressWarnings(findOverlaps(exomeGR, regions))
 70 |   subexome      = unique(queryHits(over))   ## rows of Bed with at least one overlap, each reported once
 71 |   return(subexome)
 72 | }
 73 | 
 74 | 
 75 | 
 76 | #' Get exomeGene
 77 | #' @param exome exome data
 78 | #' @param Bed path to bed file or data.frame
 79 | #' @param mutContext data.frame contain mutation context.
 80 | #' @importFrom data.table data.table
 81 | #' @return exomeGene file
 82 | #' @export
 83 | #' @examples
 84 | #' \dontrun{
 85 | #' get_exomeGene(exome, Bed, mutContext)
 86 | #' }
 87 | #'
 88 | get_exomeGene = function(exome, Bed = NULL, mutContext){
 89 |   ## Builds a table with one row per (gid, triMut_code, consequence) combination,
 90 |   ## the number of exome rows that fall into it ("count") and the "tag" that
 91 |   ## mutContext assigns to its triMut_code.
 92 | 
 93 |   ## 1. optionally restrict the exome to the positions overlapping `Bed`
 94 |   if(!is.null(Bed)){
 95 |     exomebed       = data.frame(Chromosome = exome[, "Chromosome"],
 96 |                                 Start = exome[, "pos"],
 97 |                                 End = exome[,"pos"])
 98 |     subexome       = exome[OverLap(exomebed, Bed),]
 99 |   }else{
100 |     subexome       = exome
101 |   }
102 | 
103 |   ## 2. one pass per consequence column of the exome table (A, C, G, T).
104 |   ##    The digit is appended to seq_code to form triMut_code; the order of
105 |   ##    this vector is the order in which the four tables are stacked below.
106 |   altDigit         = c(A = 1, C = 4, G = 3, T = 2)
107 | 
108 |   ## number of rows per (gid, triMut_code, consequence) for one column
109 |   countOneAlt      = function(alt){
110 |     perPos         = data.table(data.frame(gid = subexome[,"gid"],
111 |                                            triMut_code = paste0(subexome[,"seq_code"], altDigit[[alt]]),
112 |                                            consequence = subexome[,alt],
113 |                                            pos = subexome[,"pos"],
114 |                                            stringsAsFactors = FALSE))
115 |     grouped        = perPos[, length(pos), by = 'gid,triMut_code,consequence']
116 |     return(data.frame(grouped))
117 |   }
118 | 
119 |   ## unnamed list of four data.frames, in the order A, C, G, T
120 |   perAlt           = lapply(names(altDigit), countOneAlt)
121 | 
122 |   ## 3. stack the four tables
123 |   final            = do.call(rbind, perAlt)
124 | 
125 |   ## note replace N with 0
126 |   codeTxt          = sub("N", "0", final$triMut_code)
127 |   final$triMut_code = as.numeric(codeTxt)
128 | 
129 |   ## 4. keep only the codes that are listed in mutContext
130 |   isKnown          = final$triMut_code %in% mutContext[,"triMut_code"]
131 |   final            = final[isKnown,]
132 | 
133 |   ## 5. name the columns and look up the tag of every remaining code
134 |   colnames(final)  = c("gid", "triMut_code", "consequence", "count")
135 |   tagRow           = match(final$triMut_code, mutContext$triMut_code)
136 |   final$tag        = mutContext[tagRow, "tag"]
137 | 
138 |   ## columns: gid, triMut_code, consequence, count, tag
139 | 
140 |   return(final)
141 | }
142 | 
143 | 
144 | #' Count mutations per patient or per tri-nucleotide mutation context.
145 | #' @param mut data frame contain mutation info which must contain 'Start_Position' and 'Tumor_Sample_Barcode' or
146 | #' exomeGene - data.frame.
147 | #' @param gid default is NULL. provide gid list
148 | #' @param sampleN default is NULL. Provide list of sample name. This parameter is only useful when byContext is FALSE
149 | #' @param byContext A boolen. If TRUE, the number of mutation per tri-nucleotide context will be reported.
150 | #' If FALSE, the number of mutation per patient will be reported. Default is FALSE.
151 | #' @return mutCount data.frame contain number of mutation for each patient or
152 | #' a vector contains number of mutation per tri-nucleotide context
153 | #' @importFrom data.table data.table
154 | #' @export
155 | #' @examples
156 | #' \dontrun{
157 | #' count_Mut(mut)
158 | #' }
159 | count_Mut = function(mut, gid = NULL, sampleN = NULL, byContext = FALSE){
160 |   if(byContext){
161 |     if("Context" %in% colnames(mut)){        ## mutation table: one row per mutation, rows marked INDEL are skipped
162 |       if(is.null(gid)) gid = unique(mut[,"Ensembl_gene_id"])
163 |       mutab.snp     = mut[ mut[,"Context"] != "INDEL" & mut[,"Ensembl_gene_id"] %in% gid ,]
164 |       x             = table(mutab.snp[,"tag"])
165 |       y             = x[match(1:96,names(x))]
166 |     }else{
167 |       if(is.null(gid)) gid = unique(as.character(mut[,"gid"]))   ## exomeGene input: genes are in 'gid', not 'Ensembl_gene_id'
168 |       mutab.snp     = data.table(mut[  as.character(mut[,"gid"]) %in% gid ,])
169 |       x             = mutab.snp[, sum(count), by = 'tag']
170 |       y             = x[match(1:96, x$tag)]$V1
171 |     }
172 |     y[is.na(y)]     = 0                      ## tags that were not observed count as 0
173 |     names(y)        = 1:96
174 |     mutCount        = rep(0,96) + y
175 |   }else{
176 |     if(is.null(sampleN)) sampleN = as.character(unique(mut$Tumor_Sample_Barcode))   ## default: every sample seen in mut
177 |     hasMut          = nrow(mut) > 0          ## aggregate() stops on a table without rows, so it is only called when TRUE
178 |     if(is.null(gid)){
179 |       mutCount      = data.frame(Tumor_Sample_Barcode = sampleN, count = rep(0, length(sampleN)), stringsAsFactors = F)
180 |       rownames(mutCount) = sampleN
181 |       if(hasMut){
182 |         count       = aggregate(Start_Position ~ Tumor_Sample_Barcode, mut, length)
183 |         sid         = as.character(count$Tumor_Sample_Barcode)   ## rows are matched by name; a factor must not be used as index
184 |         known       = sid %in% sampleN                           ## samples outside sampleN are ignored instead of appended
185 |         if(any(known)) mutCount[sid[known], "count"] = count$Start_Position[known]
186 |       }
187 |     }else{
188 |       count           = matrix(0, nrow = length(gid), ncol = length(sampleN))   ## gene x sample, 0 where nothing was seen
189 |       rownames(count) = gid
190 |       colnames(count) = sampleN
191 |       if(hasMut){
192 |         mutCount    = aggregate(Start_Position ~ Ensembl_gene_id + Tumor_Sample_Barcode, mut,length)
193 |         g           = as.character(mutCount[,1]); s = as.character(mutCount[,2])
194 |         known       = g %in% gid & s %in% sampleN
195 |         if(any(known)) count[cbind(g[known], s[known])] = mutCount[known,3]   ## fill all (gene, sample) cells at once
196 |       }
197 |       mutCount        = count
198 |     }
199 |   }
200 |   return(mutCount)
201 | }
202 | 
203 | 
204 | #' get_glen
205 | #' @description  get gene length
206 | #' @param exomeGene data.frame
207 | #' @param selGid a vector of selected genes.
208 | #' @param byGene a boolean. If TRUE, a named vector with the length of each gene is returned.
209 | #' If FALSE, a single number, the total length of all genes, is returned. Default is FALSE.
210 | #' @export
211 | #' @importFrom data.table data.table
212 | #' @return a number or a vector
213 | #'
214 | get_glen = function(exomeGene, selGid = NULL, byGene= FALSE){
215 |   useAll       = is.null(selGid)
216 | 
217 |   if(!byGene){
218 |     ## single number: summed `count` of all (or only the selected) genes, divided by 3
219 |     if(useAll){
220 |       total    = sum(exomeGene[ "count"])
221 |     }else{
222 |       total    = sum(exomeGene[exomeGene$gid %in% selGid, "count"])
223 |     }
224 |     return(total/3)
225 |   }
226 | 
227 |   ## one value per gene: summed `count` of the gene divided by 3, named by gene id
228 |   geneRows     = if(useAll) exomeGene else exomeGene[exomeGene$gid %in% selGid, ]
229 |   perGene      = data.table(geneRows)[, sum(count), by = 'gid']
230 |   out          = perGene$V1/3
231 |   names(out)   = perGene$gid
232 |   return(out)
233 | 
234 | }
235 | 
236 | 
237 | ## ind: default is c(1,2,3,4,5) which are nonsil mutation
238 | #' getGeneBgProb
239 | #' @description  get the background mutation probability of each gene, without regression
240 | #' @param exomeGene data.frame
241 | #' @param prob probability of each tri-nucleotides mutation context
242 | #' @param consequences consequence codes that are flagged as 1 in the output (all other codes are flagged 0). Default is c(1,2,3,4,5), the nonsilent mutations.
243 | #' @param gid Gene ID list.
244 | #' @importFrom data.table data.table setkey
245 | #' @export
246 | #' @return data.frame background gene muation probabilty without regression
247 | #'
248 | getGeneBgProb = function(exomeGene, prob, consequences = c(1,2,3,4,5), gid = NULL){
249 |   if(!is.null(gid)) exomeGene = exomeGene[exomeGene$gid %in% gid, ]
250 |   gids               = unique(exomeGene$gid)
251 |   ## per row: probability looked up by tag times the row's count; flag is 1 when the consequence is listed, else 0
252 |   isListed           = as.numeric(exomeGene[,"consequence"] %in% consequences)
253 |   rowProb            = prob[as.numeric(exomeGene[,"tag"])]* as.numeric(exomeGene[,"count"])
254 |   df                 = data.table(data.frame(gid = exomeGene[,"gid"], prob = rowProb, consequence = isListed))
255 |   byGene             = df[, list(prob = sum(prob)), by = 'gid,consequence']
256 |   setkey(byGene, gid, consequence)
257 |   ## one row per gene for flag 0, followed by one row per gene for flag 1 (prob is NA where a gene has no such row)
258 |   out                = data.table(rbind( byGene[J(gids, 0),], byGene[J(gids, 1),]))
259 |   ## a missing probability is replaced by the smallest probability observed for the same flag
260 |   for(flag in c(0, 1)){
261 |     sameFlag         = out$consequence == flag
262 |     out$prob[is.na(out$prob) & sameFlag] = min(out$prob[sameFlag], na.rm = T)
263 |   }
264 |   setkey(out, gid, consequence)
265 |   return(out)
266 | }
267 | 
268 | 
269 | #' getsubData
270 | #' @description  get a subset of the original MutSet
271 | #' @param Data MutSet
272 | #' @param sampleN A list of sample names.
273 | #' @param gid A list of gene id.
274 | #' @export
275 | #' @return MutSet a subset of the original MutSet
276 | #
277 | getsubData = function(Data, sampleN = NULL, gid = NULL){
278 |   subData                 = Data$clone()            ## all changes below are made on the object returned by clone()
279 |   if(!is.null(sampleN)){
280 |     ## keep only the mutations and the sample records of the requested samples
281 |     subData$mut           = subData$mut[Data$mut$Tumor_Sample_Barcode %in% sampleN,]
282 |     subData$samples       = subData$samples[Data$samples$SampleID %in% sampleN,]
283 |   }
284 |   if(!is.null(gid)){
285 |     ## keep only the requested genes in every gene-related field
286 |     subData$mut           = subData$mut[subData$mut$Ensembl_gene_id %in% gid,]
287 |     subData$gid           = subData$gid[subData$gid %in% gid]
288 |     subData$covar         = subData$covar[gid, ]    ## fixed typo: the field is `covar`; reading `cover` yielded NULL
289 |     subData$exomeGene     = subData$exomeGene[subData$exomeGene$gid %in% gid, ]
290 |     subData$exome         = subData$exome[subData$exome$gid %in% gid, ]
291 |   }
292 |   return(subData)
293 | }
294 | 
295 | 
296 | 
297 | #' check each sample have at least one mutation.
298 | #' @description check each sample have at least one mutation.
299 | #' @param sample data.frame of sample
300 | #' @param mut Mutation table
301 | #' @return the sample data.frame, unchanged; a message is printed when some samples have no mutation (the filtering step is currently disabled)
302 | #
303 | checkMutC = function(sample, mut){
304 |   sampleIDs   = unique(sample$SampleID)   # samples are only reported here, never removed: `sample` is returned as is
305 |   if(any(!sampleIDs %in% mut$Tumor_Sample_Barcode)){
306 |     cat(sprintf("Total %s out of %s samples with at least one mutation detected.\nOnly the ones with at least one mutation will be used.\n",
307 |                 length(unique(mut$Tumor_Sample_Barcode)), length(sampleIDs)))
308 |   }
309 |   return(sample)
310 | }
311 | 
312 | 
313 | #' getEnsemblID
314 | #' @description  get ensembl ID from gene symbol
315 | #' @param geneID gene symbol
316 | #' @param geneInfof store the info symbol and ensembl ID.
317 | #' @importFrom limma alias2Symbol
318 | #' @export
319 | #' @return a vector of ensemble ID
320 | #
321 | getEnsemblID = function(geneID, geneInfof = "~/Data/GSMuta_data/GSMutaRData/extdata/ensembl_92_exon_pos.hg38.rda"){
322 |   geneinfo         = loadfile(geneInfof)
323 |   out              = geneinfo$ensembl_gene_id[match(geneID, geneinfo$hgnc_symbol)]
324 |   ## IDs without a direct symbol match are retried through the symbol returned by alias2Symbol()
325 |   for(i in which(is.na(out))){
326 |     gene           = geneID[i]
327 |     alias          = if(is.na(gene)) character(0) else alias2Symbol(gene)
328 |     ## alias2Symbol() may return no symbol (the comparison below would then stop) or several (the first is used)
329 |     if(length(alias) == 0) next
330 |     if(gene != alias[1]) out[i] = geneinfo$ensembl_gene_id[match(alias[1], geneinfo$hgnc_symbol)]
331 |   }
332 |   return(out)
333 | }
334 | 
335 | 
336 | #' getGeneSymbol
337 | #' @description  get gene symbol from ensembl ID
338 | #' @param ensemblID ensemble ID
339 | #' @param geneInfof store the info symbol and ensembl ID.
340 | #' @export
341 | #' @return a vector of gene symbol
342 | getGeneSymbol = function(ensemblID, geneInfof = "~/Data/GSMuta_data/GSMutaRData/extdata/ensembl_92_exon_pos.hg38.rda"){
343 |   annot            = loadfile(geneInfof)                          # object stored in the annotation file
344 |   hit              = match(ensemblID, annot$ensembl_gene_id)      # NA where the ID is not found
345 |   return(annot$hgnc_symbol[hit])
346 | }
347 | 
348 | 
349 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ecTMB: estimation and classification of TMB
  2 | 
  3 | ecTMB is a powerful and flexible statistical framework for TMB estimation and classification. It uses an explicit background mutation model for more robust and consistent TMB prediction. The background mutation model uses a Bayesian framework to account for unknown as well as known sources of mutational heterogeneity, including tri-nucleotide context, sample mutational burden, gene expression level and replication timing. The discovery of three TMB-based subtypes, including one novel subtype, TMB-extreme, enables ecTMB to classify samples into biologically and clinically relevant TMB subtypes.
  4 | 
  5 | ## Table of Contents
  6 | **[Dependency](#dependency)**<br>
  7 | **[Installation](#installation)**<br>
  8 | **[Example Usage](#example-usage)**<br>
  9 | **[License](#license)**<br>
 10 | 
 11 | ## Dependency
 12 | ecTMB has been successfully tested on an Intel(R) Xeon(R) CPU E5-2680 v4 machine with 28 cores.
 13 | 
 14 | ecTMB imports the following R packages: ggplot2, limma, reshape2, dplyr, R6, MASS, GenomicRanges, data.table, parallel, mixtools
 15 | 
 16 | ecTMB also depends on bedtools 2.27.1 and R 3.5.1.
 17 | 
 18 | You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) :
 19 | ```
 20 | conda install bedtools=2.27.1 r=3.5.1
 21 | ```
 22 | Then you can export the conda paths as:
 23 | ```
 24 | export PATH="/PATH/TO/CONDA/bin:$PATH"
 25 | export LD_LIBRARY_PATH="/PATH/TO/CONDA/lib:$LD_LIBRARY_PATH"
 26 | ```
 27 | 
 28 | ## Installation
 29 | ```
 30 | install.packages("devtools")
 31 | library(devtools);
 32 | devtools::install_github("bioinform/ecTMB");
 33 | ```
 34 | ## Download Example and Reference Data
 35 | ```
 36 | #Example file download from URL: https://www.dropbox.com/s/knpgl73samhdtvg/ecTMB_data.tar.gz?dl=1
 37 | URL = "https://github.com/bioinform/ecTMB/releases/download/v0.1.0/ecTMB_data.tar.gz"
 38 | download.file(URL,destfile = "ecTMB.example.tar.gz")
 39 | untar("./ecTMB.example.tar.gz")
 40 | 
 41 | URL_ref = "https://api.gdc.cancer.gov/data/254f697d-310d-4d7d-a27b-27fbf767a834"
 42 | download.file(URL_ref,destfile = "GRCh38.d1.vd1.fa.tar.gz")
 43 | untar("./GRCh38.d1.vd1.fa.tar.gz")
 44 | ```
 45 | 
 46 | 
 47 | ## Example Usage
 48 | * **Load ecTMB package and genome annotation reference files**
 49 | ```
 50 | library(ecTMB)
 51 | load("./ecTMB_data/example/UCEC.rda")
 52 | extdataDir             = "./ecTMB_data/references"
 53 | exomef                 = file.path(extdataDir, "exome_hg38_vep.Rdata" )  #### hg38 exome file
 54 | covarf                 = file.path(extdataDir,"gene.covar.txt")   ### gene properties
 55 | mutContextf            = file.path(extdataDir,"mutation_context_96.txt" )  ### 96 mutation contexts
 56 | TST170_panel           = file.path(extdataDir,"TST170_DNA_targets_hg38.bed" )  ### target regions of the TST170 panel (bed file)
 57 | ref                    = file.path("./","GRCh38.d1.vd1.fa" )
 58 | 
 59 | ```
 60 | * **Set a random two thirds (~67%) of the samples as training set and the rest as test set**
 61 | ```
 62 | set.seed(1002200)
 63 | SampleID_all   = UCEC_cli$sample
 64 | SampleID_train = sample(SampleID_all, size = round(2 * length(SampleID_all)/3), replace = F)
 65 | SampleID_test  = SampleID_all[!SampleID_all %in% SampleID_train]
 66 | ```
 67 | * **Generate train and test data object**
 68 | ```
 69 | ## mutations which are inconsistent with reference annotation files will be removed.
 70 | ## train data
 71 | trainData      = UCEC_mafs[UCEC_mafs$Tumor_Sample_Barcode %in% as.character(SampleID_train),]
 72 | trainset       = readData(trainData, exomef, covarf, mutContextf, ref)
 73 | 
 74 | ## test data for panel TST 170
 75 | sample         = data.frame(SampleID = SampleID_test, BED = TST170_panel, stringsAsFactors = FALSE)
 76 | testData       = UCEC_mafs[UCEC_mafs$Tumor_Sample_Barcode %in% as.character(SampleID_test),]
 77 | testset_panel  = readData(testData, exomef, covarf, mutContextf, ref, samplef = sample)
 78 | testset_WES    = readData(testData, exomef, covarf, mutContextf, ref)  ## to calculate WES-TMB for test samples
 79 | ```
 80 | * **Background mutation model training** 
 81 | ---
 82 | **NOTE**
 83 | 
 84 | This step takes up to ~12 mins when 24 parallel processes are used. You can skip 
 85 | and use the pre-loaded parameters defined from training data set.
 86 | 
 87 | ---
 88 | ```
 89 | MRtriProb_train= getBgMRtri(trainset)
 90 | trainedModel   = fit_model(trainset, MRtriProb_train, cores = 24)
 91 | ```
 92 | 
 93 | * **Predict TMB for TST170 panel**
 94 | ```
 95 | ## process time less than 1s. 
 96 | TMBs          =  pred_TMB(testset_panel, WES = testset_WES, cores = 1,
 97 |                         params = trainedModel, mut.nonsil = T, gid_nonsil_p = trainset$get_nonsil_passengers(0.95))
 98 |                         
 99 | ## plot the prediction.    
100 | library(dplyr)
101 | library(ggplot2)
102 | 
103 | TMBs %>% melt(id.vars = c("sample","WES_TMB")) %>% 
104 |   ggplot( aes(x = WES_TMB, y = value,  color = factor(variable, levels = c("ecTMB_panel_TMB",  "count_panel_TMB")), 
105 |               group = factor(variable))) + 
106 |   geom_point() +
107 |   geom_abline(slope = 1, intercept = 0) + 
108 |   scale_x_continuous(trans='log2') +
109 |   scale_y_continuous(trans='log2') +
110 |   theme_bw() +
111 |   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
112 |   theme(legend.title=element_blank()) +
113 |   labs(x = "TMB defined by WES", y = sprintf("Predicted TMB from panel: TST170"))
114 | ```
115 | 
116 | * **Classify sample to 3 subtypes**
117 | ```
118 | Subtypes      = assignClass(TMBs$ecTMB_panel_TMB, prior = GMM_params)
119 | ```
120 | 
121 | ## License
122 | ecTMB is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
123 | 
124 | 
125 | 


--------------------------------------------------------------------------------
/inst/nf/bed2vcf.changes.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # Enumerate every possible single-base substitution over the (extended) exon
 4 | # intervals and write them to <output>/output.vcf. The last VCF column holds the
 5 | # trinucleotide context of the site, encoded with A=1, T=2, G=3, C=4.
 6 | # Shells out to `awk` and bedtools' `fastaFromBed`, which must be on PATH.
 7 | 
 8 | use strict;
 9 | use warnings;
10 | 
11 | use Getopt::Long;
12 | 
13 | my $exon_file = "/net/kodiak/volumes/river/shared/users/yaol12/software/GSMuta_data/GSMuta/ensembl_81_exon_pos.hg38.txt";
14 | my $output = "./";
15 | my $help = 0; 
16 | my $fasta;
17 | 
18 | GetOptions('e=s'              => \$exon_file,
19 |            'o=s'              => \$output,
20 |            'f=s'              => \$fasta,
21 |            'h'                => \$help,  
22 | 	);
23 | 
24 | unless ($help == 0){
25 |         print <<EOF;
26 | 
27 |         Usage :  perl bed2vcf.changes.pl -e exon_file -f fasta -o output_dir
28 |                         -e : exon_file (3 fields, tab delimited; chr, start, end) 
29 |                         -f : path to fasta file
30 |                         -o : [ optional ] output directory, default is './'
31 | 
32 | EOF
33 |         exit;
34 | }
35 | 
36 | # Without a reference the fastaFromBed call below cannot work; fail early.
37 | die "ERROR: a reference fasta is required (-f); use -h for usage\n" unless defined $fasta;
38 | 
39 | if ( !-d $output){
40 | 	mkdir($output) or die "ERROR: cannot create output directory '$output': $!\n";
41 | }
42 | 
43 | my $outbed="${output}/extended.bed";
44 | my $outvcf="${output}/output.vcf";
45 | 
46 | open(OUT, ">", $outvcf) or die "ERROR: cannot write '$outvcf': $!\n";
47 | print "-- Generating VCF ... \n";
48 | 
49 | # Write the working intervals (start-3, end+2), then fetch the reference sequence
50 | # with one extra flanking base on each side so every site has both neighbours.
51 | `awk '{OFS="\t"} {print \$1,\$2-3,\$3+2}' $exon_file  > $outbed`;
52 | die "ERROR: failed to create '$outbed' from '$exon_file'\n" if $? != 0;
53 | my @line = split /\n+/,`awk '{OFS="\t"}{print \$1,\$2-1,\$3+1}' $outbed | fastaFromBed -fi $fasta -bed stdin -fo stdout`;
54 | die "ERROR: no sequence was retrieved from '$fasta' for '$outbed'\n" unless @line;
55 | 
56 | print "Retrieve reference is done ... \n";
57 | 
58 | # Reference bases keyed by "chr:pos" (0-based pos). Keying by position alone
59 | # lets intervals on different chromosomes overwrite each other's bases.
60 | my %ref;
61 | my ($chrom, $start);
62 | foreach my $line (@line){
63 |   if ($line =~ />/){
64 |     # fastaFromBed headers look like ">chr:start-end"
65 |     ($chrom, $start) = $line =~ /^>(.+):(\d+)-\d+/
66 |       or die "ERROR: unexpected fasta header '$line'\n";
67 |   }else{
68 |     chomp $line;
69 |     my @string = split //,$line;
70 |     for my $j (0 ..$#string){
71 |       $ref{"$chrom:".($j+$start)} = $string[$j];
72 |     }
73 |   }
74 | }
75 | 
76 | open(A, "<", $outbed) or die "ERROR: cannot read '$outbed': $!\n";
77 | while(<A>){
78 |   my ($chr,$from,$to) = split /\s+/,$_;
79 |   for my $k($from ..($to-1)){
80 |     # no reference base (interval not returned by fastaFromBed): nothing to emit
81 |     next unless defined $ref{"$chr:$k"};
82 |     my $ref_allele = uc($ref{"$chr:$k"});
83 |     my $context = join("", uc($ref{"$chr:".($k-1)}), $ref_allele, uc($ref{"$chr:".($k+1)}));
84 |     $context =~ tr/ACGT/1432/;
85 | 
86 |     # one record per alternate allele that differs from the reference; POS is $k+1
87 |     foreach my $alt_allele ("A", "C", "G", "T"){
88 |       next if $ref_allele eq $alt_allele;
89 |       print OUT $chr,"\t",$k+1,"\t", ".\t", $ref_allele,"\t$alt_allele\t.\t.\t$context\n";
90 |     }
91 |   }
92 | }
93 | close A;
94 | close OUT or die "ERROR: failed to finish writing '$outvcf': $!\n";
95 | undef %ref;
96 | undef @line;


--------------------------------------------------------------------------------
/inst/nf/create.exome.Rdata.R:
--------------------------------------------------------------------------------
 1 | # Build the exome annotation table from the merged, processed VEP/MAF table and
 2 | # save it as an .Rdata file (object `exome`) plus a tab-delimited .tsv copy.
 3 | library(data.table)
 4 | options(scipen = 999)
 5 | 
 6 | args <- commandArgs(trailingOnly=TRUE)
 7 | 
 8 | # args[1] is the input table and args[3] names the outputs, so all three are required.
 9 | if (length(args) < 3){
10 | 	stop("Usage: Rscript create.exome.Rdata.R input_file mutation_context_file (e.g. mutation_context_96.txt) output_file_1(e.g. exome_hg19_vep.Rdata)", call.=FALSE)
11 | }
12 | 
13 | 
14 | #' Generate output_file_1 (e.g. exome_hg19_vep.Rdata)
15 | data <- read.table(pipe(sprintf("sort -k 1,1 -k2,2n %s",shQuote(args[1]))),sep="\t", stringsAsFactors = F)
16 | 
17 | sel = data[data[,1] %in% paste0("chr", c(1:22,"X","Y")),]
18 | # sel = sel[order(data[, 2]), ]
19 | # sel = sel[order(data[, 1]), ]
20 | sel = unique(sel[,c(1,2,9,3:8)])
21 | colnames(sel) = paste0("V",1:9)
22 | class(sel$V3) = 'character'
23 | 
24 | 
25 | # res = vector("list",24)
26 | 
27 | # for (i in 1:24){
28 | #   cat("Chromosome",i, "\n")
29 | 
30 | # 	if ( i ==23){
31 | # 		sel = data[data[,1]=="chrX",]
32 | # 	}else if (i==24){
33 | # 		sel =data[data[,1]=="chrY",]
34 | # 	}else{
35 | # 		sel = data[data[,1]==paste0("chr",i),]
36 | #   }
37 | 
38 | # 	cat("Number Positions:", nrow(sel), "\n")
39 | # 	cat("Number Genes:", length(unique(sel[,7])), "\n")
40 | # 	sel = sel[order(as.numeric(sel[,2])),]
41 | # 	res[[i]] = unique(sel[,c(1,2,9,3:8)])
42 | # 	colnames(res[[i]]) = paste0("V",1:9)
43 | # 	class(res[[i]]$V3) = 'character'
44 | # }
45 | 
46 | # exome = do.call(rbind, res)
47 | # exome$V1 = as.character(exome$V1)
48 | # exome$V3 = as.character(exome$V3)
49 | # exome$V8 = as.character(exome$V8)
50 | # exome$V9 = as.character(exome$V9)
51 | exome = sel  # `exome` is the object name stored in the .Rdata file
52 | save(exome, file=args[3])
53 | write.table(exome, file=sub("Rdata","tsv", args[3]), sep="\t", quote=F, row.names=F, col.names=T)  # name derived from the OUTPUT (args[3]); args[2] is an input file and must not be written to
54 | 
55 | 
56 | 
57 | # #' Generate output_file_2 (e.g. exome_gene_hg19_vep.Rdata)
58 | # mut.context = read.table(args[2])
59 | 
60 | # preprocess.BM<-function(X,mut.context)
61 | # {
62 | 
63 | # 	final = NULL
64 | # 	for(i in 1:23)
65 | #   	{
66 | # 		if(nrow(X[[i]]) > 0)
67 | # 		{
68 | #      			cat("Chromosome", i, "\n")
69 | #       			m =  X[[i]]
70 | 
71 | #       			tmp = data.table(data.frame(gene = m[,7],tag = paste0(m[,2],1),ind = m[,3], pos = m[,1]))
72 | #       			a = data.frame(tmp[, length(pos), by = 'gene,tag,ind'])
73 | 
74 | #       			tmp = data.table(data.frame(gene = m[,7],tag = paste0(m[,2],4),ind = m[,4], pos = m[,1]))
75 | #       			b = data.frame(tmp[, length(pos), by = 'gene,tag,ind'])
76 | 
77 | #       			tmp = data.table(data.frame(gene = m[,7],tag = paste0(m[,2],3),ind = m[,5], pos = m[,1]))
78 | #       			c = data.frame(tmp[, length(pos), by = 'gene,tag,ind'])
79 | 
80 | #       			tmp = data.table(data.frame(gene = m[,7],tag = paste0(m[,2],2),ind = m[,6], pos = m[,1]))
81 | #       			d = data.frame(tmp[, length(pos), by = 'gene,tag,ind'])
82 | 
83 | #       			final = rbind(final,a[a[,2] %in% mut.context[,3],])
84 | #       			final = rbind(final,b[b[,2] %in% mut.context[,3],])
85 | #       			final = rbind(final,c[c[,2] %in% mut.context[,3],])
86 | #       			final = rbind(final,d[d[,2] %in% mut.context[,3],])
87 | 
88 | #      		}
89 | #   	}
90 | # 	return(final)
91 | # }
92 | 
93 | # cat("Start generating output_file_2\n")
94 | 
95 | # exomgene = preprocess.BM(res,mut.context)
96 | # exomgene = unique(exomgene)
97 | # save(exomgene,file = args[4])
98 | 
99 | 


--------------------------------------------------------------------------------
/inst/nf/ecTMB_refbuild.nf:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env nextflow
  2 | 
  3 | @Grab('com.xlson.groovycsv:groovycsv:1.1')
  4 | import static com.xlson.groovycsv.CsvParser.parseCsv
  5 | 
  6 | /*
  7 | vim: syntax=groovy
  8 | -*- mode: groovy;-*-
  9 | */
 10 | 
 11 | // parameters; the absolute path defaults below are site-specific
 12 | params.exon = ""   // exon bed file, split into chunks by splitbed; empty by default
 13 | params.ref  = ""   // reference fasta path, passed as --ref-fasta to vcf2maf; empty by default
 14 | params.assembly = "GRCh38"
 15 | params.vepCache = "/sc1/groups/bfx-red/data/datainsights/users/yaol12/vep_data/"  
 16 | params.vepDocker = "ensemblorg/ensembl-vep:release_96.0"
 17 | params.vepCacheVersion = "96"
 18 | params.outDir = "./"   // published results go to <outDir>/<vepCacheVersion>_<assembly>/
 19 | params.codeDir = "/sc1/groups/bfx-red/users/yaol12/software/ecTMB_refbuild/inst/nf/"
 20 | params.mutContext96="/sc1/groups/bfx-red/data/datainsights/users/yaol12/GSMuta_data/GSMuta/mutation_context_96.txt"
 21 | params.nline = 10      // lines per chunk (split -l) in splitbed
 22 | params.pick = false    // true: process.vep.output.pl branch; false: vcf2maf + process.maf.output.pl branch
 23 | params.vcf2maf="/sc1/groups/bfx-red/users/yaol12/software/vcf2maf/tmp/vcf2maf/vcf2maf_edit.pl"
 24 | 
 25 | 
 26 | println "=======INFO==================================================="
 27 | println "=======BEGIN=================================================="
 28 | println "exon bed file: $params.exon"
 29 | println "ref path: $params.ref"
 30 | println "assembly: $params.assembly"
 31 | println "vep Cache version: $params.vepCacheVersion"
 32 | println "vepCache: $params.vepCache"
 33 | println "vepDocker:$params.vepDocker"
 34 | println "outDir:   $params.outDir"
 35 | println "codeDir:  $params.codeDir"
 36 | println "mutContext96: $params.mutContext96"
 37 | println "nline: $params.nline"
 38 | println "pick: $params.pick"
 39 | println "vcf2maf: $params.vcf2maf"
 40 | println "=======END===================================================="
 41 | println ""
 42 | println ""
 43 | 
 44 | bed2vcf = params.codeDir + "bed2vcf.changes.pl"   // plain string concatenation: params.codeDir must end with "/"
 45 | processVEP = params.codeDir + "process.vep.output.pl"
 46 | processMAF = params.codeDir + "process.maf.output.pl"
 47 | createRdata = params.codeDir + "create.exome.Rdata.R"
 48 | 
 49 | ///////////////// Split bed file: params.nline lines per chunk, files named split_* //////////////////////////////
 50 | process splitbed{
 51 | 	tag ("$params.exon")
 52 | 	validExitStatus 0,42
 53 |     cpus 1
 54 |     clusterOptions { "-q all.q -l h_vmem=${1 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
 55 |     // h_vmem request above: 1G on the first attempt, +2G per further attempt
 56 |     input:
 57 |     	file(exon) from Channel.fromPath(params.exon)
 58 | 
 59 |     // 'mode flatten': NOTE(review) presumably emits each split_* file as a separate item — confirm against Nextflow docs
 60 | 	output:
 61 | 		file ('split_*') into splitbedCh mode flatten
 62 | 
 63 | 	script:
 64 | 		"""
 65 | 		split -l ${params.nline} ${exon} split_
 66 | 		"""
 67 | 
 68 | }
 69 | 
 70 | 
 71 | process vepvcf{   // per chunk: enumerate all SNVs (bed2vcf.changes.pl), then annotate them with VEP
 72 | 	tag ("${bedN}")
 73 | 	validExitStatus 0,42
 74 |     cpus 1
 75 |     clusterOptions { "-q all.q -l h_vmem=${5 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
 76 |     // This process defines annVcfCh, annVcfCh1 and annVcfCh2, which the downstream steps read.
 77 |     input:
 78 |     	file(bed) from splitbedCh
 79 | 
 80 |     output:
 81 |     	file( "${bedN}.ann.vcf" ) into (annVcfCh, annVcfCh1, annVcfCh2)
 82 | 
 83 |     script:
 84 |     	bedN = bed.getName()
 85 |         if(params.pick){
 86 |             extra="--pick"
 87 |         }else{
 88 |             extra = ""
 89 |         }
 90 | 
 91 |     	"""
 92 |     	perl ${bed2vcf} -e ${bed} -f ${params.ref}
 93 | 
 94 |     	docker run --rm -u=\$UID -v /sc1/:/sc1/ -v ${params.vepCache}:/opt/vep/.vep  \
 95 |     	-v \$PWD:\$PWD -w \$PWD ${params.vepDocker} /opt/vep/src/ensembl-vep/vep \
 96 |     	--offline --no_stats --domains --failed 1 --total_length --biotype \
 97 |     	--fork 1 --force_overwrite --fa ${params.ref} --dir /opt/vep/.vep \
 98 |     	--vcf -i output.vcf -o ${bedN}.ann.vcf --assembly ${params.assembly} --cache_version ${params.vepCacheVersion} ${extra}
 99 |     	"""
100 | }
101 | 
102 | annVcfCh1.toSortedList().set{annVcfChmerge}
103 | // Concatenate the per-chunk annotated VCFs into merged.ann.vcf: header lines once, then all records.
104 | process mergeVCF{
105 |     tag("mergeVCF")
106 |     validExitStatus 0,42
107 |     cpus 1
108 |     clusterOptions { "-q all.q -l h_vmem=${5 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
109 |     publishDir "${params.outDir}/${params.vepCacheVersion}_${params.assembly}/", mode: 'link'
110 |     // NOTE(review): the script relies on a staged input being named split_1.ann.vcf — confirm the wildcard staging
111 |     input:
112 |         file "split_*.ann.vcf" from annVcfChmerge
113 |     output:
114 |         file "merged.ann.vcf" into finalVCF
115 | 
116 |     script:
117 |         """
118 |         ## output header (only lines that START with '#'; a '#' inside a record is data)
119 |         grep "^#" split_1.ann.vcf > header.tsv
120 | 
121 |         ## concat
122 |         { cat header.tsv;  cat split_*.ann.vcf | grep -v '^#'; } > merged.ann.vcf
123 |         """
124 | }
125 | 
126 | 
127 | 
128 | if(params.pick){
129 |     process postprocess{   // per chunk: run process.vep.output.pl on the annotated VCF
130 |         tag("$vcfN")
131 |         validExitStatus 0,42
132 |         cpus 1
133 |         clusterOptions { "-q all.q -l h_vmem=${5 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
134 | 
135 |         input:
136 |             file(vcf) from annVcfCh
137 | 
138 |         output:
139 |             file( "${vcfN}_processed_vep.tsv" ) into annVEPCh
140 | 
141 |         script:
142 |             vcfN = vcf.getName().replaceAll(/\.ann\.vcf/, "")
143 |             // vcfN: input file name without ".ann.vcf"; used to give each chunk its own output name
144 |             """
145 |             ## process vep, output name processed_vep.tsv
146 |             perl ${processVEP} -i ${vcf}
147 |             mv processed_vep.tsv ${vcfN}_processed_vep.tsv
148 |             """
149 |     }
150 |     annVEPCh.toSortedList().set{annVEPChmerge}
151 | 
152 |     // Concatenate all per-chunk tables and sort by column 1, then numerically by column 2.
153 |     process mergeVEP{
154 |         tag("mergeVEP")
155 |         validExitStatus 0,42
156 |         cpus 1
157 |         clusterOptions { "-q all.q -l h_vmem=${5 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
158 |         publishDir "${params.outDir}/${params.vepCacheVersion}_${params.assembly}/", mode: 'link'
159 | 
160 |         input:
161 |             file "split_*._processed_vep.tsv" from annVEPChmerge
162 |         output:
163 |             file "merged.processed_vep.tsv" into finalVEP
164 | 
165 |         script:
166 |             """
167 |             ## concat
168 |             cat  split_*._processed_vep.tsv | sort -k 1,1 -k2,2n > merged.processed_vep.tsv
169 |             """
170 | 
171 |     }
172 |     // Build the exome .Rdata from the merged table with create.exome.Rdata.R (h_vmem request starts at 190G).
173 |     process finalprocess{
174 |         tag("merge")
175 |         validExitStatus 0,42
176 |         cpus 1
177 |         clusterOptions { "-q highmem.q -l hp -l h_vmem=${190 + 50 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
178 |         publishDir "${params.outDir}/${params.vepCacheVersion}_${params.assembly}/", mode: 'link'
179 | 
180 |         input:
181 |             file(vep) from finalVEP
182 | 
183 |         output:
184 |             file("exome_${params.vepCacheVersion}_${params.assembly}_vep.Rdata") into finalch
185 | 
186 |         script:
187 |             """
188 |             ## 
189 |             Rscript ${createRdata} ${vep} ${params.mutContext96} exome_${params.vepCacheVersion}_${params.assembly}_vep.Rdata
190 |             """
191 |     }
192 | 
193 | }else{
194 |     // per chunk: annotated VCF -> MAF (vcf2maf) -> processed table (process.maf.output.pl)
195 |     process vcf_maf{
196 |         tag("$vcfN")
197 |         validExitStatus 0,42
198 |         cpus 1
199 |         clusterOptions { "-q all.q -l h_vmem=${5 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
200 | 
201 |         input:
202 |             file(vcf) from annVcfCh2
203 | 
204 |         output:
205 |             file( "${vcfN}_processed_maf.tsv" ) into processedMAFCh
206 | 
207 |         script:
208 |             vcfN = vcf.getName().replaceAll(/\.ann\.vcf/, "")
209 | 
210 |             """
211 |             ## convert vcf to maf, then paste the first INFO field of each VCF record in front of its MAF row; headers are only lines STARTING with '#' or Hugo_Symbol, so both sides of the paste keep the same rows
212 |             perl ${params.vcf2maf} --input-vcf ${vcf} --output-maf ${vcfN}.maf --ref-fasta  ${params.ref} --filter-vcf 0 --ncbi-build ${params.assembly} 
213 |             paste <(grep -v '^#' ${vcf} | cut -f8 | cut -d ';' -f 1 ) <( egrep -v "^Hugo_Symbol|^#"  ${vcfN}.maf) >  ${vcfN}.forprocess.maf
214 |             perl ${processMAF} -i ${vcfN}.forprocess.maf
215 |             mv processed_MAF.tsv ${vcfN}_processed_maf.tsv
216 | 
217 |             """
218 | 
219 |     }
220 | 
221 |     processedMAFCh.toSortedList().set{annMAFChmerge}
222 |     // Concatenate all per-chunk tables and sort by column 1, then numerically by column 2.
223 |     process mergeMAF{
224 |         tag("mergeMAF")
225 |         validExitStatus 0,42
226 |         cpus 1
227 |         clusterOptions { "-q all.q -l h_vmem=${5 + 2 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
228 |         publishDir "${params.outDir}/${params.vepCacheVersion}_${params.assembly}/", mode: 'link'
229 | 
230 |         input:
231 |             file "split_*._processed_maf.tsv" from annMAFChmerge
232 |         output:
233 |             file "merged.processed_maf.tsv" into finalMAF
234 | 
235 |         script:
236 |             """
237 |             ## concat
238 |             cat  split_*._processed_maf.tsv | sort -k 1,1 -k2,2n > merged.processed_maf.tsv
239 |             """
240 | 
241 |     }
242 | 
243 | 
244 |     // Build the exome .Rdata from the merged MAF-derived table with create.exome.Rdata.R (h_vmem request starts at 190G).
245 |     process finalprocessMAF{
246 |         tag("merge")
247 |         validExitStatus 0,42
248 |         cpus 1
249 |         clusterOptions { "-q highmem.q -l hp -l h_vmem=${190 + 50 * (task.attempt -1)}G -pe smp 1 -P rssrbfx -v PATH" }
250 |         publishDir "${params.outDir}/${params.vepCacheVersion}_${params.assembly}/", mode: 'link'
251 | 
252 |         input:
253 |             file(vep) from finalMAF
254 | 
255 |         output:
256 |             file("exome_${params.vepCacheVersion}_${params.assembly}_vep.Rdata") into finalchmaf
257 | 
258 |         script:
259 |             """
260 |             ## 
261 |             Rscript ${createRdata} ${vep} ${params.mutContext96} exome_${params.vepCacheVersion}_${params.assembly}_vep.Rdata
262 |             """
263 |     }
264 | 
265 | }
266 | 
267 | 
268 | 
269 | 
270 | 
271 | 
272 | 
273 | 
274 | 
275 | 
276 | 
277 | 


--------------------------------------------------------------------------------
/inst/nf/nextflow.config:
--------------------------------------------------------------------------------
 1 | trace {   // per-task trace table written to pipeline_trace.txt
 2 |     enabled = true
 3 |     file = 'pipeline_trace.txt'
 4 |     fields = 'task_id,name,container,status,exit,start,realtime,%cpu,rss,hash,native_id,process,tag,module,attempt,submit,complete,duration,%mem,vmem,peak_rss,peak_vmem,rchar,wchar,syscr,syscw,read_bytes,write_bytes'
 5 |     raw = true
 6 | }
 7 | // timeline report written to pipeline_timeline.html
 8 | timeline {
 9 |     enabled = true
10 |     file = 'pipeline_timeline.html'
11 | }
12 | // 'standard' uses the local executor with errorStrategy 'finish'
13 | profiles {
14 |   standard {
15 |     process.executor = 'local'
16 |     process.errorStrategy = 'finish'
17 |     process.maxRetries = 1
18 |   }
19 |   // 'cluster' uses the 'uge' executor (queue size 200, 'smp' parallel environment) and retries failed tasks up to 3 times
20 |   cluster {
21 |     process.executor = 'uge'
22 |     executor.queueSize = 200
23 |     process.penv = 'smp'
24 |     process.errorStrategy = 'retry'
25 |     process.maxRetries = 3
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/inst/nf/process.maf.output.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Collapse the per-alternate-allele MAF rows (with an extra column 0 holding the
  4 | # site context) into one row per site, written to <outDir>/processed_MAF.tsv:
  5 | #   chr, pos, effect(A), effect(C), effect(G), effect(T), gene, protein, context
  6 | # Effect codes: 0 - silent (also written for alleles without a code), 1 - nonsilent.
  7 | 
  8 | use strict;
  9 | use warnings;
 10 | 
 11 | use Getopt::Long;
 12 | 
 13 | my $input;
 14 | my $outDir = "./";
 15 | 
 16 | GetOptions('i=s'              => \$input,
 17 |            'o=s'              => \$outDir,
 18 |         );
 19 | 
 20 | unless ($input){
 21 |         print <<EOF;
 22 | 
 23 |         Usage :  perl process.maf.output.pl -i input_file -o output
 24 |                         -i : input MAF file with the context column pasted in front
 25 |                         -o : outDir - [ optional ], default is './'
 26 | 
 27 | EOF
 28 |         exit;
 29 | }
 30 | 
 31 | if (! -e "$input"){
 32 | 	# keep the original exit status, but say why nothing was produced
 33 | 	warn "WARNING: input '$input' does not exist, nothing to do\n";
 34 | 	exit;
 35 | }
 36 | 
 37 | 
 38 | open(OUT, ">", "$outDir/processed_MAF.tsv") or die "ERROR: cannot write '$outDir/processed_MAF.tsv': $!\n";
 39 | 
 40 | ### A,C, G,T 
 41 | 
 42 | # State of the site currently being accumulated.
 43 | my $start = 0;
 44 | my $chr_old;
 45 | my $gene_old;
 46 | my $protein_old;
 47 | my $context_old;
 48 | my %saw;          # alternate allele => effect code for the current site
 49 | 
 50 | 
 51 | open(IN, "<", $input) or die "ERROR: cannot read '$input': $!\n";
 52 | while(<IN>){
 53 | 	next if /^#/;
 54 | 	chomp $_;
 55 | 	
 56 | 	my ($chr,$pos,$alt,$context, $cons, $gene, $protein) = (split /\t/, $_)[ 5,6,13,0,9,48,54];
 57 | 	# $chr =~ s/^chr//;
 58 | 
 59 | 	$cons = &effect($cons);
 60 | 	if ($cons eq "NA"){
 61 | 		next;
 62 | 	}	
 63 | 
 64 | 	# A change of chromosome or position starts a new site: emit the previous one.
 65 | 	if (!defined($chr_old) || $chr ne $chr_old || $pos ne $start){
 66 | 		&print_site() if defined $chr_old;
 67 | 		%saw = ();
 68 | 	}
 69 | 	$saw{$alt} = $cons;
 70 | 
 71 | 	$start = $pos;
 72 | 	$chr_old = $chr;
 73 | 	$gene_old = $gene;
 74 | 	$protein_old = $protein;
 75 | 	$context_old = $context;
 76 | }
 77 | close IN;
 78 | 
 79 | # Emit the last site after the loop rather than on eof() inside it: when the
 80 | # final input row is skipped (comment or "NA" effect) an in-loop check is never
 81 | # reached and the last accumulated site would be lost.
 82 | &print_site() if defined $chr_old;
 83 | close OUT or die "ERROR: failed to finish writing '$outDir/processed_MAF.tsv': $!\n";
 84 | 
 85 | 
 86 | # Write the accumulated site as one tab-delimited row.
 87 | sub print_site{
 88 | 	print OUT join("\t",$chr_old,$start, $saw{"A"}||'0', $saw{"C"} ||'0', $saw{"G"} || '0', $saw{"T"} || '0', $gene_old, $protein_old,$context_old),"\n";
 89 | }
 90 | 
 91 | # Map the classification column to an effect code: "0", "1", or "NA" (row is skipped).
 92 | # NOTE(review): the spaces after In_Frame_Del, In_Frame_Ins and Frame_Shift_Del are
 93 | # literal, so those alternatives only match when a space follows - confirm intended.
 94 | sub effect{
 95 | 	my ($effect) = @_;
 96 | 	return "NA" unless defined $effect;   # row too short to carry the column
 97 |     if ($effect =~ /Silent/ ){
 98 |         return "0";
 99 |     }elsif($effect =~ m(Missense_Mutation|In_Frame_Del |Nonsense_Mutation|In_Frame_Ins |Frame_Shift_Del |Frame_Shift_Ins)){
100 |         return "1"
101 |     }else{
102 |         return "NA"
103 |     }
104 | }
105 | 
106 | #  0 - silent
107 | #  1 - nonsilent


--------------------------------------------------------------------------------
/inst/nf/process.vep.output.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Collapse a VEP-annotated VCF (one record per alternate allele) into one row
  4 | # per site, written to <outDir>/processed_vep.tsv:
  5 | #   chr, pos, effect(A), effect(C), effect(G), effect(T), gene, protein, context
  6 | # Effect codes are listed at the end of this file; alleles without a code are
  7 | # written as 0.
  8 | 
  9 | use strict;
 10 | use warnings;
 11 | 
 12 | use Getopt::Long;
 13 | 
 14 | my $input;
 15 | my $outDir = "./";
 16 | 
 17 | GetOptions('i=s'              => \$input,
 18 |            'o=s'              => \$outDir,
 19 |         );
 20 | 
 21 | unless ($input){
 22 |         print <<EOF;
 23 | 
 24 |         Usage :  perl process.vep.output.pl -i input_file -o output
 25 |                         -i : input VEP-annotated VCF file
 26 |                         -o : outDir - [ optional ], default is './'
 27 | 
 28 | EOF
 29 |         exit;
 30 | }
 31 | 
 32 | if (! -e "$input"){
 33 | 	# keep the original exit status, but say why nothing was produced
 34 | 	warn "WARNING: input '$input' does not exist, nothing to do\n";
 35 | 	exit;
 36 | }
 37 | 
 38 | 
 39 | open(OUT, ">", "$outDir/processed_vep.tsv") or die "ERROR: cannot write '$outDir/processed_vep.tsv': $!\n";
 40 | 
 41 | ### A,C, G,T 
 42 | 
 43 | # State of the site currently being accumulated.
 44 | my $start = 0;
 45 | my $chr_old;
 46 | my $gene_old;
 47 | my $protein_old;
 48 | my $context_old;
 49 | my %saw;          # alternate allele => effect code for the current site
 50 | 
 51 | 
 52 | open(IN, "<", $input) or die "ERROR: cannot read '$input': $!\n";
 53 | while(<IN>){
 54 | 	next if /^#/;
 55 | 	chomp $_;
 56 | 	
 57 | 	# fields are taken after splitting the record on tab, ';' and '|'
 58 | 	my ($chr,$pos,$alt,$context, $cons, $gene, $protein) = (split /\t|;|\|/, $_)[0,1,4,7,9,12,22];
 59 | 	# $chr =~ s/^chr//;
 60 | 
 61 | 	$cons = &effect($cons);
 62 | 	if ($cons eq "NA"){
 63 | 		next;
 64 | 	}	
 65 | 
 66 | 	# A change of chromosome or position starts a new site: emit the previous one.
 67 | 	if (!defined($chr_old) || $chr ne $chr_old || $pos ne $start){
 68 | 		&print_site() if defined $chr_old;
 69 | 		%saw = ();
 70 | 	}
 71 | 	$saw{$alt} = $cons;
 72 | 
 73 | 	$start = $pos;
 74 | 	$chr_old = $chr;
 75 | 	$gene_old = $gene;
 76 | 	$protein_old = $protein;
 77 | 	$context_old = $context;
 78 | }
 79 | close IN;
 80 | 
 81 | # Emit the last site after the loop rather than on eof() inside it: when the
 82 | # final input row is skipped (comment or "NA" effect) an in-loop check is never
 83 | # reached and the last accumulated site would be lost.
 84 | &print_site() if defined $chr_old;
 85 | close OUT or die "ERROR: failed to finish writing '$outDir/processed_vep.tsv': $!\n";
 86 | 
 87 | 
 88 | # Write the accumulated site as one tab-delimited row.
 89 | sub print_site{
 90 | 	print OUT join("\t",$chr_old,$start, $saw{"A"}||'0', $saw{"C"} ||'0', $saw{"G"} || '0', $saw{"T"} || '0', $gene_old, $protein_old,$context_old),"\n";
 91 | }
 92 | 
 93 | # Map a consequence string to an effect code; the first matching rule wins and
 94 | # "NA" means the record is skipped.
 95 | sub effect{
 96 | 	my ($effect) = @_;
 97 | 	return "NA" unless defined $effect;   # record too short to carry the field
 98 |        	return "5" if ($effect =~ /splice_acceptor_variant|splice_donor_variant|transcript_ablation/);    		  
 99 | 	return "2" if( $effect =~ /stop_gained/ );
100 |     	return "3" if( $effect =~ /stop_lost/ );
101 |     	return "4" if( $effect =~ /initiator_codon_variant|start_lost/ );
102 |     	return "1" if( $effect =~ /missense_variant|conservative_missense_variant|rare_amino_acid_variant/ );
103 | 	return "NA" if ($effect =~ /transcript_amplification/);
104 |     	return "0" if( $effect =~ /synonymous_variant|stop_retained_variant/ );
105 |     	return "NA" if ( $effect =~ /splice_region_variant/ );
106 | 	return "0" if ($effect =~ /incomplete_terminal_codon_variant/);
107 | 	return "1" if ($effect =~ /protein_altering_variant|coding_sequence_variant/);
108 | 	return "NA";		
109 | }
110 | 
111 | 
112 | #  0 - silent
113 | #  1 - miss-sense
114 | #  2 - nonsense
115 | #  3 - nonstop
116 | #  4 - TSS
117 | #  5 - splice 


--------------------------------------------------------------------------------
/inst/perl/Sequence_Retrieve.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -w
 2 | 
 3 | # Append a "Context" column to a tab-delimited mutation table. For each row the
 4 | # reference sequence of the interval (col3-2, col3+1) on the sequence named in
 5 | # column 2 is fetched with fastaFromBed, column 12 is appended to it, and the
 6 | # result is encoded with A=1, T=2, G=3, C=4. Rows matching In_frame or
 7 | # Frame_shift get "INDEL" instead. Needs grep, awk and fastaFromBed on PATH.
 8 | #
 9 | # Usage: perl Sequence_Retrieve.pl <maf> <reference.fasta> <output>
10 | 
11 | use strict;
12 | use warnings;
13 | 
14 | die "Usage: perl Sequence_Retrieve.pl <maf> <reference.fasta> <output>\n" unless @ARGV >= 3;
15 | 
16 | my $maf = $ARGV[0];
17 | my $ref = $ARGV[1];
18 | my $out = $ARGV[2];
19 | 
20 | 
21 | my $chr;
22 | my $id;
23 | my $pos;
24 | my %ref;   # "chr:start" (start as printed in the fasta header) => upper-cased sequence
25 | my @line = split /\n+/,`grep -v "Ensembl" $maf | awk '{OFS="\t"}{print \$2,\$3-2,\$3+1}' | fastaFromBed -fi $ref -bed stdin -fo stdout`;
26 | foreach my $line (@line){
27 | 	if ($line =~ />/){
28 | 		($chr, $pos) = (split />|:|-/,$line)[1..2];
29 |                 $id = join("",$chr,":",$pos);
30 | 	}else{
31 | 		 chomp $line;
32 | 		 # upper-case so lower-case (soft-masked) reference bases are encoded by convert() too
33 |                  $ref{$id} = uc $line;
34 | 	}
35 | }
36 | 
37 | 
38 | open(O, ">", $out) or die "ERROR: cannot write '$out': $!\n";
39 | open(IN, "<", $maf) or die "ERROR: cannot read '$maf': $!\n";
40 | while(<IN>){
41 | 	if (/^Ensembl_gene_id/){
42 | 		# header row: add the name of the new column
43 | 		chomp $_;
44 | 		s/\s+$//g;
45 | 		print O $_,"\t","Context\n";
46 | 	}else{
47 | 		chomp $_;
48 | 		if (/In_frame/ || /Frame_shift/){
49 | 			print O $_,"\tINDEL\n";
50 | 		}else{
51 | 			my @temp = split /\t/,$_;
52 | 			$id = join("",$temp[1],":",($temp[2]-2));
53 | 			my $tri = $ref{$id};
54 | 			unless (defined $tri){
55 | 				# same output as before (empty context), but with an explicit message
56 | 				warn "WARNING: no reference sequence retrieved for $id\n";
57 | 				$tri = "";
58 | 			}
59 | 			my $allele = defined $temp[11] ? $temp[11] : "";
60 | 			print O $_,"\t",&convert(join("",$tri,$allele)),"\n";
61 | 		}
62 | 	}
63 | }
64 | close IN;
65 | close O or die "ERROR: failed to finish writing '$out': $!\n";
66 | 
67 | # Encode bases as digits: A=1, T=2, G=3, C=4; other characters are left as they are.
68 | sub convert{
69 | 	my ($input) = @_;
70 | 	$input =~ tr/ATGC/1234/;
71 | 	return ($input);
72 | }
48 | 


--------------------------------------------------------------------------------
/inst/stan/NB_bs.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the negative-binomial model for the per-sample scale bs
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=0> jj[N]; // gene index of observation n (indexes offset, geneDisp)
 6 |   int<lower=0> ss[N]; // sample index of observation n (indexes bs)
 7 |   int<lower=0> y[N];  // observed count of observation n
 8 |   real offset[J];     // per-gene factor multiplying bs in the mean
 9 |   real<lower=0> geneDisp[J];  // per-gene second argument of neg_binomial_2
10 | }
11 | parameters {
12 |   real<lower=0, upper= 100000> bs[S];  // per-sample scale, bounded to [0, 100000]
13 | }
14 | // Likelihood only: y[n] ~ neg_binomial_2(bs[ss[n]] * offset[jj[n]], geneDisp[jj[n]]).
15 | // No prior statement is given for bs; only its declared bounds apply.
16 | model {
17 |   for (n in 1:N) {
18 |     target +=  neg_binomial_2_lpmf(y[n] | bs[ss[n]] * offset[jj[n]], geneDisp[jj[n]]);
19 |   }
20 | }
21 | 


--------------------------------------------------------------------------------
/inst/stan/NB_bs2.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the negative-binomial model with a 2-component mixture term on log(bs)
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=0> jj[N]; // gene index of observation n (indexes offset, geneDisp)
 6 |   int<lower=0> ss[N]; // sample index of observation n (indexes bs)
 7 |   int<lower=0> y[N];  // observed count of observation n
 8 |   real offset[J];     // per-gene factor multiplying bs in the mean
 9 |   real<lower=0> geneDisp[J];  // per-gene second argument of neg_binomial_2
10 |   real<lower=0> mu1;     // mixture component means, on the log(bs) scale (declared non-negative)
11 |   real<lower=0> mu2;
12 |   real<lower=0> sigma1;  // mixture component standard deviations
13 |   real<lower=0> sigma2;
14 |   real<lower=0, upper=1> lambda1;  // mixture weights (nothing here forces them to sum to 1)
15 |   real<lower=0, upper=1> lambda2;
16 | }
17 | parameters {
18 |   real<lower=0, upper= 100000> bs[S];  // per-sample scale, bounded to [0, 100000]
19 | }
20 | // Per observation: NB2 log-pmf of y[n] plus the log of a 2-component normal mixture density of log(bs[ss[n]]).
21 | // NOTE(review): the mixture term is inside the loop over n, so it is added once per observation (not once per sample), and no Jacobian term for log(bs) is added - confirm intended.
22 | model {
23 |   for (n in 1:N) {
24 |     target +=  neg_binomial_2_lpmf(y[n] | bs[ss[n]] * offset[jj[n]], geneDisp[jj[n]])
25 |     + log_sum_exp(log(lambda1) + normal_lpdf(log(bs[ss[n]]) | mu1, sigma1),
26 |     log(lambda2) + normal_lpdf(log(bs[ss[n]]) | mu2, sigma2));
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/inst/stan/NB_bs3.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the negative-binomial model with a 3-component mixture term on log(bs)
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=0> jj[N]; // gene index of observation n (indexes offset, geneDisp)
 6 |   int<lower=0> ss[N]; // sample index of observation n (indexes bs)
 7 |   int<lower=0> y[N];  // observed count of observation n
 8 |   real offset[J];     // per-gene factor multiplying bs in the mean
 9 |   real<lower=0> geneDisp[J];  // per-gene second argument of neg_binomial_2
10 |   real<lower=0> mu1;     // mixture component means, on the log(bs) scale (declared non-negative)
11 |   real<lower=0> mu2;
12 |   real<lower=0> mu3;
13 |   real<lower=0> sigma1;  // mixture component standard deviations
14 |   real<lower=0> sigma2;
15 |   real<lower=0> sigma3;
16 |   real<lower=0, upper=1> lambda1;  // mixture weights (nothing here forces them to sum to 1)
17 |   real<lower=0, upper=1> lambda2;
18 |   real<lower=0, upper=1> lambda3;
19 | }
20 | parameters {
21 |   real<lower=0, upper= 100000> bs[S];  // per-sample scale, bounded to [0, 100000]
22 | }
23 | // Per observation: NB2 log-pmf of y[n] plus the log of a 3-component normal mixture density of log(bs[ss[n]]).
24 | // NOTE(review): the mixture term is inside the loop over n, so it is added once per observation (not once per sample), and no Jacobian term for log(bs) is added - confirm intended.
25 | model {
26 |   for (n in 1:N) {
27 |     target +=  neg_binomial_2_lpmf(y[n] | bs[ss[n]] * offset[jj[n]], geneDisp[jj[n]])
28 |     + log_sum_exp(log(lambda1) + normal_lpdf(log(bs[ss[n]]) | mu1, sigma1),
29 |                   log(lambda2) + normal_lpdf(log(bs[ss[n]]) | mu2, sigma2),
30 |                   log(lambda3) + normal_lpdf(log(bs[ss[n]]) | mu3, sigma3));
31 |   }
32 | }
33 | 


--------------------------------------------------------------------------------
/inst/stan/Poisson_bs.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the Poisson model for the per-sample scale bs
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=0> jj[N]; // gene index of observation n (indexes offset)
 6 |   int<lower=0> ss[N]; // sample index of observation n (indexes bs)
 7 |   int<lower=0> y[N];  // observed count of observation n
 8 |   real offset[J];     // per-gene factor multiplying bs in the Poisson rate
 9 | }
10 | parameters {
11 |   real<lower=0, upper= 100000> bs[S];  // per-sample scale, bounded to [0, 100000]
12 | }
13 | // Likelihood only: y[n] ~ poisson(bs[ss[n]] * offset[jj[n]]).
14 | // No prior statement is given for bs; only its declared bounds apply.
15 | model {
16 |   for (n in 1:N) {
17 |     target +=   poisson_lpmf(y[n] | bs[ss[n]] * offset[jj[n]]);
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/inst/stan/Poisson_bs2.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the Poisson model with a 2-component mixture term on log(bs)
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=0> jj[N]; // gene index of observation n (indexes offset)
 6 |   int<lower=0> ss[N]; // sample index of observation n (indexes bs)
 7 |   int<lower=0> y[N];  // observed count of observation n
 8 |   real offset[J];     // per-gene factor multiplying bs in the Poisson rate
 9 |   real<lower=0> mu1;     // mixture component means, on the log(bs) scale (declared non-negative)
10 |   real<lower=0> mu2;
11 |   real<lower=0> sigma1;  // mixture component standard deviations
12 |   real<lower=0> sigma2;
13 |   real<lower=0, upper=1> lambda1;  // mixture weights (nothing here forces them to sum to 1)
14 |   real<lower=0, upper=1> lambda2;
15 | }
16 | parameters {
17 |   real<lower=0, upper= 100000> bs[S];  // per-sample scale, bounded to [0, 100000]
18 | }
19 | // Per observation: Poisson log-pmf of y[n] plus the log of a 2-component normal mixture density of log(bs[ss[n]]).
20 | // NOTE(review): the mixture term is inside the loop over n, so it is added once per observation (not once per sample), and no Jacobian term for log(bs) is added - confirm intended.
21 | model {
22 |   for (n in 1:N) {
23 |     target +=   poisson_lpmf(y[n] | bs[ss[n]] * offset[jj[n]])
24 |     + log_sum_exp(log(lambda1) + normal_lpdf(log(bs[ss[n]]) | mu1, sigma1),
25 |                   log(lambda2) + normal_lpdf(log(bs[ss[n]]) | mu2, sigma2));
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/inst/stan/Poisson_bs3.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the Poisson model with a 3-component mixture term on log(bs)
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=0> jj[N]; // gene index of observation n (indexes offset)
 6 |   int<lower=0> ss[N]; // sample index of observation n (indexes bs)
 7 |   int<lower=0> y[N];  // observed count of observation n
 8 |   real offset[J];     // per-gene factor multiplying bs in the Poisson rate
 9 |   real<lower=0> mu1;     // mixture component means, on the log(bs) scale (declared non-negative)
10 |   real<lower=0> mu2;
11 |   real<lower=0> mu3;
12 |   real<lower=0> sigma1;  // mixture component standard deviations
13 |   real<lower=0> sigma2;
14 |   real<lower=0> sigma3;
15 |   real<lower=0, upper=1> lambda1;  // mixture weights (nothing here forces them to sum to 1)
16 |   real<lower=0, upper=1> lambda2;
17 |   real<lower=0, upper=1> lambda3;
18 | }
19 | parameters {
20 |   real<lower=0, upper= 100000> bs[S];  // per-sample scale, bounded to [0, 100000]
21 | }
22 | // Per observation: Poisson log-pmf of y[n] plus the log of a 3-component normal mixture density of log(bs[ss[n]]).
23 | // NOTE(review): the mixture term is inside the loop over n, so it is added once per observation (not once per sample), and no Jacobian term for log(bs) is added - confirm intended.
24 | model {
25 |   for (n in 1:N) {
26 |     target +=   poisson_lpmf(y[n] | bs[ss[n]] * offset[jj[n]])
27 |     + log_sum_exp(log(lambda1) + normal_lpdf(log(bs[ss[n]]) | mu1, sigma1),
28 |                   log(lambda2) + normal_lpdf(log(bs[ss[n]]) | mu2, sigma2),
29 |                   log(lambda3) + normal_lpdf(log(bs[ss[n]]) | mu3, sigma3));
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/inst/stan/ZIP.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the per-gene zero-inflated Poisson model
 2 |   int<lower=0> I;  // number of patients
 3 |   int<lower=0> J;  // number of genes
 4 |   int N;           // number of observations
 5 |   int<lower=1,upper=I> ii[N]; // patient for observation n (not used in the model block)
 6 |   int<lower=1,upper=J> jj[N]; // gene for observation n
 7 |   int<lower=0> y[N];          // observed count of observation n
 8 |   real offset[N];             // per-observation factor multiplying lambda in the Poisson rate
 9 | }
10 | parameters {
11 |   real<lower=0, upper=1> zero_p[J];  // per-gene probability of the extra-zero component
12 |   real<lower=0> lambda[J];           // per-gene rate, multiplied by offset[n]
13 | }
14 | // y[n] == 0 mixes the extra-zero component with the Poisson zero; y[n] > 0 can only come from the Poisson part.
15 | // transformed parameters {
16 | //   real mu = lambda[jj[N]] * offset;
17 | // }
18 | model {
19 |   for (n in 1:N) {
20 |     if (y[n] == 0)
21 |       target += log_sum_exp(bernoulli_lpmf(1 | zero_p[jj[n]]),
22 |                             bernoulli_lpmf(0 | zero_p[jj[n]])
23 |                               + poisson_lpmf(y[n] | lambda[jj[n]] * offset[n]));
24 |     else
25 |       target += bernoulli_lpmf(0 | zero_p[jj[n]])
26 |       + poisson_lpmf(y[n] | lambda[jj[n]] * offset[n]);
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/inst/stan/ZIPmix.stan:
--------------------------------------------------------------------------------
 1 | data {  // inputs of the zero-inflated Poisson model with one shared zero_p and rate a_opt
 2 |   int<lower=0> I;  // number of patients
 3 |   int<lower=0> J;  // number of genes
 4 |   int N;           // number of observations
 5 |   int<lower=1,upper=I> ii[N]; // patient for observation n (not used in the model block)
 6 |   int<lower=1,upper=J> jj[N]; // gene for observation n (not used in the model block)
 7 |   int<lower=0> y[N];          // observed count of observation n
 8 |   real<lower=0> a_zip;        // centre of the normal term on log(a_opt), given on the natural scale
 9 |   real offset[N];             // per-observation factor multiplying a_opt in the Poisson rate
10 |   real<lower=0> sigma;        // standard deviation of the normal term on log(a_opt)
11 | }
12 | parameters {
13 |   real<lower=0, upper=1> zero_p;  // probability of the extra-zero component
14 |   real<lower=0> a_opt;            // rate, multiplied by offset[n]
15 | }
16 | // Zero-inflated Poisson likelihood plus a normal log-density of log(a_opt) centred at log(a_zip).
17 | // NOTE(review): the normal term is inside the loop over n, so it is added once per observation, and no Jacobian term for log(a_opt) is added - confirm intended.
18 | model {
19 |   for (n in 1:N) {
20 |     if (y[n] == 0)
21 |       target += log_sum_exp(bernoulli_lpmf(1 | zero_p),
22 |                             bernoulli_lpmf(0 | zero_p)
23 |                               + poisson_lpmf(y[n] | a_opt * offset[n]))
24 |                               + normal_lpdf(log(a_opt) | log(a_zip), sigma);
25 |     else
26 |       target += bernoulli_lpmf(0 | zero_p)
27 |       + poisson_lpmf(y[n] |  a_opt* offset[n])
28 |       + normal_lpdf(log(a_opt) | log(a_zip), sigma);
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/inst/stan/ZIPmix_bs.stan:
--------------------------------------------------------------------------------
 1 | data {
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=1, upper=J> jj[N]; // gene index of observation n; bounded so a bad index fails at data load
 6 |   int<lower=1, upper=S> ss[N]; // sample index of observation n; bounded for the same reason
 7 |   int<lower=0> y[N];  // observed count for observation n
 8 |   real offset[J];  // per-gene multiplier of bs in the Poisson rate
 9 |   real<lower=0, upper=1> zero_p[J];  // per-gene structural-zero probability, supplied as data
10 | }
11 | parameters {
12 |   real<lower=0, upper= 100000> bs[S];  // per-sample rate; the only estimated quantity
13 | }
14 | 
15 | // Zero-inflated Poisson likelihood for bs with the zero-inflation weights held fixed.
16 | model {
17 |   for (n in 1:N) {
18 |     // Count from the Poisson component; a zero may instead be structural (zero_p).
19 |     real lp_count = bernoulli_lpmf(0 | zero_p[jj[n]])
20 |                     + poisson_lpmf(y[n] | bs[ss[n]] * offset[jj[n]]);
21 |     if (y[n] == 0)
22 |       target += log_sum_exp(bernoulli_lpmf(1 | zero_p[jj[n]]), lp_count);
23 |     else
24 |       target += lp_count;
25 |   }
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/inst/stan/ZIPmix_bs2.stan:
--------------------------------------------------------------------------------
 1 | data {
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=1, upper=J> jj[N]; // gene index of observation n; bounded so a bad index fails at data load
 6 |   int<lower=1, upper=S> ss[N]; // sample index of observation n; bounded for the same reason
 7 |   int<lower=0> y[N];  // observed count for observation n
 8 |   real offset[J];  // per-gene multiplier of bs in the Poisson rate
 9 |   real<lower=0, upper=1> zero_p[J];  // per-gene structural-zero probability, supplied as data
10 |   real<lower=0> mu1;  // mixture component means on the log(bs) scale
11 |   real<lower=0> mu2;  // NOTE(review): lower=0 rejects negative log-scale means -- confirm intended
12 |   real<lower=0> sigma1;  // mixture component standard deviations
13 |   real<lower=0> sigma2;
14 |   real<lower=0, upper=1> lambda1;  // mixture weights (not checked to sum to 1)
15 |   real<lower=0, upper=1> lambda2;
16 | }
17 | parameters {
18 |   real<lower=0, upper= 100000> bs[S];  // per-sample rate; the only estimated quantity
19 | }
20 | 
21 | // Zero-inflated Poisson likelihood for bs plus a two-component normal mixture on log(bs).
22 | model {
23 |   for (n in 1:N) {
24 |     // Two-component normal mixture on log(bs), added to target once per observation.
25 |     real lp_prior = log_sum_exp(
26 |       log(lambda1) + normal_lpdf(log(bs[ss[n]]) | mu1, sigma1),
27 |       log(lambda2) + normal_lpdf(log(bs[ss[n]]) | mu2, sigma2));
28 |     // Count from the Poisson component; a zero may instead be structural (zero_p).
29 |     real lp_count = bernoulli_lpmf(0 | zero_p[jj[n]])
30 |                     + poisson_lpmf(y[n] | bs[ss[n]] * offset[jj[n]]);
31 |     if (y[n] == 0)
32 |       target += log_sum_exp(bernoulli_lpmf(1 | zero_p[jj[n]]), lp_count) + lp_prior;
33 |     else
34 |       target += lp_count + lp_prior;
35 |   }
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/inst/stan/ZIPmix_bs3.stan:
--------------------------------------------------------------------------------
 1 | data {
 2 |   int<lower=0> J;  // number of genes
 3 |   int<lower=0> S;  // number of samples
 4 |   int<lower=0> N;  // number of observations (J * S)
 5 |   int<lower=1, upper=J> jj[N]; // gene index of observation n; bounded so a bad index fails at data load
 6 |   int<lower=1, upper=S> ss[N]; // sample index of observation n; bounded for the same reason
 7 |   int<lower=0> y[N];  // observed count for observation n
 8 |   real offset[J];  // per-gene multiplier of bs in the Poisson rate
 9 |   real<lower=0, upper=1> zero_p[J];  // per-gene structural-zero probability, supplied as data
10 |   real<lower=0> mu1;  // mixture component means on the log(bs) scale
11 |   real<lower=0> mu2;  // NOTE(review): lower=0 rejects negative log-scale means -- confirm intended
12 |   real<lower=0> mu3;
13 |   real<lower=0> sigma1;  // mixture component standard deviations
14 |   real<lower=0> sigma2;
15 |   real<lower=0> sigma3;
16 |   real<lower=0, upper=1> lambda1;  // mixture weights (not checked to sum to 1)
17 |   real<lower=0, upper=1> lambda2;
18 |   real<lower=0, upper=1> lambda3;
19 | }
20 | parameters {
21 |   real<lower=0, upper= 100000> bs[S];  // per-sample rate; the only estimated quantity
22 | }
23 | 
24 | // Zero-inflated Poisson likelihood for bs plus a three-component normal mixture on log(bs).
25 | model {
26 |   for (n in 1:N) {
27 |     // Three-component normal mixture on log(bs), added to target once per observation.
28 |     // log_sum_exp has no three-real-argument form, so the components are folded pairwise.
29 |     real lp_prior = log_sum_exp(
30 |       log(lambda1) + normal_lpdf(log(bs[ss[n]]) | mu1, sigma1),
31 |       log_sum_exp(log(lambda2) + normal_lpdf(log(bs[ss[n]]) | mu2, sigma2),
32 |                   log(lambda3) + normal_lpdf(log(bs[ss[n]]) | mu3, sigma3)));
33 |     // Count from the Poisson component; a zero may instead be structural (zero_p).
34 |     real lp_count = bernoulli_lpmf(0 | zero_p[jj[n]])
35 |                     + poisson_lpmf(y[n] | bs[ss[n]] * offset[jj[n]]);
36 |     if (y[n] == 0)
37 |       target += log_sum_exp(bernoulli_lpmf(1 | zero_p[jj[n]]), lp_count) + lp_prior;
38 |     else
39 |       target += lp_count + lp_prior;
40 |   }
41 | }
42 | 
43 | 


--------------------------------------------------------------------------------
/man/CM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TMBpred.R
 3 | \name{CM}
 4 | \alias{CM}
 5 | \title{CM}
 6 | \usage{
 7 | CM(Data, mu, o_scale, MRtriProb, gids, zero_p = 0)
 8 | }
 9 | \arguments{
10 | \item{Data}{mutSet}
11 | 
12 | \item{mu}{mu of the gene}
13 | 
14 | \item{o_scale}{o_scale}
15 | 
16 | \item{MRtriProb}{MRtriProb}
17 | 
18 | \item{gids}{gene IDs}
19 | 
20 | \item{zero_p}{zero_p}
21 | }
22 | \value{
23 | A list for all the inputs for Instan
24 | }
25 | \description{
26 | predict number of mutation
27 | }
28 | \examples{
29 | \dontrun{
30 | CM(Data)
31 | }
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/CalTMB.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{CalTMB}
 4 | \alias{CalTMB}
 5 | \title{CalTMB}
 6 | \usage{
 7 | CalTMB(x, type = "all", sampleN = NULL)
 8 | }
 9 | \arguments{
10 | \item{x}{mutSet object}
11 | 
12 | \item{type}{Type of mutation to be included. It can be 'all' 'nonsil' 'sil' 'indel' 'snp' 'frameshit'}
13 | 
14 | \item{sampleN}{Default is NULL. If it is specified, only subset of samples will be reported.}
15 | }
16 | \value{
17 | A data.frame contain the mutation count
18 | }
19 | \description{
20 | calculate the number of mutations per patient
21 | }
22 | \examples{
23 | \dontrun{
24 | CalTMB(Data)
25 | }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/ConvertExtraType.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/procMaf.R
 3 | \name{ConvertExtraType}
 4 | \alias{ConvertExtraType}
 5 | \title{ConvertExtraType}
 6 | \usage{
 7 | ConvertExtraType(maf)
 8 | }
 9 | \arguments{
10 | \item{maf}{maf matrix}
11 | }
12 | \value{
13 | maf file with converted extra type
14 | }
15 | \description{
16 | ConvertExtraType
17 | }
18 | \examples{
19 | \dontrun{
20 | ConvertExtraType(maf)
21 | }
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/man/ExGMM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TMBpred.R
 3 | \name{ExGMM}
 4 | \alias{ExGMM}
 5 | \title{ExGMM}
 6 | \usage{
 7 | ExGMM(x, msi = NULL, single = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{number of mutation}
11 | 
12 | \item{msi}{msi status}
13 | 
14 | \item{single}{A boolean. If TRUE, only one class will be identified.}
15 | }
16 | \value{
17 | A vector mutation count for 96 trinucleotide context
18 | }
19 | \description{
20 | fit the data distribution to a Gaussian mixture model
21 | }
22 | \examples{
23 | \dontrun{
24 | ExGMM(Data)
25 | }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/Get_incon_mut.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/procMaf.R
 3 | \name{Get_incon_mut}
 4 | \alias{Get_incon_mut}
 5 | \title{Get_incon_mut
 6 | Get variants with inconsistent gene annotation from
 7 | GSMuta reference data}
 8 | \usage{
 9 | Get_incon_mut(mutab, exome)
10 | }
11 | \arguments{
12 | \item{mutab}{maf matrix}
13 | 
14 | \item{exome}{exome file.}
15 | }
16 | \value{
17 | Get location of variants with inconsistent gene annotation from
18 | GSMuta reference data
19 | }
20 | \description{
21 | Get_incon_mut
22 | Get variants with inconsistent gene annotation from
23 | GSMuta reference data
24 | }
25 | \examples{
26 | \dontrun{
27 | Get_incon_mut(mutab,exome)
28 | }
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/MutSet.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ecTMB.R
 3 | \docType{class}
 4 | \name{MutSet}
 5 | \alias{MutSet}
 6 | \title{MutSet class}
 7 | \format{An \code{\link{R6Class}} object.}
 8 | \usage{
 9 | MutSet
10 | }
11 | \value{
12 | Object of class \code{MutSet}
13 | }
14 | \description{
15 | MutSet class
16 | }
17 | \section{Slots}{
18 | 
19 | \describe{
20 | \item{\code{mut}}{table from MAF}
21 | 
22 | \item{\code{gid}}{list of gene id for analysis}
23 | 
24 | \item{\code{exome}}{table contains all possible mutations}
25 | 
26 | \item{\code{exomeGene}}{table containing all possible mutations for each gene.}
27 | 
28 | \item{\code{covar}}{table contains covariates for each gene}
29 | 
30 | \item{\code{mutContext}}{The context of 96 mutations}
31 | 
32 | \item{\code{samples}}{data.frame storing sample info. If a BED column is specified,
33 | the background process will be restricted to the regions in the bed file.}
34 | 
35 | \item{\code{incosistantAnno}}{Variants with inconsistent gene annotation from
36 | GSMuta reference data.
37 | \describe{
38 |   \item{\code{new()}}{Create a MutSet }
39 |   \item{\code{nonsilent()}}{Get nonsilent mutations table}
40 |   \item{\code{silent()}}{Get silent mutations table}
41 |   \item{\code{nc()}}{Get noncoding mutations table}
42 | }}
43 | }}
44 | 
45 | \examples{
46 | \dontrun{
47 | MutSet$new()
48 | }
49 | }
50 | \keyword{datasets}
51 | 


--------------------------------------------------------------------------------
/man/OverLap.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{OverLap}
 4 | \alias{OverLap}
 5 | \title{OverLap}
 6 | \usage{
 7 | OverLap(Bed, regions, base0 = FALSE)
 8 | }
 9 | \arguments{
10 | \item{Bed}{targeted bed file. data.frame containing columns "Chromosome", "Start" and "End"}
11 | 
12 | \item{regions}{path to bed file or data.frame}
13 | 
14 | \item{base0}{A boolean; TRUE if it is 0-based.}
15 | }
16 | \value{
17 | a list of row number which targetBed overlap with regions.
18 | }
19 | \description{
20 | OverLap
21 | }
22 | \examples{
23 | \dontrun{
24 |   exomebed        = data.frame(Chromosome = exome[, "Chromosome"],
25 |   Start = exome[, "pos"],
26 |   End = exome[,"pos"])
27 |   OverLap(exomebed, regions)
28 | }
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/assignClass.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TMBpred.R
 3 | \name{assignClass}
 4 | \alias{assignClass}
 5 | \title{AssignClass}
 6 | \usage{
 7 | assignClass(x, prior, type = "low_high")
 8 | }
 9 | \arguments{
10 | \item{x}{predicted TMB}
11 | 
12 | \item{prior}{prior parameters.}
13 | 
14 | \item{type}{If 'low_high', only class 2 and 3 will be grouped to high.
15 | If 'exact', exact class will be reported}
16 | }
17 | \value{
18 | A vector of class
19 | }
20 | \description{
21 | predict TMB stat function
22 | }
23 | \examples{
24 | \dontrun{
25 | assignClass(Data)
26 | }
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/man/calMut.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ecTMB.R
 3 | \name{calMut}
 4 | \alias{calMut}
 5 | \title{calMut
 6 | provide observed and predicted mutation rate}
 7 | \usage{
 8 | calMut(Data, params, sampleN = NULL, gids = NULL, bed = NULL)
 9 | }
10 | \arguments{
11 | \item{Data}{MutSet class. Detail see \code{MutSet}.}
12 | 
13 | \item{params}{output from fit_model. Detail see \code{fit_model}}
14 | 
15 | \item{sampleN}{Sample names}
16 | 
17 | \item{gids}{gene IDs}
18 | 
19 | \item{bed}{path to bed file.}
20 | }
21 | \value{
22 | a summary of predicted and expected mutation rate.
23 | }
24 | \description{
25 | calMut
26 | provide observed and predicted mutation rate
27 | }
28 | 


--------------------------------------------------------------------------------
/man/checkMutC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{checkMutC}
 4 | \alias{checkMutC}
 5 | \title{check each sample have at least one mutation.}
 6 | \usage{
 7 | checkMutC(sample, mut)
 8 | }
 9 | \arguments{
10 | \item{sample}{data.frame of sample}
11 | 
12 | \item{mut}{Mutation table}
13 | }
14 | \value{
15 | restrict sample to only the ones that contain at least one mutation
16 | }
17 | \description{
18 | check each sample have at least one mutation.
19 | }
20 | 


--------------------------------------------------------------------------------
/man/count_Mut.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{count_Mut}
 4 | \alias{count_Mut}
 5 | \title{Count mutations per patient or per tri-nucleotide mutation context.}
 6 | \usage{
 7 | count_Mut(mut, gid = NULL, sampleN = NULL, byContext = FALSE)
 8 | }
 9 | \arguments{
10 | \item{mut}{data frame contain mutation info which must contain 'Start_Position' and 'Tumor_Sample_Barcode' or
11 | exomeGene - data.frame.}
12 | 
13 | \item{gid}{default is NULL. provide gid list}
14 | 
15 | \item{sampleN}{default is NULL. Provide list of sample name. This parameter is only useful when byContext is FALSE}
16 | 
17 | \item{byContext}{A boolean. If TRUE, the number of mutation per tri-nucleotide context will be reported.
18 | If FALSE, the number of mutation per patient will be reported. Default is FALSE.}
19 | }
20 | \value{
21 | mutCount data.frame contain number of mutation for each patient or
22 | a vector contains number of mutation per tri-nucleotide context
23 | }
24 | \description{
25 | Count mutations per patient or per tri-nucleotide mutation context.
26 | }
27 | \examples{
28 | \dontrun{
29 | count_Mut(mut)
30 | }
31 | }
32 | 


--------------------------------------------------------------------------------
/man/fit_model.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{fit_model}
 4 | \alias{fit_model}
 5 | \title{fit_model}
 6 | \usage{
 7 | fit_model(Data, MRtriProb, method = "NB", cores = 1, bs.type = "all",
 8 |   mut.nonsil = FALSE, nonsil.fraction = 0.6)
 9 | }
10 | \arguments{
11 | \item{Data}{mutSet object}
12 | 
13 | \item{MRtriProb}{output from getBgMRtri}
14 | 
15 | \item{method}{Either NB or Poisson}
16 | 
17 | \item{cores}{Number of parallel core to be used}
18 | 
19 | \item{bs.type}{Either 'all' or 'nonsil'. When 'all' is specified, all mutation per Mb
20 | will be used as sample specific background mutation (TMB). When nonsil is specified,
21 | non-synonymous mutation per Mb will be used as sample specific background mutation (TMB).}
22 | 
23 | \item{mut.nonsil}{A boolean. If TRUE, non-synonymous mutation will be used for background mutation
24 | modeling during training.}
25 | 
26 | \item{nonsil.fraction}{Default is 0.6. The percentage of genes whose non-synonymous mutation
27 | will be used.}
28 | }
29 | \value{
30 | A vector mutation count for 96 trinucleotide context
31 | }
32 | \description{
33 | fit background model with negative-binomial regression
34 | }
35 | \examples{
36 | \dontrun{
37 | fit_model(Data, MRtriProb)
38 | }
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/getBgMRtri.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{getBgMRtri}
 4 | \alias{getBgMRtri}
 5 | \title{Calculate the relative mutation frequency for each tri-nucleotide context #}
 6 | \usage{
 7 | getBgMRtri(Data, fraction = 0.6)
 8 | }
 9 | \arguments{
10 | \item{Data}{mutSet object}
11 | 
12 | \item{fraction}{fraction of gene used for nonsilent mutation.}
13 | }
14 | \value{
15 | A vector mutation count for 96 trinucleotide context
16 | }
17 | \description{
18 | Calculate the relative mutation frequency for each tri-nucleotide context #
19 | }
20 | \examples{
21 | \dontrun{
22 | getBgMRtri(Data)
23 | }
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/man/getEnsemblID.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{getEnsemblID}
 4 | \alias{getEnsemblID}
 5 | \title{getEnsemblID}
 6 | \usage{
 7 | getEnsemblID(geneID,
 8 |   geneInfof = "~/Data/GSMuta_data/GSMutaRData/extdata/ensembl_92_exon_pos.hg38.rda")
 9 | }
10 | \arguments{
11 | \item{geneID}{gene symbol}
12 | 
13 | \item{geneInfof}{store the info symbol and ensembl ID.}
14 | }
15 | \value{
16 | a vector of ensemble ID
17 | }
18 | \description{
19 | get ensembl ID from gene symbol
20 | }
21 | 


--------------------------------------------------------------------------------
/man/getGeneBgProb.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{getGeneBgProb}
 4 | \alias{getGeneBgProb}
 5 | \title{getGeneBgProb}
 6 | \usage{
 7 | getGeneBgProb(exomeGene, prob, consequences = c(1, 2, 3, 4, 5),
 8 |   gid = NULL)
 9 | }
10 | \arguments{
11 | \item{exomeGene}{data.frame}
12 | 
13 | \item{prob}{probability of each tri-nucleotides mutation context}
14 | 
15 | \item{consequences}{default is c(1,2,3,4,5) which are nonsil mutation.}
16 | 
17 | \item{gid}{Gene ID list.}
18 | }
19 | \value{
20 | data.frame of background gene mutation probability without regression
21 | }
22 | \description{
23 | get background gene mutation probability without regression
24 | }
25 | 


--------------------------------------------------------------------------------
/man/getGeneSymbol.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{getGeneSymbol}
 4 | \alias{getGeneSymbol}
 5 | \title{getGeneSymbol}
 6 | \usage{
 7 | getGeneSymbol(ensemblID,
 8 |   geneInfof = "~/Data/GSMuta_data/GSMutaRData/extdata/ensembl_92_exon_pos.hg38.rda")
 9 | }
10 | \arguments{
11 | \item{ensemblID}{ensemble ID}
12 | 
13 | \item{geneInfof}{store the info symbol and ensembl ID.}
14 | }
15 | \value{
16 | a vector of gene symbol
17 | }
18 | \description{
19 | get gene symbol from ensembl ID
20 | }
21 | 


--------------------------------------------------------------------------------
/man/getTMBs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TMBpred.R
 3 | \name{getTMBs}
 4 | \alias{getTMBs}
 5 | \title{getTMBs}
 6 | \usage{
 7 | getTMBs(y, offset, phi, cores, zero_p = 0, prior = NULL, span = 10,
 8 |   low = 10^(-4), high = 10^4)
 9 | }
10 | \arguments{
11 | \item{y}{A matrix for observed mutation count}
12 | 
13 | \item{offset}{offset}
14 | 
15 | \item{phi}{gene dispersion}
16 | 
17 | \item{cores}{A number which specifies the number of cores to use for parallel computing.}
18 | 
19 | \item{zero_p}{gene zero fraction}
20 | 
21 | \item{prior}{prior parameters. If it is NULL, no prior will be used.}
22 | 
23 | \item{span}{The extension of low and high limit. High = high * span and low = low/span}
24 | 
25 | \item{low}{The lower limit.}
26 | 
27 | \item{high}{The higher limit.}
28 | }
29 | \value{
30 | A data.frame contain the result.
31 | }
32 | \description{
33 | predict TMB stat function
34 | }
35 | \examples{
36 | \dontrun{
37 | getTMBs(Data)
38 | }
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/man/get_exomeGene.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_exomeGene}
 4 | \alias{get_exomeGene}
 5 | \title{Get exomeGene}
 6 | \usage{
 7 | get_exomeGene(exome, Bed = NULL, mutContext)
 8 | }
 9 | \arguments{
10 | \item{exome}{exome data}
11 | 
12 | \item{Bed}{path to bed file or data.frame}
13 | 
14 | \item{mutContext}{data.frame contain mutation context.}
15 | }
16 | \value{
17 | exomeGene file
18 | }
19 | \description{
20 | Get exomeGene
21 | }
22 | \examples{
23 | \dontrun{
24 | get_exomeGene(exome, Bed, mutContext)
25 | }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/get_genewise_dispersion_mle_mu.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{get_genewise_dispersion_mle_mu}
 4 | \alias{get_genewise_dispersion_mle_mu}
 5 | \title{MLE to estimate genewise dispersion ###}
 6 | \usage{
 7 | get_genewise_dispersion_mle_mu(y, mu, offset, cores, ZI = FALSE)
 8 | }
 9 | \arguments{
10 | \item{y}{input y}
11 | 
12 | \item{mu}{mu}
13 | 
14 | \item{offset}{offset}
15 | 
16 | \item{cores}{number of threads}
17 | 
18 | \item{ZI}{A boolean. Default is FALSE, in which case the negative binomial model will be used.
19 | If TRUE, zero-inflated model will be used.}
20 | }
21 | \value{
22 | mu
23 | }
24 | \description{
25 | MLE to estimate genewise dispersion ###
26 | }
27 | 


--------------------------------------------------------------------------------
/man/get_glen.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_glen}
 4 | \alias{get_glen}
 5 | \title{get_glen}
 6 | \usage{
 7 | get_glen(exomeGene, selGid = NULL, byGene = FALSE)
 8 | }
 9 | \arguments{
10 | \item{exomeGene}{data.frame}
11 | 
12 | \item{selGid}{a vector of selected genes.}
13 | 
14 | \item{byGene}{A boolean. If TRUE, a vector of length for each gene will be reported.
15 | If FALSE, a number of total length for all gene will be reported. Default is FALSE.}
16 | }
17 | \value{
18 | a number or a vector
19 | }
20 | \description{
21 | get gene length
22 | }
23 | 


--------------------------------------------------------------------------------
/man/get_mu_hat_mle.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{get_mu_hat_mle}
 4 | \alias{get_mu_hat_mle}
 5 | \title{get_mu_hat_mle}
 6 | \usage{
 7 | get_mu_hat_mle(y, disp, offset, cores, ZI = FALSE)
 8 | }
 9 | \arguments{
10 | \item{y}{input y}
11 | 
12 | \item{disp}{disp}
13 | 
14 | \item{offset}{offset}
15 | 
16 | \item{cores}{number of threads}
17 | 
18 | \item{ZI}{A boolean. Default is FALSE, in which case the negative binomial model will be used.
19 | If TRUE, zero-inflated model will be used.}
20 | }
21 | \value{
22 | mu
23 | }
24 | \description{
25 | from phi estimate mu
26 | }
27 | 


--------------------------------------------------------------------------------
/man/get_mu_phi.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{get_mu_phi}
 4 | \alias{get_mu_phi}
 5 | \title{get_mu_phi}
 6 | \usage{
 7 | get_mu_phi(y, offset)
 8 | }
 9 | \arguments{
10 | \item{y}{input y}
11 | 
12 | \item{offset}{offset}
13 | }
14 | \value{
15 | mu phi
16 | }
17 | \description{
18 | apply NB for mu & phi
19 | }
20 | 


--------------------------------------------------------------------------------
/man/get_muhat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{get_muhat}
 4 | \alias{get_muhat}
 5 | \title{get data mu for each gene #####}
 6 | \usage{
 7 | get_muhat(y, offset)
 8 | }
 9 | \arguments{
10 | \item{y}{input data}
11 | 
12 | \item{offset}{offset}
13 | }
14 | \value{
15 | muhat
16 | }
17 | \description{
18 | get data mu for each gene #####
19 | }
20 | 


--------------------------------------------------------------------------------
/man/get_zip_pairparam_mle.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{get_zip_pairparam_mle}
 4 | \alias{get_zip_pairparam_mle}
 5 | \title{Calculate mu and zero_p by MLE for ZIP model #}
 6 | \usage{
 7 | get_zip_pairparam_mle(y, offset, cores)
 8 | }
 9 | \arguments{
10 | \item{y}{observed mutation}
11 | 
12 | \item{offset}{offset.}
13 | 
14 | \item{cores}{number of cores}
15 | }
16 | \value{
17 | a List of parameters
18 | }
19 | \description{
20 | Calculate mu and zero_p by MLE for ZIP model #
21 | }
22 | \examples{
23 | \dontrun{
24 | get_zip_pairparam_mle(Data, offset)
25 | }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/getmu_post_optimize.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{getmu_post_optimize}
 4 | \alias{getmu_post_optimize}
 5 | \title{getmu_post_optimize}
 6 | \usage{
 7 | getmu_post_optimize(y, offset, mu1, mu2, phi, sigma, span, cores,
 8 |   zero_p = 0)
 9 | }
10 | \arguments{
11 | \item{y}{The input data}
12 | 
13 | \item{offset}{offset of input data}
14 | 
15 | \item{mu1}{mu_hat}
16 | 
17 | \item{mu2}{mu_hat_glm}
18 | 
19 | \item{phi}{dispersion parameter when negative binomial model is used. Otherwise,
20 | it will be 0.}
21 | 
22 | \item{sigma}{dispersion}
23 | 
24 | \item{span}{The extension}
25 | 
26 | \item{cores}{number of thread}
27 | 
28 | \item{zero_p}{probability of zero portion if zero-inflated model is used. Otherwise,
29 | it will be 0}
30 | }
31 | \value{
32 | A vector mutation count for 96 trinucleotide context
33 | }
34 | \description{
35 | get optimized mu ########
36 | }
37 | \examples{
38 | \dontrun{
39 | getmu_post_optimize(mut, gid)
40 | }
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/man/getsigma.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{getsigma}
 4 | \alias{getsigma}
 5 | \title{get sigma ###}
 6 | \usage{
 7 | getsigma(mu, mu_glm)
 8 | }
 9 | \arguments{
10 | \item{mu}{mu}
11 | 
12 | \item{mu_glm}{glm generated mu}
13 | }
14 | \value{
15 | sigma
16 | }
17 | \description{
18 | get sigma ###
19 | }
20 | 


--------------------------------------------------------------------------------
/man/getsubData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{getsubData}
 4 | \alias{getsubData}
 5 | \title{getsubData}
 6 | \usage{
 7 | getsubData(Data, sampleN = NULL, gid = NULL)
 8 | }
 9 | \arguments{
10 | \item{Data}{MutSet}
11 | 
12 | \item{sampleN}{A list of sample names.}
13 | 
14 | \item{gid}{A list of gene id.}
15 | }
16 | \value{
17 | MutSet, a subset of the original MutSet
18 | }
19 | \description{
20 | get a subset of the original MutSet
21 | }
22 | 


--------------------------------------------------------------------------------
/man/getzero_p.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{getzero_p}
 4 | \alias{getzero_p}
 5 | \title{get zero_p ###}
 6 | \usage{
 7 | getzero_p(zero_p_zip)
 8 | }
 9 | \arguments{
10 | \item{zero_p_zip}{zero_p_zip}
11 | }
12 | \value{
13 | zero_p
14 | }
15 | \description{
16 | get zero_p ###
17 | }
18 | 


--------------------------------------------------------------------------------
/man/loadfile.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{loadfile}
 4 | \alias{loadfile}
 5 | \title{loadfile}
 6 | \usage{
 7 | loadfile(x)
 8 | }
 9 | \arguments{
10 | \item{x}{file path}
11 | }
12 | \value{
13 | content of loaded file
14 | }
15 | \description{
16 | loadfile
17 | }
18 | \examples{
19 | \dontrun{
20 | loadfile(x)
21 | }
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/man/mafCleanup.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/procMaf.R
 3 | \name{mafCleanup}
 4 | \alias{mafCleanup}
 5 | \title{maf.cleanup}
 6 | \usage{
 7 | mafCleanup(maf, extraCols = NULL, keepNoncoding = FALSE,
 8 |   save = FALSE, fn = "./output.tsv", nonSilTypes = NULL,
 9 |   SilTypes = NULL, ncTypes = NULL, gid = NULL)
10 | }
11 | \arguments{
12 | \item{maf}{Input maf file. It can be a path to either rda or tsv file or data.frame}
13 | 
14 | \item{extraCols}{Default is NULL, in which case no extra column will be included. If "all", all columns
15 | will be output. If an array, only the specified columns will be reported.}
16 | 
17 | \item{keepNoncoding}{A boolean. If TRUE, the noncoding variants will be kept. Default is FALSE,
18 | in which case noncoding variants will be removed.}
19 | 
20 | \item{save}{A boolean. If TRUE, the file will be saved; if FALSE, the file will not be saved.}
21 | 
22 | \item{fn}{string, the output file name}
23 | 
24 | \item{nonSilTypes}{To specify what variant classification should be considered as nonsilent variants.
25 | Default is NULL which the default list of classification will be used.}
26 | 
27 | \item{SilTypes}{Specifies which variant classifications should be considered silent variants.
28 | Default is NULL, in which case the default list of classifications will be used.}
29 | 
30 | \item{ncTypes}{Specifies which variant classifications should be considered variants in noncoding regions.
31 | Default is NULL, in which case the default list of classifications will be used.}
32 | 
33 | \item{gid}{Gene ID list.}
34 | }
35 | \value{
36 | cleaned up maf file.
37 | }
38 | \description{
39 | maf.cleanup
40 | }
41 | \examples{
42 | \dontrun{
43 | mafCleanup(maf, save = TRUE, fn = "./output.tsv")
44 | }
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/man/maf_dnp_converter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/procMaf.R
 3 | \name{maf_dnp_converter}
 4 | \alias{maf_dnp_converter}
 5 | \title{maf_dnp_converter}
 6 | \usage{
 7 | maf_dnp_converter(mutab)
 8 | }
 9 | \arguments{
10 | \item{mutab}{maf matrix}
11 | }
12 | \value{
13 | maf file with converted SNP
14 | }
15 | \description{
16 | maf_dnp_converter
17 | }
18 | \examples{
19 | \dontrun{
20 | maf_dnp_converter(maf)
21 | }
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/man/pred_TMB.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TMBpred.R
 3 | \name{pred_TMB}
 4 | \alias{pred_TMB}
 5 | \title{pred_TMB}
 6 | \usage{
 7 | pred_TMB(x, params, WES = NULL, prior = list(mu = prior_bs$mu, sigma =
 8 |   prior_bs$sigma, lambda = prior_bs$lambda), mut.nonsil = FALSE,
 9 |   gid_nonsil_p = NULL, method = "MLE", cores = 10, span = 1,
10 |   low = 10^(-4), high = 10^4)
11 | }
12 | \arguments{
13 | \item{x}{test mutSet object}
14 | 
15 | \item{params}{A list containing the result from fit}
16 | 
17 | \item{WES}{Test mutSet object containing whole-exome data, which will be used to generate the truth.}
18 | 
19 | \item{prior}{A list containing prior parameters}
20 | 
21 | \item{mut.nonsil}{A boolean. If TRUE, the nonsilent mutations will be used for prediction}
22 | 
23 | \item{gid_nonsil_p}{A vector of gid for passenger genes}
24 | 
25 | \item{method}{A string specifying the method used to predict the TMB}
26 | 
27 | \item{cores}{A number specifying the number of cores to use for parallel computing.}
28 | 
29 | \item{span}{The extension of low and high limit. High = high * span and low = low/span}
30 | 
31 | \item{low}{The lower limit.}
32 | 
33 | \item{high}{The higher limit.}
34 | }
35 | \value{
36 | A data.frame containing the result.
37 | }
38 | \description{
39 | Predict the TMB of a test mutSet object using the parameters obtained from a model fit
40 | }
41 | \examples{
42 | \dontrun{
43 | pred_TMB(x, params)
44 | }
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/man/readData.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/ecTMB.R
 3 | \name{readData}
 4 | \alias{readData}
 5 | \title{read and load maf file and necessary reference file}
 6 | \usage{
 7 | readData(mutf, exomef, covarf, mutContextf, ref, samplef = NULL)
 8 | }
 9 | \arguments{
10 | \item{mutf}{Path to maf file. For details, see \code{mafCleanup}.}
11 | 
12 | \item{exomef}{Path to exome file}
13 | 
14 | \item{covarf}{Path to covariable file}
15 | 
16 | \item{mutContextf}{Path to mutContext file.}
17 | 
18 | \item{ref}{Path to reference genome}
19 | 
20 | \item{samplef}{Path to sample file or data.frame. The column name to specify bed file must be BED}
21 | }
22 | \value{
23 | a MutSet
24 | }
25 | \description{
26 | read and load maf file and necessary reference file
27 | }
28 | 


--------------------------------------------------------------------------------
/man/remove_outliers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/TMBpred.R
 3 | \name{remove_outliers}
 4 | \alias{remove_outliers}
 5 | \title{remove_outliers}
 6 | \usage{
 7 | remove_outliers(x, na.rm = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{number of mutation}
11 | 
12 | \item{na.rm}{how to deal with na}
13 | 
14 | \item{...}{The extra parameters for quantile.}
15 | }
16 | \value{
17 | outlier is labeled as NA
18 | }
19 | \description{
20 | remove outlier based on 1.5 quantile
21 | }
22 | \examples{
23 | \dontrun{
24 | remove_outliers(x)
25 | }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/man/retrieve_context.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/procMaf.R
 3 | \name{retrieve_context}
 4 | \alias{retrieve_context}
 5 | \title{retrieve_context}
 6 | \usage{
 7 | retrieve_context(mutab, ref)
 8 | }
 9 | \arguments{
10 | \item{mutab}{maf matrix}
11 | 
12 | \item{ref}{genome reference path.}
13 | }
14 | \value{
15 | Retrieve the mutation context code for each variant
16 | }
17 | \description{
18 | retrieve_context
19 | }
20 | \examples{
21 | \dontrun{
22 | retrieve_context(mutab, ref)
23 | }
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/man/zipMLE.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stats.R
 3 | \name{zipMLE}
 4 | \alias{zipMLE}
 5 | \title{Calculate mu and zero_p by MLE for ZIP model}
 6 | \usage{
 7 | zipMLE(x, offset = 1, tol = 1e-09)
 8 | }
 9 | \arguments{
10 | \item{x}{observed mutation}
11 | 
12 | \item{offset}{offset.}
13 | 
14 | \item{tol}{tolerance for convergence}
15 | }
16 | \value{
17 | a List of parameters
18 | }
19 | \description{
20 | Calculate mu and zero_p by MLE for ZIP model
21 | }
22 | \examples{
23 | \dontrun{
24 | zipMLE(Data, offset)
25 | }
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------