├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── README.md
├── data
    ├── qa_all.rda
    └── stats0.rda
├── inst
    ├── doc
    │   └── hg19_alias.tab
    ├── extdata
    │   ├── ALLphenoData.tsv
    │   ├── NCI60.Rda
    │   ├── nci.data
    │   └── nci.info
    └── script
    │   └── creatingNCI60.R
└── vignettes
    ├── A01.1_IntroductionToR.Rmd
    ├── A01.2_IntroductionToBioconductor.Rmd
    ├── A01.3_BioconductorForSequenceAnalysis.Rmd
    ├── A01_Introduction.Rmd
    ├── B02.1.1_RNASeqLab.Rmd
    ├── B02.1_RNASeq.Rmd
    ├── B02.2.3_CopyNumber.Rmd
    ├── B02.2_CommonWorkFlows.Rmd
    ├── B02.3_MachineLearning.Rmd
    ├── B02.4_Annotation.Rmd
    ├── B02.5_GeneSetEnrichment.Rmd
    ├── C03.1_LargeData.Rmd
    ├── C03.2_CodeToPackages.Rmd
    ├── C03.3_ReproducibleResearch.Rmd
    ├── C03.4_Visualization.Rmd
    ├── D04.1_InstallIGV.Rmd
    ├── hg19_alias.tab
    └── our_figures
        ├── GRanges.png
        ├── GRangesList.png
        ├── RangeOperations.png
        ├── SequencingEcosystem.png
        ├── SequencingEcosystem_no_bioc_pkgs.png
        ├── Solexa-bridge-pcr.jpg
        ├── SummarizedExperiment.png
        ├── copy_number_QC_2.png
        ├── cross_validation.png
        ├── knn.png
        └── nrg2825-f2.jpg


/.gitignore:
--------------------------------------------------------------------------------
1 | Agenda.docx
2 | bigdata
3 | vignettes/*html
4 | vignettes/*R
5 | vignettes/*.md
6 | vignettes/figure
7 | vignettes/cache
8 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: LearnBioconductor
 2 | Type: Package
 3 | Title: Learning R / Bioconductor for Sequence Analysis
 4 | Version: 0.1.6
 5 | Authors@R: c(person("Martin", "Morgan", role=c("aut", "cre"),
 6 |                     email="mtmorgan@fhcrc.org"),
 7 |              person("Sonali", "Arora", role="aut"))
 8 | License: Artistic-2.0
 9 | VignetteBuilder: knitr
10 | Description: This course is directed at beginning and intermediate
11 |     students who would like an introduction to the analysis and
12 |     comprehension of high-throughput sequence data using R and
13 |     Bioconductor. Day 1 focuses on learning essential background: an
14 |     introduction to the R programming language; central concepts for
15 |     effective use of Bioconductor software; and an overview of
16 |     high-throughput sequence analysis work flows. Day 2 emphasizes use
17 |     of Bioconductor for specific tasks: an RNA-seq differential
18 |     expression work flow; exploratory, machine learning, and other
19 |     statistical tasks; gene set enrichment; and annotation.  Day 3
20 |     transitions to understanding effective approaches for managing
21 |     larger challenges: strategies for working with large data, writing
22 |     re-usable functions, developing reproducible reports and work
23 |     flows, and visualizing results. The course combines lectures with
24 |     extensive hands-on practicals; students are required to bring a
25 |     laptop with wireless internet access and a modern version of the
26 |     Chrome or Safari web browser.
27 | Packaged: 2014-10-27 01:31:35 UTC; dante
28 | Author: Martin Morgan [aut, cre],
29 |   Sonali Arora [aut]
30 | Maintainer: Martin Morgan <mtmorgan@fhcrc.org>
31 | Suggests: knitr, BiocStyle, BiocInstaller, ALL,
32 |     BSgenome.Hsapiens.UCSC.hg19, BiocParallel, Biostrings,
33 |     GenomicAlignments, GenomicFeatures, Gviz, MLSeq, PoiClaClu,
34 |     RColorBrewer, RNAseqData.HNRNPC.bam.chr14, Rsamtools, ShortRead,
35 |     TxDb.Hsapiens.UCSC.hg19.knownGene, VariantAnnotation, airway,
36 |     class, cn.mops, dendextend, fission, genefilter, ggplot2, gplots,
37 |     org.Hs.eg.db, sva, xtable, kernlab,
38 |     e1071
39 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[^\\.]")
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Learning R / Bioconductor for Sequence Analysis
 2 | ===============================================
 3 | 
 4 | Fred Hutchinson Cancer Research Center, Seattle, WA<br />
 5 | October 27-29
 6 | 
 7 | Location: October 27 / 28: Arnold Building M1-A303; October 29:
 8 | Thomas D1-080
 9 | 
10 | Contact: Martin Morgan (content,
11 | [mtmorgan@fhcrc.org](mailto:mtmorgan@fhcrc.org)); Melissa Alvendia
12 | (administration, [malvendia@fhcrc.org](mailto:malvendia@fhcrc.org))
13 | 
14 | This course is directed at beginning and intermediate users who would
15 | like an introduction to the analysis and comprehension of
16 | high-throughput sequence data using _R_ and
17 | _[Bioconductor](http://bioconductor.org)_. Day 1 focuses on learning
18 | essential background: an introduction to the _R_ programming language;
19 | central concepts for effective use of _Bioconductor_ software; and an
20 | overview of high-throughput sequence analysis work flows. Day 2
21 | emphasizes use of _Bioconductor_ for specific tasks: an RNA-seq
22 | differential expression work flow; exploratory, machine learning, and
23 | other statistical tasks; gene set enrichment; and annotation.  Day 3
24 | transitions to understanding effective approaches for managing larger
25 | challenges: strategies for working with large data, writing re-usable
26 | functions, developing reproducible reports and work flows, and
27 | visualizing results.  The course combines lectures with extensive
28 | hands-on practicals; students are required to bring a laptop with
29 | wireless internet access and a modern version of the Chrome or Safari
30 | web browser.
31 | 
32 | Schedule (tentative)
33 | --------------------
34 | 
35 | Day 1: Learn _R_ / _Bioconductor_
36 | 
37 | - 9:00 - 10:30 Introduction to _R_: objects, functions, help!
38 | - 11:00 - 12:30 Introduction to _Bioconductor_: working with packages and classes
39 | - 1:30 - 5:00 (break: 3:00 - 3:30) Introduction to sequence analysis:
40 |   typical work flow; data types and quality assessment; essential
41 |   _Bioconductor_ packages
42 | 
43 | Day 2: Use _R_ / _Bioconductor_ 
44 | 
45 | - 9:00 - 12:30 (break: 10:30 - 11:00) An RNA-seq differential
46 |   expression work flow (detail)
47 | - 1:30 - 2:00 Other work flows (survey): ChIP-seq, variants, copy
48 |   number, epigenomics
49 | - 2:00 - 3:00 Machine learning; exploratory and other statistical
50 |   analysis
51 | - 3:30 - 4:00 Annotating genes, genomes, and variants
52 | - 4:00 - 5:00 Approaches to gene set enrichment
53 | 
54 | Day 3: Develop Skills and Best Practices
55 | 
56 | - 9:00 - 10:30 Working with large data
57 | - 11:00 - 12:30 Organizing code in functions, files, and packages
58 | - 1:30 - 3:00 Reproducible reports and work flows: markdown
59 | - 3:30 - 4:30 Visualization
60 | - 4:30 - 5:00 Summary
61 | 


--------------------------------------------------------------------------------
/data/qa_all.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/data/qa_all.rda


--------------------------------------------------------------------------------
/data/stats0.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/data/stats0.rda


--------------------------------------------------------------------------------
/inst/doc/hg19_alias.tab:
--------------------------------------------------------------------------------
 1 | gi|224384768|gb|CM000663.1| chr1
 2 | gi|224384767|gb|CM000664.1| chr2
 3 | gi|224384766|gb|CM000665.1| chr3
 4 | gi|224384765|gb|CM000666.1| chr4
 5 | gi|224384764|gb|CM000667.1| chr5
 6 | gi|224384763|gb|CM000668.1| chr6
 7 | gi|224384762|gb|CM000669.1| chr7
 8 | gi|224384761|gb|CM000670.1| chr8
 9 | gi|224384760|gb|CM000671.1| chr9
10 | gi|224384759|gb|CM000672.1| chr10
11 | gi|224384758|gb|CM000673.1| chr11
12 | gi|224384757|gb|CM000674.1| chr12
13 | gi|224384756|gb|CM000675.1| chr13
14 | gi|224384755|gb|CM000676.1| chr14
15 | gi|224384754|gb|CM000677.1| chr15
16 | gi|224384753|gb|CM000678.1| chr16
17 | gi|224384752|gb|CM000679.1| chr17
18 | gi|224384751|gb|CM000680.1| chr18
19 | gi|224384750|gb|CM000681.1| chr19
20 | gi|224384749|gb|CM000682.1| chr20
21 | gi|224384748|gb|CM000683.1| chr21
22 | gi|224384747|gb|CM000684.1| chr22
23 | 


--------------------------------------------------------------------------------
/inst/extdata/ALLphenoData.tsv:
--------------------------------------------------------------------------------
  1 | id	diagnosis	sex	age	BT	remission	CR	date.cr	t(4;11)	t(9;22)	cyto.normal	citog	mol.biol	fusion protein	mdr	kinet	ccr	relapse	transplant	f.u	date last seen
  2 | 1005	5/21/1997	M	53	B2	CR	CR	8/6/1997	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p210	NEG	dyploid	FALSE	FALSE	TRUE	BMT / DEATH IN CR	NA
  3 | 1010	3/29/2000	M	19	B2	CR	CR	6/27/2000	FALSE	FALSE	FALSE	simple alt.	NEG	NA	POS	dyploid	FALSE	TRUE	FALSE	REL	8/28/2000
  4 | 3002	6/24/1998	F	52	B4	CR	CR	8/17/1998	NA	NA	NA	NA	BCR/ABL	p190	NEG	dyploid	FALSE	TRUE	FALSE	REL	10/15/1999
  5 | 4006	7/17/1997	M	38	B1	CR	CR	9/8/1997	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/23/1998
  6 | 4007	7/22/1997	M	57	B2	CR	CR	9/17/1997	FALSE	FALSE	FALSE	del(6q)	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	11/4/1997
  7 | 4008	7/30/1997	M	17	B1	CR	CR	9/27/1997	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	hyperd.	FALSE	TRUE	FALSE	REL	12/15/1997
  8 | 4010	10/30/1997	F	18	B1	CR	CR	1/7/1998	FALSE	FALSE	FALSE	complex alt.	NEG	NA	POS	hyperd.	FALSE	TRUE	FALSE	REL	3/5/1998
  9 | 4016	2/10/2000	M	16	B1	CR	CR	4/17/2000	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	9/26/2000
 10 | 6002	3/19/1997	M	15	B2	CR	CR	6/9/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	3/18/1998
 11 | 8001	1/15/1997	M	40	B2	CR	CR	3/26/1997	FALSE	FALSE	FALSE	del(p15)	BCR/ABL	p190	NEG	NA	FALSE	TRUE	FALSE	REL	7/11/1997
 12 | 8011	8/21/1998	M	33	B3	CR	CR	10/8/1998	FALSE	FALSE	FALSE	del(p15/p16)	BCR/ABL	p190/p210	NEG	dyploid	FALSE	FALSE	TRUE	BMT / DEATH IN CR	NA
 13 | 8012	10/22/1998	M	55	B3	CR	CR	1/9/1999	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	4/9/1999
 14 | 8018	8/27/1999	M	5	B3	CR	CR	10/18/1999	NA	NA	NA	NA	E2A/PBX1	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	5/23/2000
 15 | 8024	7/20/2000	M	18	B2	CR	DEATH IN CR	NA	FALSE	FALSE	FALSE	simple alt.	NEG	NA	POS	dyploid	NA	NA	NA	NA	NA
 16 | 9008	12/17/1999	M	41	B3	CR	CR	2/15/2000	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190	NEG	hyperd.	TRUE	FALSE	TRUE	BMT / CCR	00/09/01
 17 | 9017	2/3/2000	F	27	B	CR	CR	3/23/2000	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	9/11/2001
 18 | 11005	6/1/1998	M	27	B2	CR	DEATH IN CR	8/3/1998	FALSE	FALSE	FALSE	del(7q) + altro	BCR/ABL	p190	NEG	dyploid	FALSE	FALSE	FALSE	DEATH IN CR	NA
 19 | 12006	2/20/1997	M	46	B3	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p210	NEG	dyploid	NA	NA	NA	NA	NA
 20 | 12007	3/11/1997	M	37	B2	CR	CR	5/7/1997	FALSE	TRUE	FALSE	t(9;22)+ del(9)	BCR/ABL	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	9/26/1997
 21 | 12012	5/21/1997	F	36	B3	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190	NEG	dyploid	NA	NA	NA	NA	NA
 22 | 12019	9/4/1997	M	53	B2	CR	CR	11/11/1997	FALSE	FALSE	TRUE	normal	NEG	NA	POS	dyploid	TRUE	FALSE	FALSE	CCR	6/6/2002
 23 | 12026	5/29/1998	M	39	B2	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190/p210	NA	dyploid	FALSE	FALSE	FALSE	DEATH IN CR	NA
 24 | 14016	5/27/1999	M	53	B2	NA	NA	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p210	NEG	NA	NA	NA	NA	NA	NA
 25 | 15001	9/3/1997	M	20	B1	CR	CR	11/11/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	6/21/2002
 26 | 15004	2/10/2000	M	44	B1	CR	CR	4/3/2000	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	12/19/2000
 27 | 15005	5/4/2000	M	28	B2	CR	CR	6/26/2000	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190	POS	hyperd.	FALSE	TRUE	FALSE	REL	11/15/2000
 28 | 16004	4/19/1997	F	58	B1	CR	CR	7/15/1997	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	12/9/1997
 29 | 16009	7/11/2000	F	43	B2	NA	NA	NA	NA	NA	NA	NA	NEG	NA	POS	dyploid	TRUE	FALSE	FALSE	CCR / OFF	5/23/2002
 30 | 19005	11/15/1997	F	48	B1	CR	CR	2/3/1998	FALSE	FALSE	TRUE	normal	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	2/4/1998
 31 | 20002	5/9/1997	F	58	B2	CR	CR	8/19/1997	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190	NEG	dyploid	FALSE	TRUE	FALSE	REL	12/15/1997
 32 | 22009	8/10/1999	F	19	B	NA	NA	NA	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	NA	NA	NA	NA	NA
 33 | 22010	12/31/1999	F	26	B	NA	NA	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190/p210	NEG	dyploid	NA	NA	NA	NA	NA
 34 | 22011	4/7/2000	M	19	B2	CR	CR	5/19/2000	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	7/31/2002
 35 | 22013	8/17/2000	M	32	B2	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190/p210	NEG	dyploid	NA	NA	NA	NA	NA
 36 | 24001	10/4/1996	F	17	B2	CR	CR	12/20/1996	NA	NA	NA	NA	BCR/ABL	p190	NEG	dyploid	FALSE	TRUE	FALSE	REL	2/10/1997
 37 | 24005	1/3/1997	F	45	B1	CR	CR	4/8/1997	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	8/28/1997
 38 | 24008	5/14/1997	F	20	B2	CR	CR	7/31/1997	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	FALSE	TRUE	BMT / CCR	00/09/20+T12501
 39 | 24010	6/3/1997	F	16	B2	CR	CR	8/11/1997	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190/p210	NEG	dyploid	FALSE	TRUE	TRUE	BMT / REL	8/24/1998
 40 | 24011	8/5/1997	F	51	B2	NA	DEATH IN INDUCTION	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p210	POS	dyploid	NA	NA	NA	NA	NA
 41 | 24017	9/15/1998	M	57	B2	CR	CR	12/7/1998	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190	NEG	hyperd.	FALSE	TRUE	FALSE	REL	2/22/2000
 42 | 24018	2/18/1999	F	29	B2	CR	CR	5/4/1999	NA	NA	NA	NA	NEG	NA	POS	dyploid	FALSE	TRUE	FALSE	REL	7/22/2000
 43 | 24019	5/4/1999	M	16	B4	CR	CR	7/28/1999	FALSE	FALSE	FALSE	t(1;19) + del(6q)	E2A/PBX1	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	5/25/2000
 44 | 24022	12/21/1999	F	32	B4	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190	POS	dyploid	NA	NA	NA	NA	NA
 45 | 25003	5/22/1998	M	15	B2	CR	CR	8/4/1998	FALSE	FALSE	FALSE	simple alt.	NEG	NA	POS	dyploid	TRUE	FALSE	FALSE	CCR	6/10/2002
 46 | 25006	3/18/2000	NA	NA	B2	CR	CR	5/8/2000	NA	NA	NA	NA	NEG	NA	NEG	NA	TRUE	FALSE	FALSE	CCR	3/3/2002
 47 | 26001	9/27/1997	M	21	B2	CR	CR	12/11/1997	NA	NA	NA	NA	NEG	NA	POS	dyploid	TRUE	FALSE	FALSE	CCR	7/31/2002
 48 | 26003	2/18/1998	F	49	B4	CR	CR	4/21/1998	FALSE	FALSE	FALSE	del(p15/p16)	BCR/ABL	p210	NEG	dyploid	FALSE	TRUE	FALSE	REL	7/1/1998
 49 | 26005	8/21/1998	M	38	B2	CR	CR	11/15/1998	NA	NA	NA	NA	p15/p16	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	3/4/2002
 50 | 26008	8/25/1999	F	17	B1	CR	CR	10/14/1999	FALSE	FALSE	TRUE	normal	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	6/26/2000
 51 | 27003	1/17/1998	F	26	B2	CR	CR	3/16/1998	NA	NA	NA	NA	BCR/ABL	p190/p210	POS	dyploid	FALSE	TRUE	FALSE	REL	5/6/1998
 52 | 27004	10/20/1998	F	48	B2	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)+del(p15)	BCR/ABL	p190	NEG	dyploid	NA	NA	NA	NA	NA
 53 | 28001	10/19/1996	M	16	B3	REF	REF	NA	FALSE	FALSE	TRUE	normal	NEG	NA	POS	dyploid	NA	NA	NA	NA	NA
 54 | 28003	11/28/1996	M	18	B4	CR	CR	1/17/1997	NA	NA	NA	NA	E2A/PBX1	NA	NEG	hyperd.	TRUE	FALSE	FALSE	CCR	12/31/2002
 55 | 28005	12/27/1996	M	17	B3	CR	CR	2/26/1997	FALSE	FALSE	FALSE	complex alt.	NEG	NA	POS	hyperd.	FALSE	TRUE	FALSE	REL	3/6/1998
 56 | 28006	1/13/1997	M	22	B3	CR	CR	3/14/1997	FALSE	FALSE	FALSE	del(6q)	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/16/1998
 57 | 28007	2/21/1997	F	47	B3	CR	CR	4/7/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	3/22/2002
 58 | 28019	2/10/1998	M	21	B4	CR	CR	4/2/1998	NA	NA	NA	NA	BCR/ABL	p190	NEG	hyperd.	TRUE	FALSE	TRUE	BMT / CCR	3/21/2001
 59 | 28021	3/18/1998	F	54	B3	CR	DEATH IN CR	5/22/1998	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190/p210	NEG	hyperd.	FALSE	FALSE	FALSE	DEATH IN CR (ICR)	NA
 60 | 28023	2/26/1998	M	26	B3	CR	CR	4/20/1998	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL / SNC	2/5/1999
 61 | 28024	4/19/1998	F	19	B1	CR	CR	6/17/1998	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	hyperd.	TRUE	FALSE	FALSE	CCR	12/31/2002
 62 | 28028	7/8/1998	M	47	B1	CR	CR	9/3/1998	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	10/20/1999
 63 | 28031	9/23/1998	M	18	B1	CR	CR	12/7/1998	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/17/1999
 64 | 28032	9/26/1998	F	52	B1	CR	CR	10/30/1998	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	5/16/2002
 65 | 28035	12/21/1998	M	27	B3	CR	CR	2/12/1999	NA	NA	NA	NA	NEG	NA	POS	hyperd.	TRUE	FALSE	FALSE	CCR	5/20/2002
 66 | 28036	12/23/1998	M	52	B3	CR	CR	3/8/1999	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190	NEG	dyploid	FALSE	TRUE	FALSE	REL	3/15/1999
 67 | 28037	12/30/1998	M	18	B3	CR	CR	3/4/1999	FALSE	FALSE	TRUE	normal	NEG	NA	POS	dyploid	FALSE	TRUE	FALSE	REL	4/23/2001
 68 | 28042	6/18/1999	M	18	B3	CR	CR	8/9/1999	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	11/30/2002
 69 | 28043	6/28/1999	M	23	B3	CR	CR	8/23/1999	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	3/29/2001
 70 | 28044	7/20/1999	M	16	B3	CR	CR	9/20/1999	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	12/23/1999
 71 | 28047	4/17/2000	M	NA	B3	CR	CR	NA	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	9/4/2000
 72 | 30001	1/16/1997	F	54	B3	NA	DEATH IN INDUCTION	NA	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190	NEG	hyperd.	NA	NA	NA	NA	NA
 73 | 31007	2/20/1997	M	25	B1	CR	CR	5/15/1997	TRUE	FALSE	FALSE	t(4;11)	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	12/3/1998
 74 | 31011	11/15/1997	M	31	B3	CR	CR	1/21/1998	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p210	POS	dyploid	FALSE	TRUE	TRUE	AUBMT / REL	6/1/1998
 75 | 33005	2/10/1998	F	19	B1	CR	CR	4/29/1998	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	6/28/2002
 76 | 36001	9/29/1997	F	24	B4	CR	CR	12/5/1997	FALSE	FALSE	TRUE	normal	E2A/PBX1	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/7/1998
 77 | 36002	4/1/1998	M	23	B2	CR	CR	6/9/1998	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	hyperd.	FALSE	TRUE	FALSE	REL	3/29/2001
 78 | 37013	5/28/1998	M	NA	B2	NA	NA	NA	FALSE	FALSE	FALSE	del(p15/p16)	BCR/ABL	NA	NEG	dyploid	NA	NA	NA	NA	NA
 79 | 43001	11/14/1996	M	41	B1	CR	CR	1/30/1997	FALSE	TRUE	FALSE	t(9;22)	BCR/ABL	p190/p210	POS	dyploid	FALSE	TRUE	FALSE	REL	6/28/1998
 80 | 43004	2/4/1997	F	37	B3	CR	CR	4/1/1997	NA	NA	NA	NA	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	3/20/2001
 81 | 43007	10/14/1997	M	54	B4	CR	CR	12/30/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	hyperd.	TRUE	FALSE	FALSE	CCR	5/29/2002
 82 | 43012	1/15/1999	M	18	B4	CR	CR	3/22/1999	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	4/20/2000
 83 | 48001	3/22/1997	M	19	B2	CR	CR	5/20/1997	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	hyperd.	FALSE	FALSE	FALSE	MUD / DEATH IN CR	12/18/1998
 84 | 49006	8/12/1998	F	43	B2	CR	CR	11/19/1998	NA	NA	NA	NA	BCR/ABL	p210	NEG	dyploid	FALSE	TRUE	FALSE	REL	4/26/1999
 85 | 57001	1/29/1997	F	53	B3	NA	DEATH IN INDUCTION	NA	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	hyperd.	NA	NA	NA	NA	NA
 86 | 62001	11/11/1997	F	50	B4	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	NA	NEG	hyperd.	NA	NA	NA	NA	NA
 87 | 62002	1/15/1998	M	54	B4	NA	DEATH IN INDUCTION	NA	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	NA	NEG	hyperd.	NA	NA	NA	NA	NA
 88 | 62003	12/4/1998	M	53	B4	CR	CR	1/28/1999	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p210	NEG	hyperd.	FALSE	TRUE	FALSE	REL	8/8/2000
 89 | 63001	7/8/1997	M	49	B1	CR	CR	9/2/1997	NA	NA	NA	NA	ALL1/AF4	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	6/10/1998
 90 | 64001	8/28/1997	M	20	B2	CR	CR	10/27/1997	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/21/1998
 91 | 64002	10/21/1997	F	26	B2	CR	CR	1/21/1998	NA	NA	NA	NA	NEG	NA	NEG	hyperd.	FALSE	FALSE	TRUE	BMT / DEATH IN CR	NA
 92 | 65005	7/20/1999	M	22	B2	REF	REF	NA	FALSE	TRUE	FALSE	t(9;22)+del(p15/p16)	BCR/ABL	p190	NEG	dyploid	NA	NA	NA	NA	NA
 93 | 68001	5/15/1997	M	36	B1	CR	CR	7/22/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	5/10/2002
 94 | 68003	4/11/2000	F	27	B2	NA	DEATH IN INDUCTION	NA	FALSE	TRUE	FALSE	t(9;22)+other	BCR/ABL	p190	NEG	NA	NA	NA	NA	NA	NA
 95 | 84004	9/25/1998	M	50	B	CR	CR	12/1/1998	NA	NA	NA	NA	BCR/ABL	p190	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/25/1999
 96 | LAL5	NA	NA	NA	B	NA	NA	NA	NA	NA	NA	NA	E2A/PBX1	NA	NA	NA	NA	NA	NA	NA	NA
 97 | 1003	2/18/1997	M	31	T	CR	CR	4/29/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	7/11/1997
 98 | 1007	9/30/1998	F	16	T3	CR	CR	11/30/1998	FALSE	FALSE	FALSE	simple alt.	NUP-98	NA	NEG	hyperd.	FALSE	FALSE	TRUE	BMT / DEATH IN CR	NA
 99 | 2020	3/23/2000	F	48	T2	NA	DEATH IN INDUCTION	NA	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	dyploid	NA	NA	NA	NA	NA
100 | 4018	3/24/2000	M	17	T2	CR	CR	5/23/2000	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	5/14/2001
101 | 9002	5/14/1998	F	40	T3	CR	CR	7/21/1998	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL / SNC	9/14/1999
102 | 10005	9/30/1997	M	22	T2	CR	CR	1/26/1998	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	10/10/1999
103 | 11002	12/23/1996	M	30	T	CR	CR	3/18/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	4/14/1998
104 | 12008	3/22/1997	M	18	T4	REF	REF	NA	FALSE	FALSE	FALSE	del(6q)	NEG	NA	NEG	dyploid	NA	NA	NA	NA	NA
105 | 15006	6/10/1998	M	22	T2	CR	CR	8/11/1998	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	hyperd.	FALSE	TRUE	FALSE	REL	9/15/2000
106 | 16002	4/10/1997	M	50	T3	CR	CR	6/10/1997	NA	NA	NA	NA	NEG	NA	NEG	hyperd.	FALSE	TRUE	FALSE	REL	12/7/1999
107 | 16007	11/1/1998	M	41	T3	CR	CR	11/5/1998	NA	NA	NA	NA	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	1/8/2002
108 | 17003	4/8/1997	F	40	T	REF	REF	NA	NA	NA	NA	NA	NEG	NA	POS	dyploid	NA	NA	NA	NA	NA
109 | 18001	4/23/1997	F	28	T2	REF	REF	NA	NA	NA	NA	NA	NEG	NA	NEG	hyperd.	NA	NA	NA	NA	NA
110 | 19002	1/29/1997	M	25	T3	CR	CR	4/10/1997	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	hyperd.	FALSE	TRUE	FALSE	REL	11/10/1997
111 | 19008	4/29/1998	F	16	T2	REF	REF	NA	FALSE	FALSE	TRUE	normal	NEG	NA	POS	dyploid	NA	NA	NA	NA	NA
112 | 19014	3/24/1999	M	31	T2	CR	CR	5/13/1999	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	4/1/2000
113 | 19017	3/17/2000	M	14	T2	CR	CR	6/15/2000	FALSE	FALSE	TRUE	normal	NEG	NA	POS	dyploid	FALSE	TRUE	FALSE	REL	7/5/2001
114 | 20005	3/15/2000	M	24	T1	CR	CR	5/5/2000	NA	NA	NA	NA	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	3/20/2002
115 | 24006	1/14/1997	F	19	T4	CR	CR	not known	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	6/5/2002
116 | 26009	8/26/1999	M	37	T	CR	CR	10/18/1999	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	6/12/2000
117 | 28008	3/27/1997	M	23	T2	CR	CR	5/27/1997	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	hyperd.	TRUE	FALSE	FALSE	CCR	4/9/2002
118 | 28009	4/19/1997	F	30	T3	CR	CR	6/13/1997	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	6/30/1998
119 | 31015	12/3/1998	M	48	T2	NA	DEATH IN INDUCTION	NA	NA	NA	NA	NA	NEG	NA	POS	dyploid	NA	NA	NA	NA	NA
120 | 37001	1/30/1997	M	22	T2	CR	CR	4/3/1997	NA	NA	NA	NA	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	1/7/1998
121 | 43006	6/17/1997	M	41	T2	REF	REF	NA	FALSE	FALSE	FALSE	simple alt.	NEG	NA	POS	dyploid	NA	NA	NA	NA	NA
122 | 43015	2/29/2000	M	52	T2	CR	CR	6/8/2000	FALSE	FALSE	TRUE	normal	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	3/15/2002
123 | 44001	1/15/1997	M	32	T3	CR	CR	3/7/1997	FALSE	FALSE	FALSE	del(6q)	NEG	NA	NEG	NA	FALSE	TRUE	FALSE	REL	11/25/1997
124 | 49004	9/18/1997	M	24	T3	CR	CR	11/11/1997	FALSE	FALSE	FALSE	del(7q)	NEG	NA	POS	dyploid	TRUE	FALSE	FALSE	CCR	6/14/2001
125 | 56007	8/6/1999	M	37	T3	CR	CR	9/24/1999	NA	NA	NA	NA	NEG	NA	NEG	dyploid	TRUE	FALSE	FALSE	CCR	1/26/2001
126 | 64005	10/1/1998	M	19	T2	REF	REF	NA	NA	NA	NA	NA	NEG	NA	NEG	dyploid	NA	NA	NA	NA	NA
127 | 65003	3/27/1998	M	30	T3	CR	CR	5/27/1998	FALSE	FALSE	FALSE	simple alt.	NEG	NA	NEG	dyploid	FALSE	TRUE	FALSE	REL	2/11/1999
128 | 83001	10/23/1998	M	29	T2	CR	CR	12/21/1998	FALSE	FALSE	FALSE	complex alt.	NEG	NA	NEG	hyperd.	TRUE	FALSE	FALSE	CCR	5/24/2002
129 | 


--------------------------------------------------------------------------------
/inst/extdata/NCI60.Rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/inst/extdata/NCI60.Rda


--------------------------------------------------------------------------------
/inst/extdata/nci.info:
--------------------------------------------------------------------------------
 1 | NCI microarray data (chap 14)
 2 | 
 3 | Source and reference:
 4 | 
 5 | http://genome-www.stanford.edu/nci60/
 6 | 
 7 | 
 8 | NCI microarray data
 9 |  
10 | 6830 genes
11 | missing values have been imputed via SVD
12 | 60 cell lines, labels are below
13 | 
14 | 
15 | CNS          
16 | CNS        
17 | CNS        
18 | RENAL      
19 | BREAST    
20 | CNS       
21 | CNS       
22 | BREAST      
23 | NSCLC       
24 | NSCLC    
25 | RENAL         
26 | RENAL     
27 | RENAL       
28 | RENAL       
29 | RENAL       
30 | RENAL         
31 | RENAL        
32 | BREAST 
33 | NSCLC      
34 | RENAL       
35 | UNKNOWN    
36 | OVARIAN    
37 | MELANOMA
38 | PROSTATE   
39 | OVARIAN    
40 | OVARIAN  
41 | OVARIAN     
42 | OVARIAN    
43 | OVARIAN    
44 | PROSTATE   
45 | NSCLC        
46 | NSCLC        
47 | NSCLC     
48 | LEUKEMIA
49 | K562B-repro               
50 | K562A-repro               
51 | LEUKEMIA   
52 | LEUKEMIA  
53 | LEUKEMIA    
54 | LEUKEMIA  
55 | LEUKEMIA       
56 | COLON      
57 | COLON       
58 | COLON      
59 | COLON        
60 | COLON     
61 | COLON      
62 | COLON     
63 | MCF7A-repro               
64 | BREAST       
65 | MCF7D-repro               
66 | BREAST     
67 | NSCLC     
68 | NSCLC     
69 | NSCLC    
70 | MELANOMA 
71 | BREAST 
72 | BREAST      
73 | MELANOMA     
74 | MELANOMA
75 | MELANOMA  
76 | MELANOMA 
77 | MELANOMA  
78 | MELANOMA 
79 | 


--------------------------------------------------------------------------------
/inst/script/creatingNCI60.R:
--------------------------------------------------------------------------------
 1 | ## Build the NCI60 object from the raw files under extdata/ of the installed
 2 | ## LearnBioconductor package and save it as NCI60.Rda in the current
 3 | ## working directory.
 4 | ## NOTE(review): no package is attached in this script, yet
 5 | ## SummarizedExperiment() and DataFrame() are called below -- presumably the
 6 | ## providing Bioconductor package must be loaded before sourcing; confirm.
 7 | rm(list=ls())    # removes all (non-hidden) objects from the current environment
 8 | 
 9 | ## expression values
10 | datafile <- system.file("extdata", "nci.data", package="LearnBioconductor")
11 | data <- read.table(datafile)
12 | data <- as.matrix(data)
13 | ## sample labels; the first 14 lines of nci.info are descriptive header text
14 | infofile <- system.file("extdata", "nci.info", package="LearnBioconductor")
15 | info <- read.table(infofile, skip=14)
16 | 
17 | # drop subtypes that contain only 1 sample
18 | ## A logical mask replaces the former negative indices: when no label is a
19 | ## singleton, x[-integer(0)] selects nothing and would empty both objects.
20 | ## Assumes row i of 'info' labels column i of 'data' (as the original did).
21 | rem <- which(table(info)==1)
22 | keep <- !(info[,1] %in% names(rem))
23 | info <- info[keep,]    # single-column data.frame: result drops to a vector, as before
24 | data <- data[, keep]
25 | 
26 | ## interactive look at the available constructor methods; not needed for the build
27 | showMethods("SummarizedExperiment")
28 | selectMethod(SummarizedExperiment, "matrix")
29 | 
30 | NCI60 <- SummarizedExperiment(assays=list(data=data), colData=DataFrame(info))
31 | save(NCI60, file="NCI60.Rda")
32 | 


--------------------------------------------------------------------------------
/vignettes/A01.1_IntroductionToR.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{01.1 Introduction to R}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)                          # attach the course package itself
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")    # stop the build unless Bioconductor version is 3.0
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()
 14 | knitr::opts_chunk$set(tidy=FALSE)
 15 | ```
 16 | 
 17 | # Introduction to R
 18 | 
 19 | Martin Morgan<br/>
 20 | October 27, 2014
 21 | 
 22 | ## R
 23 | 
 24 | Language and environment for statistical computing and graphics
 25 | 
 26 | - Full-featured programming language
 27 | - Interactive and *interpreted* -- convenient and forgiving
 28 | - Coherent, extensive documentation
 29 | - Statistical, e.g. `factor()`, `NA`
 30 | - Extensible -- CRAN, Bioconductor, github, ...
 31 | 
 32 | Vector, class, object
 33 | 
 34 | - Efficient _vectorized_ calculations on 'atomic' vectors `logical`,
 35 |   `integer`, `numeric`, `complex`, `character`, `raw`
 36 | - Atomic vectors are building blocks for more complicated _objects_
 37 |   - `matrix` -- atomic vector with 'dim' attribute
 38 |   - `data.frame` -- list of equal length atomic vectors
 39 | - Formal _classes_ represent complicated combinations of vectors,
 40 |   e.g., the return value of `lm()`, below
 41 | 
 42 | Function, generic, method
 43 | 
 44 | - Functions transform inputs to outputs, perhaps with side effects,
 45 |   e.g., `rnorm(1000)`
 46 |   - Argument matching first by name, then by position
 47 |   - Functions may define (some) arguments to have default values
 48 | - _Generic_ functions dispatch to specific _methods_ based on class of
 49 |   argument(s), e.g., `print()`. 
 50 | - Methods are functions that implement specific generics, e.g.,
 51 |   `print.factor`; methods are invoked _indirectly_, via the generic.
 52 | 
 53 | Introspection
 54 | 
 55 | - General properties, e.g., `class()`, `str()`
 56 | - Class-specific properties, e.g., `dim()`
 57 | 
 58 | Help
 59 | 
 60 | - `?print`: help on the generic print 
 61 | - `?print.data.frame`: help on print method for objects of class
 62 |     data.frame.
 63 | 
 64 | Example
 65 | 
 66 | ```{r}
 67 | x <- rnorm(1000)                   # atomic vectors
 68 | y <- x + rnorm(1000, sd=.5)
 69 | df <- data.frame(x=x, y=y)         # object of class 'data.frame'
 70 | plot(y ~ x, df)                    # generic plot, method plot.formula
 71 | fit <- lm(y ~x, df)                # object of class 'lm'
 72 | methods(class=class(fit))          # introspection
 73 | ```
 74 | 
 75 | ## Lab
 76 | 
 77 | ### 1. _R_ data manipulation
 78 | 
 79 | This exercise serves as a refresher / tutorial on basic input and
 80 | manipulation of data.
 81 | 
 82 | Input a file that contains ALL (acute lymphoblastic leukemia) patient
 83 | information
 84 | 
 85 | ```{r echo=TRUE, eval=FALSE}
 86 | fname <- file.choose()   ## "ALLphenoData.tsv"
 87 | stopifnot(file.exists(fname))
 88 | pdata <- read.delim(fname)
 89 | ```
 90 | ```{r echo=FALSE}
 91 | fname <- system.file("extdata", "ALLphenoData.tsv", package="LearnBioconductor")   # path to the copy installed with the package
 92 | stopifnot(file.exists(fname))   # system.file() returns "" when the file is not found
 93 | pdata <- read.delim(fname)      # same read step as the echoed chunk above
 94 | ```
 95 | 
 96 | Check out the help page `?read.delim` for input options, and explore
 97 | basic properties of the object you've created, for instance...
 98 | 
 99 | ```{r ALL-properties}
100 | class(pdata)
101 | colnames(pdata)
102 | dim(pdata)
103 | head(pdata)
104 | summary(pdata$sex)
105 | summary(pdata$cyto.normal)
106 | ```
107 | 
108 | Remind yourselves about various ways to subset and access columns of a
109 | data.frame
110 | 
111 | ```{r ALL-subset}
112 | pdata[1:5, 3:4]
113 | pdata[1:5, ]
114 | head(pdata[, 3:5])
115 | tail(pdata[, 3:5], 3)
116 | head(pdata$age)
117 | head(pdata$sex)
118 | head(pdata[pdata$age > 21,])
119 | ```
120 | 
121 | It seems from below that there are 17 females over 40 in the data set,
122 | but when sub-setting `pdata` to contain just those individuals 19 rows
123 | are selected. Why? What can we do to correct this?
124 | 
125 | ```{r ALL-subset-NA}
126 | idx <- pdata$sex == "F" & pdata$age > 40
127 | table(idx)
128 | dim(pdata[idx,])
129 | ```
130 | 
131 | Use the `mol.biol` column to subset the data to contain just
132 | individuals with 'BCR/ABL' or 'NEG', e.g.,
133 | 
134 | ```{r ALL-BCR/ABL-subset}
135 | bcrabl <- pdata[pdata$mol.biol %in% c("BCR/ABL", "NEG"),]
136 | ```
137 | 
138 | The `mol.biol` column is a factor, and retains all levels even after
139 | subsetting. How might you drop the unused factor levels?
140 | 
141 | ```{r ALL-BCR/ABL-drop-unused}
142 | bcrabl$mol.biol <- factor(bcrabl$mol.biol)
143 | ```
144 | 
145 | The `BT` column is a factor describing B- and T-cell subtypes
146 | 
147 | ```{r ALL-BT}
148 | levels(bcrabl$BT)
149 | ```
150 | 
151 | How might one collapse B1, B2, ... to a single type B, and likewise
152 | for T1, T2, ..., so there are only two subtypes, B and T
153 | 
154 | ```{r ALL-BT-recode}
155 | table(bcrabl$BT)
156 | levels(bcrabl$BT) <- substring(levels(bcrabl$BT), 1, 1)
157 | table(bcrabl$BT)
158 | ```
159 | 
160 | Use `xtabs()` (cross-tabulation) to count the number of samples with
161 | B- and T-cell types in each of the BCR/ABL and NEG groups
162 | 
163 | ```{r ALL-BCR/ABL-BT}
164 | xtabs(~ BT + mol.biol, bcrabl)
165 | ```
166 | 
167 | Use `aggregate()` to calculate the average age of males and females in
168 | the BCR/ABL and NEG treatment groups.
169 | 
170 | ```{r ALL-aggregate}
171 | aggregate(age ~ mol.biol + sex, bcrabl, mean)
172 | ```
173 | 
174 | Use `t.test()` to compare the age of individuals in the BCR/ABL versus
175 | NEG groups; visualize the results using `boxplot()`. In both cases,
176 | use the `formula` interface. Consult the help page `?t.test` and re-do
177 | the test assuming that variance of ages in the two groups is
178 | identical. What parts of the test output change?
179 | 
180 | ```{r ALL-age}
181 | t.test(age ~ mol.biol, bcrabl)
182 | boxplot(age ~ mol.biol, bcrabl)
183 | ```
184 | 
185 | ## Resources
186 | 
187 | - [StackOverflow](http://stackoverflow.com/questions/tagged/r) for _R_
188 |   programming questions; also [R-help](https://stat.ethz.ch/mailman/listinfo/r-help) mailing list.
189 | 
190 | Publications (General _R_)
191 | 
192 | <!-- Bibliography -->
193 | 
194 | [R]: http://r-project.org
195 | 


--------------------------------------------------------------------------------
/vignettes/A01.2_IntroductionToBioconductor.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{01.2 Introduction to Bioconductor}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()
 14 | knitr::opts_chunk$set(tidy=FALSE)
 15 | ```
 16 | 
 17 | # Introduction to Bioconductor
 18 | 
 19 | Martin Morgan<br/>
 20 | October 29, 2014
 21 | 
 22 | ## Bioconductor
 23 | 
 24 | Analysis and comprehension of high-throughput genomic data
 25 | 
 26 | - Statistical analysis: large data, technological artifacts, designed
 27 |   experiments; rigorous
 28 | - Comprehension: biological context, visualization, reproducibility
 29 | - High-throughput
 30 |   - Sequencing: RNASeq, ChIPSeq, variants, copy number, ...
 31 |   - Microarrays: expression, SNP, ...
 32 |   - Flow cytometry, proteomics, images, ...
 33 | 
 34 | Packages, vignettes, work flows
 35 | 
 36 | - 934 packages
 37 | - Discover and navigate via [biocViews][]
 38 | - Package 'landing page'
 39 |   - Title, author / maintainer, short description, citation,
 40 |     installation instructions, ..., download statistics
 41 | - All user-visible functions have help pages, most with runnable
 42 |   examples
 43 | - 'Vignettes' an important feature in Bioconductor -- narrative
 44 |   documents illustrating how to use the package, with integrated code
 45 | - 'Release' (every six months) and 'devel' branches
 46 | - [Support site](https://support.bioconductor.org);
 47 |   [videos](https://www.youtube.com/user/bioconductor), [recent
 48 |   courses](http://bioconductor.org/help/course-materials/)
 49 | 
 50 | Objects
 51 | 
 52 | - Represent complicated data types
 53 | - Foster interoperability
 54 | - S4 object system
 55 |   - Introspection: `getClass()`, `showMethods(..., where=search())`,
 56 |     `selectMethod()`
 57 |   - 'accessors' and other documented functions / methods for
 58 |     manipulation, rather than direct access to the object structure
 59 | - Interactive help
 60 |   - `method?"substr,<tab>"` to select help on methods, `class?D<tab>`
 61 |     for help on classes
 62 | 
 63 | Example
 64 | 
 65 | ```{r Biostrings, message=FALSE}
 66 | suppressPackageStartupMessages({
 67 |     library(Biostrings)
 68 | })
 69 | data(phiX174Phage)                       # sample data, see ?phiX174Phage
 70 | phiX174Phage
 71 | m <- consensusMatrix(phiX174Phage)[1:4,] # nucl. x position counts
 72 | polymorphic <- which(colSums(m != 0) > 1)
 73 | m[, polymorphic]
 74 | ```
 75 | ```{r showMethods, eval=FALSE}
 76 | showMethods(class=class(phiX174Phage), where=search())
 77 | ```
 78 | 
 79 | ## Core concepts
 80 | 
 81 | ### Genomic ranges
 82 | 
 83 | Genomic range
 84 | 
 85 | - chromosome (`seqnames`), start, end, and optionally strand
 86 | - Coordinates
 87 |     - 1-based
 88 |     - Closed -- start and end coordinates _included_ in range
 89 |     - Left-most -- start is always to the left of end, regardless of
 90 |       strand
 91 | 
 92 | Why genomic ranges?
 93 | 
 94 | - 'Annotation'
 95 |     - Many genome annotations are range-based
 96 |     - Simple ranges: exons, promoters, transcription factor binding
 97 |       sites, CpG islands, ...
 98 |     - Lists of ranges: gene models (exons-within-transcripts)
 99 | - 'Data'
100 |     - Reads themselves, or derived data
101 |     - Simple ranges: ChIP-seq peaks, SNPs, ungapped reads, ...
102 |     - List of ranges: gapped alignments, paired-end reads, ...
103 | 
104 | Data objects
105 | 
106 | - `r Biocpkg("GenomicRanges")`::_GRanges_
107 |     - `seqnames()`
108 |     - `start()`, `end()`, `width()`
109 |     - `strand()`
110 |     - `mcols()`: 'metadata' associated with each range, stored as a
111 |       `DataFrame`
112 |     - Many very useful operations defined on ranges (later)
113 | 
114 | - `r Biocpkg("GenomicRanges")`::_GRangesList_
115 |     - List-like (e.g., `length()`, `names()`, `[`, `[[`) <!-- ] -->
116 |     - Each list element a _GRanges_
117 |     - Metadata at list and element-list levels
118 |     - Very easy (fast) to `unlist()` and `relist()`.
119 | 
120 | - `r Biocpkg("GenomicAlignments")`::_GAlignments_, _GAlignmentsList_,
121 |   _GAlignmentPairs_; `r Biocpkg("VariantAnnotation")`::_VCF_, _VRanges_
122 |     - _GRanges_-like objects with more specialized roles
123 | 
124 | Example: _GRanges_
125 | 
126 | ```{r eg-GRanges}
127 | ## 'Annotation' package; more later...
128 | suppressPackageStartupMessages({
129 |     library(TxDb.Hsapiens.UCSC.hg19.knownGene)
130 | })
131 | promoters <- promoters(TxDb.Hsapiens.UCSC.hg19.knownGene)
132 | ## 'GRanges' with 2 metadata columns
133 | promoters
134 | head(table(seqnames(promoters)))
135 | table(strand(promoters))
136 | seqinfo(promoters)
137 | ## vector-like access
138 | promoters[ seqnames(promoters) %in% c("chr1", "chr2") ]
139 | ## metadata
140 | mcols(promoters)
141 | length(unique(promoters$tx_name))
142 | ```
143 | 
144 | ```{r eg-GRangesList}
145 | ## exons, grouped by transcript
146 | exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx", use.names=TRUE)
147 | ## list-like subsetting
148 | exByTx[1:10]              # also logical, character, ...
149 | exByTx[["uc001aaa.3"]]    # also numeric
150 | ## accessors return typed-List, e.g., IntegerList
151 | width(exByTx)
152 | log10(width(exByTx))
153 | ## 'easy' to ask basic questions, e.g., ...
154 | hist(unlist(log10(width(exByTx))))         # widths of exons
155 | exByTx[which.max(max(width(exByTx)))]      # transcript with largest exon
156 | exByTx[which.max(elementLengths(exByTx))]  # transcript with most exons
157 | ```
158 | 
159 | There are many neat range-based operations (more later)!
160 | 
161 | ![Range Operations](our_figures/RangeOperations.png)
162 | 
163 | Some detail
164 | 
165 | - _GRanges_ and friends use data structures defined in `r Biocpkg("S4Vectors")`, 
166 |   `r Biocpkg("IRanges")`
167 | - These data structures can handle relatively large data easily, e.g.,
168 |   1-10 million ranges
169 | - Basic concepts are built on _R_'s vector and list; _List_ instances
170 |   are implemented to be efficient when there are long lists of a few
171 |   elements each.
172 | - Takes a little getting used to, but very powerful
173 | 
174 | ### Integrated containers
175 | 
176 | What is an experiment?
177 | 
178 | - 'Assays' 
179 |     - Regions-of-interest x samples
180 |     - E.g., read counts, expression values
181 | - Regions-of-interest
182 |     - Microarrays: probeset or gene identifiers
183 |     - Sequencing: genomic ranges
184 | - Samples
185 |     - Experimental information, covariates
186 | - Overall experimental description    
187 | 
188 | Why integrate?
189 | 
190 | - Avoid errors when manipulating data
191 | - Case study: [reproducible research]()
192 | 
193 | Data objects
194 | 
195 | - `r Biocpkg("Biobase")`::_ExpressionSet_
196 |     - Assays (`exprs()`): matrix of expression values
197 |     - Regions-of-interest (`featureData(); fData()`): probeset or gene
198 |       identifiers
199 |     - Samples (`phenoData(); pData()`): `data.frame` of relevant
200 |       information
201 |     - Experiment data (`experimentData()`): Instance of class `MIAME`.
202 | - `r Biocpkg("GenomicRanges")`::_SummarizedExperiment_
203 |     - Assays (`assay(), assays()`): arbitrary matrix-like object
204 |     - Regions-of-interest (`rowData()`): `GRanges` or `GRangesList`;
205 |       use `GRangesList` with names and 0-length elements to represent
206 |       assays without ranges.
207 |     - Samples (`colData()`): `DataFrame` of relevant information.
208 |     - Experiment data (`exptData()`): `List` of arbitrary information.
209 | 
210 | ![SummarizedExperiment](our_figures/SummarizedExperiment.png)
211 | 
212 | Example: `ExpressionSet` (see vignettes in `r Biocpkg("Biobase")`).
213 | 
214 | ```{r eg-ExpressionSet}
215 | suppressPackageStartupMessages({
216 |     library(ALL)
217 | })
218 | data(ALL)
219 | ALL
220 | ## 'Phenotype' (sample) and 'feature' data
221 | head(pData(ALL))
222 | head(featureNames(ALL))
223 | ## access to pData columns; matrix-like subsetting; exprs()
224 | ALL[, ALL$sex %in% "M"]
225 | range(exprs(ALL))
226 | ## 30% 'most variable' features (c.f., genefilter::varFilter)
227 | iqr <- apply(exprs(ALL), 1, IQR)
228 | ALL[iqr > quantile(iqr, 0.7), ]
229 | ```
230 |     
231 | Example: `SummarizedExperiment` (see vignettes in `r Biocpkg("GenomicRanges")`).
232 | 
233 | ```{r eg-SummarizedExperiment}
234 | 
235 | suppressPackageStartupMessages({
236 |     library(airway)
237 | })
238 | data(airway)
239 | airway
240 | ## column and row data
241 | colData(airway)
242 | rowData(airway)
243 | ## access colData; matrix-like subsetting; assay() / assays()
244 | airway[, airway$dex %in% "trt"]
245 | head(assay(airway))
246 | assays(airway)
247 | ## library size
248 | colSums(assay(airway))
249 | hist(rowMeans(log10(assay(airway))))
250 | ```
251 | 
252 | ## Lab
253 | 
254 | ### GC content
255 |     
256 | 1.  Calculate the GC content of human chr1 in the hg19 build,
257 |     excluding regions where the sequence is "N". You will need to
258 | 
259 |     1. Load the `r Biocannopkg("BSgenome.Hsapiens.UCSC.hg19")` package.
260 |     2. Extract, using `[[`, chromosome 1 ("chr1"). <!-- ]] -->
261 |     3. Use `alphabetFrequency()` to calculate the count or frequency
262 |        of the nucleotides in chr1
263 |     4. Use standard _R_ functions to calculate the GC content.
264 | 
265 |     ```{r gc-reference}
266 |     library(BSgenome.Hsapiens.UCSC.hg19)
267 |     chr1seq <- BSgenome.Hsapiens.UCSC.hg19[["chr1"]]  # chromosome 1 sequence
268 |     chr1alf <- alphabetFrequency(chr1seq)             # count of each letter in chr1
269 |     chr1gc <- sum(chr1alf[c("G", "C")]) / sum(chr1alf[c("A", "C", "G", "T")])  # denominator uses only A, C, G, T, so "N" is excluded
270 |     ```
271 |     
272 | 2.  Calculate the GC content of 'exome' (approximately, all genic
273 |     regions) on chr1. You will need to
274 | 
275 |     1. Load the `r Biocannopkg("TxDb.Hsapiens.UCSC.hg19.knownGene")`
276 |        package.
277 |     2. Use `genes()` to extract genic regions of all genes, then
278 |        subsetting operations to restrict to chromosome 1.
279 |     3. Use `getSeq,BSgenome-method` to extract sequences from
280 |        chromosome 1 of the BSgenome object.
281 |     4. Use `alphabetFrequency()` (with the argument `collapse=TRUE` --
282 |        why?) and standard _R_ operations to extract the gc content of
283 |        the genes.
284 |     
285 |     ```{r gc-exons-1}
286 |     library(TxDb.Hsapiens.UCSC.hg19.knownGene)
287 |     genes <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)    # genic regions of all genes
288 |     genes1 <- genes[seqnames(genes) %in% "chr1"]         # restrict to chromosome 1
289 |     seq1 <- getSeq(BSgenome.Hsapiens.UCSC.hg19, genes1)  # sequences of the chr1 genic regions
290 |     alf1 <- alphabetFrequency(seq1, collapse=TRUE)       # letter counts pooled across all sequences
291 |     gc1 <- sum(alf1[c("G", "C")]) / sum(alf1[c("A", "C", "G", "T")])  # pooled GC content; denominator uses only A, C, G, T
292 |     ```
293 |     
294 |     How does the GC content just calculated compare to the average of
295 |     the GC content of each gene? Answer this using
296 |     `alphabetFrequency()` but with `collapse=FALSE`, and adjust the
297 |     calculation of GC content to act on a matrix, rather than
298 |     vector. Why are these numbers different?
299 |     
300 |     ```{r gc-exons-2}
301 |     alf2 <- alphabetFrequency(seq1, collapse=FALSE)  # matrix of letter counts, one row per sequence
302 |     gc2 <- rowSums(alf2[, c("G", "C")]) / rowSums(alf2[,c("A", "C", "G", "T")])  # GC content computed separately for each row
303 |     ```
304 |     
305 | 3.  Plot a histogram of per-gene GC content, annotating with
306 |     information about chromosome and exome GC content. Use base
307 |     graphics `hist()`, `abline()`, `plot(density(...))`,
308 |     `plot(ecdf(...))`, etc. (one example is below). If this is too
309 |     easy, prepare a short presentation for the class illustrating how
310 |     to visualize this type of information using another _R_ graphics
311 |     package, e.g., `r CRANpkg("ggplot2")`, `r CRANpkg("ggvis")`, or
312 |     `r CRANpkg("lattice")`.
313 |     
314 |     ```{r gc-denisty}
315 |     plot(density(gc2))
316 |     abline(v=c(chr1gc, gc1), col=c("red", "blue"), lwd=2)
317 |     ```
318 | 
319 | ### Integrated containers
320 | 
321 | This exercise illustrates how integrated containers can be used to
322 | effectively manage data; it does _NOT_ represent a suitable way to
323 | analyze RNASeq differential expression data.
324 | 
325 | 1. Load the `r Biocpkg("airway")` package and `airway` data
326 |    set. Explore it a little, e.g., determining its dimensions (number
327 |    of regions of interest and samples), the information describing
328 |    samples, and the range of values in the `count` assay. The data are
329 |    from an RNA-seq experiment. The `colData()` describe treatment
330 |    groups and other information. The `assay()` is the (raw) number of
331 |    short reads overlapping each region of interest, in each
332 |    sample. The solution to this exercise is summarized above.
333 | 
334 | 2.  Create a subset of the data set that contains only the 30% most
335 |     variable (using IQR as a metric) observations. Plot the
336 |     distribution of asinh-transformed (a log-like transformation,
337 |     except near 0) row mean counts
338 | 
339 |     ```{r airway-plot}
340 |     iqr <- apply(assay(airway), 1, IQR)
341 |     airway1 <- airway[iqr > quantile(iqr, 0.7),]
342 |     plot(density(rowMeans(asinh(assay(airway1)))))
343 |     ```
344 | 
345 | 3.  Use the `r Biocpkg("genefilter")` package `rowttests` function
346 |     (consult its help page!) to compare asinh-transformed read counts
347 |     between the two `dex` treatment groups for each row. Explore the
348 |     result in various ways, e.g., finding the 'most' differentially
349 |     expressed genes, the genes with largest (absolute) difference
350 |     between treatment groups, adding adjusted _P_ values (via
351 |     `p.adjust()`, in the _stats_ package), etc. Can you obtain the
352 |     read counts for each treatment group, for the most differentially
353 |     expressed gene?
354 | 
355 |     ```{r airway-rowttest}
356 |     suppressPackageStartupMessages({
357 |         library(genefilter)
358 |     })
359 |     ttest <- rowttests(asinh(assay(airway1)), airway1$dex)       # row-wise t-tests on asinh counts, grouped by dex
360 |     ttest$p.adj <- p.adjust(ttest$p.value, method="BH")          # add Benjamini-Hochberg adjusted P values
361 |     ttest[head(order(ttest$p.adj)),]                             # rows with the smallest adjusted P values
362 |     split(assay(airway1)[order(ttest$p.adj)[1], ], airway1$dex)  # raw counts of the top-ranked row, split by dex group
363 |     ```
364 |     
365 | 4.  Add the statistics of differential expression to the `airway1`
366 |     _SummarizedExperiment_. Confirm that the statistics have been
367 |     added.
368 |     
369 |     ```{r airway-merge}
370 |     mcols(rowData(airway1)) <- ttest
371 |     head(mcols(airway1))
372 |     ```
373 | 
374 | # Resources
375 | 
376 | - [Web site][Bioconductor] -- install, learn, use, develop _R_ /
377 |   _Bioconductor_ packages
378 | - [Support](http://support.bioconductor.org) -- seek help and
379 |   guidance
380 | - [biocViews](http://bioconductor.org/packages/release/BiocViews.html)
381 |   -- discover packages
382 | - Package landing pages, e.g.,
383 |   [GenomicRanges](http://bioconductor.org/packages/release/bioc/html/GenomicRanges.html),
384 |   including title, description, authors, installation instructions,
385 |   vignettes (e.g., GenomicRanges '[How
386 |   To](http://bioconductor.org/packages/release/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesHOWTOs.pdf)'),
387 |   etc.
388 | - [Course](http://bioconductor.org/help/course-materials/) and other
389 |   [help](http://bioconductor.org/help/) material (e.g., videos, EdX
390 |   course, community blogs, ...)
391 | 
392 | Publications (General _Bioconductor_)
393 | 
394 | - Lawrence M, Huber W, Pagès H, Aboyoun P, Carlson M, et al. (2013)
395 |   Software for Computing and Annotating Genomic Ranges. PLoS Comput
396 |   Biol 9(8): e1003118. doi:
397 |   [10.1371/journal.pcbi.1003118][GRanges.bib]
398 | 
399 | Other
400 | 
401 | - Lawrence, M. 2014. Software for Enabling Genomic Data
402 |   Analysis. Bioc2014 conference [slides][Lawrence.bioc2014.bib].
403 | 
404 | [R]: http://r-project.org
405 | [Bioconductor]: http://bioconductor.org
406 | [GRanges.bib]: https://doi.org/10.1371/journal.pcbi.1003118
407 | [Scalable.bib]: http://arxiv.org/abs/1409.2864
408 | [Lawrence.bioc2014.bib]:
409 |     http://bioconductor.org/help/course-materials/2014/BioC2014/Lawrence_Talk.pdf
410 | 
411 | [AnnotationData]: http://bioconductor.org/packages/release/BiocViews.html#___AnnotationData
412 | [AnnotationDbi]: http://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html
413 | [AnnotationHub]: http://bioconductor.org/packages/release/bioc/html/AnnotationHub.html
414 | [BSgenome.Hsapiens.UCSC.hg19]: http://bioconductor.org/packages/release/data/annotation/html/BSgenome.Hsapiens.UCSC.hg19.html
415 | [BSgenome]: http://bioconductor.org/packages/release/bioc/html/BSgenome.html
416 | [BiocParallel]: http://bioconductor.org/packages/release/bioc/html/BiocParallel.html
417 | [Biostrings]: http://bioconductor.org/packages/release/bioc/html/Biostrings.html
418 | [Bsgenome.Hsapiens.UCSC.hg19]: http://bioconductor.org/packages/release/data/annotation/html/BSgenome.Hsapiens.UCSC.hg19.html
419 | [CNTools]: http://bioconductor.org/packages/release/bioc/html/CNTools.html
420 | [ChIPQC]: http://bioconductor.org/packages/release/bioc/html/ChIPQC.html
421 | [ChIPpeakAnno]: http://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html
422 | [DESeq2]: http://bioconductor.org/packages/release/bioc/html/DESeq2.html
423 | [DiffBind]: http://bioconductor.org/packages/release/bioc/html/DiffBind.html
424 | [GenomicAlignments]: http://bioconductor.org/packages/release/bioc/html/GenomicAlignments.html
425 | [GenomicFiles]: http://bioconductor.org/packages/release/bioc/html/GenomicFiles.html
426 | [GenomicRanges]: http://bioconductor.org/packages/release/bioc/html/GenomicRanges.html
427 | [Homo.sapiens]: http://bioconductor.org/packages/release/data/annotation/html/Homo.sapiens.html
428 | [IRanges]: http://bioconductor.org/packages/release/bioc/html/IRanges.html
429 | [KEGGREST]: http://bioconductor.org/packages/release/bioc/html/KEGGREST.html
430 | [PSICQUIC]: http://bioconductor.org/packages/release/bioc/html/PSICQUIC.html
431 | [Rsamtools]: http://bioconductor.org/packages/release/bioc/html/Rsamtools.html
432 | [Rsubread]: http://bioconductor.org/packages/release/bioc/html/Rsubread.html
433 | [ShortRead]: http://bioconductor.org/packages/release/bioc/html/ShortRead.html
434 | [SomaticSignatures]: http://bioconductor.org/packages/release/bioc/html/SomaticSignatures.html
435 | [TxDb.Hsapiens.UCSC.hg19.knownGene]: http://bioconductor.org/packages/release/data/annotation/html/TxDb.Hsapiens.UCSC.hg19.knownGene.html
436 | [VariantAnnotation]: http://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html
437 | [VariantFiltering]: http://bioconductor.org/packages/release/bioc/html/VariantFiltering.html
438 | [VariantTools]: http://bioconductor.org/packages/release/bioc/html/VariantTools.html
439 | [biocViews]: http://bioconductor.org/packages/release/BiocViews.html#___Software
440 | [biomaRt]: http://bioconductor.org/packages/release/bioc/html/biomaRt.html
441 | [cn.mops]: http://bioconductor.org/packages/release/bioc/html/cn.mops.html
442 | [edgeR]: http://bioconductor.org/packages/release/bioc/html/edgeR.html
443 | [ensemblVEP]: http://bioconductor.org/packages/release/bioc/html/ensemblVEP.html 
444 | [h5vc]: http://bioconductor.org/packages/release/bioc/html/h5vc.html
445 | [limma]: http://bioconductor.org/packages/release/bioc/html/limma.html
446 | [metagenomeSeq]: http://bioconductor.org/packages/release/bioc/html/metagenomeSeq.html
447 | [org.Hs.eg.db]: http://bioconductor.org/packages/release/data/annotation/html/org.Hs.eg.db.html
448 | [org.Sc.sgd.db]: http://bioconductor.org/packages/release/data/annotation/html/org.Sc.sgd.db.html
449 | [phyloseq]: http://bioconductor.org/packages/release/bioc/html/phyloseq.html
450 | [rtracklayer]: http://bioconductor.org/packages/release/bioc/html/rtracklayer.html
451 | [snpStats]: http://bioconductor.org/packages/release/bioc/html/snpStats.html
452 | [Gviz]: http://bioconductor.org/packages/release/bioc/html/Gviz.html
453 | [epivizr]: http://bioconductor.org/packages/release/bioc/html/epivizr.html
454 | [ggbio]: http://bioconductor.org/packages/release/bioc/html/ggbio.html
455 | [OmicCircos]: http://bioconductor.org/packages/release/bioc/html/OmicCircos.html
456 | 
457 | 
458 | 


--------------------------------------------------------------------------------
/vignettes/A01.3_BioconductorForSequenceAnalysis.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{01.3 Bioconductor for Sequence Analysis}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()
 14 | knitr::opts_chunk$set(tidy=FALSE)
 15 | ```
 16 | 
 17 | # Bioconductor for Sequence Analysis
 18 | 
 19 | Martin Morgan<br/>
 20 | October 29, 2014
 21 | 
 22 | ## Sequence analysis work flows
 23 | 
 24 | 1. Experimental design
 25 |    - Keep it simple, e.g., 'control' and 'treatment' groups
 26 |    - Replicate within treatments!
 27 | 2. Wet-lab sequence preparation
 28 |    - Record covariates, including processing day -- likely 'batch effects'
 29 | 3. (Illumina) Sequencing (Bentley et al., 2008,
 30 |    [doi:10.1038/nature07517](https://doi.org/10.1038/nature07517))
 31 |     <img src="our_figures/Solexa-bridge-pcr.jpg" height="400px" width="300px" />
 32 |    - Primary output: FASTQ files of short reads and their [quality
 33 |      scores](http://en.wikipedia.org/wiki/FASTQ_format#Encoding)
 34 | 4. Alignment
 35 |    - Choose to match task, e.g., [Rsubread][], Bowtie2 good for ChIPseq,
 36 |      some forms of RNAseq; BWA, GMAP better for variant calling
 37 |    - Primary output: BAM files of aligned reads
 38 | 5. Reduction
 39 |    - e.g., RNASeq 'count table' (simple spreadsheets), DNASeq called
 40 |      variants (VCF files), ChIPSeq peaks (BED, WIG files)
 41 | 6. Analysis
 42 |    - Differential expression, peak identification, ...
 43 | 7. Comprehension
 44 |    - Biological context
 45 | 
 46 | Data movement
 47 | 
 48 | ![Alt Sequencing Ecosystem](our_figures/SequencingEcosystem_no_bioc_pkgs.png)
 49 | 
 50 | ## Sequence data representations
 51 | 
 52 | ### DNA / amino acid sequences: FASTA files
 53 | 
 54 | Input & manipulation: [Biostrings][]
 55 | 
 56 |     >NM_078863_up_2000_chr2L_16764737_f chr2L:16764737-16766736
 57 |     gttggtggcccaccagtgccaaaatacacaagaagaagaaacagcatctt
 58 |     gacactaaaatgcaaaaattgctttgcgtcaatgactcaaaacgaaaatg
 59 |     ...
 60 |     atgggtatcaagttgccccgtataaaaggcaagtttaccggttgcacggt
 61 |     >NM_001201794_up_2000_chr2L_8382455_f chr2L:8382455-8384454
 62 |     ttatttatgtaggcgcccgttcccgcagccaaagcactcagaattccggg
 63 |     cgtgtagcgcaacgaccatctacaaggcaatattttgatcgcttgttagg
 64 |     ...
 65 | 
 66 | Whole genomes: `2bit` and `.fa` formats: [rtracklayer][],
 67 | [Rsamtools][]; [BSgenome][]
 68 | 
 69 | ### Reads: FASTQ files
 70 | 
 71 | Input & manipulation: [ShortRead][] `readFastq()`, `FastqStreamer()`,
 72 | `FastqSampler()`
 73 | 
 74 |     @ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1
 75 |     CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT
 76 |     +
 77 |     HHGHHGHHHHHHHHDGG<GDGGE@GDGGD<?B8??ADAD<BE@EE8EGDGA3CB85*,77@>>CE?=896=:
 78 |     @ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1
 79 |     GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC
 80 |     +
 81 |     DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?########################
 82 |         
 83 | - Quality scores: 'phred-like', encoded. See
 84 |   [wikipedia](http://en.wikipedia.org/wiki/FASTQ_format#Encoding)
 85 | 
 86 | ### Aligned reads: BAM files (e.g., ERR127306_chr14.bam)
 87 | 
 88 | Input & manipulation: 'low-level' [Rsamtools][], `scanBam()`,
 89 | `BamFile()`; 'high-level' [GenomicAlignments][]
 90 | 
 91 | - Header
 92 | 
 93 |         @HD     VN:1.0  SO:coordinate
 94 |         @SQ     SN:chr1 LN:249250621
 95 |         @SQ     SN:chr10        LN:135534747
 96 |         @SQ     SN:chr11        LN:135006516
 97 |         ...
 98 |         @SQ     SN:chrY LN:59373566
 99 |         @PG     ID:TopHat       VN:2.0.8b       CL:/home/hpages/tophat-2.0.8b.Linux_x86_64/tophat --mate-inner-dist 150 --solexa-quals --max-multihits 5 --no-discordant --no-mixed --coverage-search --microexon-search --library-type fr-unstranded --num-threads 2 --output-dir tophat2_out/ERR127306 /home/hpages/bowtie2-2.1.0/indexes/hg19 fastq/ERR127306_1.fastq fastq/ERR127306_2.fastq
100 |   
101 | - Alignments: ID, flag, alignment and mate
102 |   
103 |         ERR127306.7941162       403     chr14   19653689        3       72M             =       19652348        -1413  ...
104 |         ERR127306.22648137      145     chr14   19653692        1       72M             =       19650044        -3720  ...
105 |         ERR127306.933914        339     chr14   19653707        1       66M120N6M       =       19653686        -213   ...
106 |         ERR127306.11052450      83      chr14   19653707        3       66M120N6M       =       19652348        -1551  ...
107 |         ERR127306.24611331      147     chr14   19653708        1       65M120N7M       =       19653675        -225   ...
108 |         ERR127306.2698854       419     chr14   19653717        0       56M120N16M      =       19653935        290    ...
109 |         ERR127306.2698854       163     chr14   19653717        0       56M120N16M      =       19653935        2019   ...
110 |             
111 | - Alignments: sequence and quality
112 |         
113 |         ... GAATTGATCAGTCTCATCTGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCC        *'%%%%%#&&%''#'&%%%)&&%%$%%'%%'&*****$))$)'')'%)))&)%%%%$'%%%%&"))'')%))
114 |         ... TTGATCAGTCTCATCTGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAG        '**)****)*'*&*********('&)****&***(**')))())%)))&)))*')&***********)****
115 |         ... TGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCT        '******&%)&)))&")')'')'*((******&)&'')'))$))'')&))$)**&&****************
116 |         ... TGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCT        ##&&(#')$')'%&&#)%$#$%"%###&!%))'%%''%'))&))#)&%((%())))%)%)))%*********
117 |         ... GAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCTT        )&$'$'$%!&&%&&#!'%'))%''&%'&))))''$""'%'%&%'#'%'"!'')#&)))))%$)%)&'"')))
118 |         ... TTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCTTCATGTGGCT        ++++++++++++++++++++++++++++++++++++++*++++++**++++**+**''**+*+*'*)))*)#
119 |         ... TTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCTTCATGTGGCT        ++++++++++++++++++++++++++++++++++++++*++++++**++++**+**''**+*+*'*)))*)#
120 |         
121 | - Alignments: Tags
122 | 
123 |         ... AS:i:0  XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:72 YT:Z:UU NH:i:2  CC:Z:chr22      CP:i:16189276   HI:i:0
124 |         ... AS:i:0  XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:72 YT:Z:UU NH:i:3  CC:Z:=  CP:i:19921600   HI:i:0
125 |         ... AS:i:0  XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:4  MD:Z:72 YT:Z:UU XS:A:+  NH:i:3  CC:Z:=  CP:i:19921465   HI:i:0
126 |         ... AS:i:0  XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:4  MD:Z:72 YT:Z:UU XS:A:+  NH:i:2  CC:Z:chr22      CP:i:16189138   HI:i:0
127 |         ... AS:i:0  XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:5  MD:Z:72 YT:Z:UU XS:A:+  NH:i:3  CC:Z:=  CP:i:19921464   HI:i:0
128 |         ... AS:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:72 NM:i:0  XS:A:+  NH:i:5  CC:Z:=  CP:i:19653717   HI:i:0
129 |         ... AS:i:0  XM:i:0  XO:i:0  XG:i:0  MD:Z:72 NM:i:0  XS:A:+  NH:i:5  CC:Z:=  CP:i:19921455   HI:i:1
130 | 
131 | ### Called variants: VCF files
132 | 
133 | Input and manipulation: [VariantAnnotation][] `readVcf()`,
134 | `readInfo()`, `readGeno()` selectively with `ScanVcfParam()`.
135 | 
136 | - Header
137 | 
138 |           ##fileformat=VCFv4.2
139 |           ##fileDate=20090805
140 |           ##source=myImputationProgramV3.1
141 |           ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
142 |           ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
143 |           ##phasing=partial
144 |           ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
145 |           ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
146 |           ...
147 |           ##FILTER=<ID=q10,Description="Quality below 10">
148 |           ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
149 |           ...
150 |           ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
151 |           ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
152 |           
153 | - Location
154 | 
155 |           #CHROM POS     ID        REF    ALT     QUAL FILTER ...
156 |           20     14370   rs6054257 G      A       29   PASS   ...
157 |           20     17330   .         T      A       3    q10    ...
158 |           20     1110696 rs6040355 A      G,T     67   PASS   ...
159 |           20     1230237 .         T      .       47   PASS   ...
160 |           20     1234567 microsat1 GTC    G,GTCT  50   PASS   ...
161 |           
162 | - Variant INFO
163 | 
164 |           #CHROM POS     ...	INFO                              ...
165 |           20     14370   ...	NS=3;DP=14;AF=0.5;DB;H2           ...
166 |           20     17330   ...	NS=3;DP=11;AF=0.017               ...
167 |           20     1110696 ...	NS=2;DP=10;AF=0.333,0.667;AA=T;DB ...
168 |           20     1230237 ...	NS=3;DP=13;AA=T                   ...
169 |           20     1234567 ...	NS=3;DP=9;AA=G                    ...
170 |     
171 | - Genotype FORMAT and samples
172 | 
173 |           ... POS     ...  FORMAT      NA00001        NA00002        NA00003
174 |           ... 14370   ...  GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
175 |           ... 17330   ...  GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3   0/0:41:3
176 |           ... 1110696 ...  GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2   2/2:35:4
177 |           ... 1230237 ...  GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
178 |           ... 1234567 ...  GT:GQ:DP    0/1:35:4       0/2:17:2       1/1:40:3
179 |             
180 | ### Genome annotations: BED, WIG, GTF, etc. files
181 | 
182 | Input: [rtracklayer][] `import()`
183 | 
184 | - BED: range-based annotation (see
185 |   http://genome.ucsc.edu/FAQ/FAQformat.html for definition of this and
186 |   related formats)
187 | - WIG / bigWig: dense, continuous-valued data
188 | - GTF: gene model
189 | 
190 |   - Component coordinates
191 |   
192 |               7   protein_coding  gene        27221129    27224842    .   -   . ...
193 |               ...
194 |               7   protein_coding  transcript  27221134    27224835    .   -   . ...
195 |               7   protein_coding  exon        27224055    27224835    .   -   . ...
196 |               7   protein_coding  CDS         27224055    27224763    .   -   0 ...
197 |               7   protein_coding  start_codon 27224761    27224763    .   -   0 ...
198 |               7   protein_coding  exon        27221134    27222647    .   -   . ...
199 |               7   protein_coding  CDS         27222418    27222647    .   -   2 ...
200 |               7   protein_coding  stop_codon  27222415    27222417    .   -   0 ...
201 |               7   protein_coding  UTR         27224764    27224835    .   -   . ...
202 |               7   protein_coding  UTR         27221134    27222414    .   -   . ...
203 |       
204 |   - Annotations
205 | 
206 |               gene_id "ENSG00000005073"; gene_name "HOXA11"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
207 |               ...
208 |               ... transcript_id "ENST00000006015"; transcript_name "HOXA11-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS5411";
209 |               ... exon_number "1"; exon_id "ENSE00001147062";
210 |               ... exon_number "1"; protein_id "ENSP00000006015";
211 |               ... exon_number "1";
212 |               ... exon_number "2"; exon_id "ENSE00002099557";
213 |               ... exon_number "2"; protein_id "ENSP00000006015";
214 |               ... exon_number "2";
215 |               ...
216 | 
217 | ## Sequence data in _R_ / _Bioconductor_
218 | 
219 | ### Role for Bioconductor
220 | 
221 | - Pre-processing and alignment
222 | - Data reduction
223 | - Statistical analysis
224 | - Comprehension -- integrative and 'down stream' analysis
225 | 
226 | ![Alt Sequencing Ecosystem](our_figures/SequencingEcosystem.png)
227 | 
228 | ### Sequences
229 | 
230 | _Biostrings_ classes for DNA or amino acid sequences
231 | 
232 | - XString, XStringSet, e.g., DNAString (genomes),
233 |   DNAStringSet (reads)
234 | 
235 | Methods
236 | 
237 | - [Cheat sheet](http://bioconductor.org/packages/release/bioc/vignettes/Biostrings/inst/doc/BiostringsQuickOverview.pdf)
238 | - Manipulation, e.g., `reverseComplement()`
239 | - Summary, e.g., `letterFrequency()`
240 | - Matching, e.g., `matchPDict()`, `matchPWM()`
241 | 
242 | Related packages
243 | 
244 | - [BSgenome][]
245 |     - Whole-genome representations
246 |     - Model organism or custom
247 |     - 'Masks' to exclude regions (pre-computed or arbitrary) from
248 |       calculations
249 | - [BSgenome][]: Whole-genome sequence representation & manipulation
250 | - [Rsamtools][]: `FaFile` class for indexed on-disk representation
251 | - [rtracklayer][]: UCSC '2bit' (`TwoBitFile`) class for indexed
252 |   on-disk representation
253 | 
254 | Example 
255 | 
256 | - Whole-genome sequences are distributed by ENSEMBL, NCBI, and others
257 |   as FASTA files; model organism whole genome sequences are packaged
258 |   into more user-friendly `BSgenome` packages. The following
259 |   calculates GC content across chr14.
260 | 
261 | ```{r BSgenome-require, message=FALSE}
262 | suppressPackageStartupMessages({        # attach the hg19 genome quietly
263 |     library(BSgenome.Hsapiens.UCSC.hg19)
264 | })
265 | chr14_range <- GRanges("chr14", IRanges(start=1, end=seqlengths(Hsapiens)["chr14"]))
266 | chr14_dna <- getSeq(Hsapiens, chr14_range)          # sequence covering all of chr14
267 | letterFrequency(chr14_dna, "GC", as.prob=TRUE)      # GC content as a proportion
268 | ```
269 | 
270 | ### Ranges
271 | 
272 | Ranges represent:
273 | - Data, e.g., aligned reads, ChIP peaks, SNPs, CpG islands, ...
274 | - Annotations, e.g., gene models, regulatory elements, methylated
275 |   regions
276 | - Ranges are defined by chromosome, start, end, and strand
277 | - Often, metadata is associated with each range, e.g., quality of
278 |   alignment, strength of ChIP peak
279 | 
280 | Many common biological questions are range-based
281 | - What reads overlap genes?
282 | - What genes are ChIP peaks nearest?
283 | - ...
284 | 
285 | 
286 | The [GenomicRanges][] package defines essential classes and methods
287 | 
288 | - `GRanges`
289 | 
290 |     ![Alt ](our_figures/GRanges.png)
291 | 
292 | - `GRangesList`
293 | 
294 |     ![Alt ](our_figures/GRangesList.png)
295 | 
296 | #### Range operations
297 | 
298 | ![Alt Ranges Algebra](our_figures/RangeOperations.png)
299 | 
300 | Ranges
301 | - IRanges
302 |   - `start()` / `end()` / `width()`
303 |   - List-like -- `length()`, subset, etc.
304 |   - 'metadata', `mcols()`
305 | - GRanges
306 |   - 'seqnames' (chromosome), 'strand'
307 |   - `Seqinfo`, including `seqlevels` and `seqlengths`
308 | 
309 | Intra-range methods
310 | - Independent of other ranges in the same object
311 | - GRanges variants strand-aware
312 | - `shift()`, `narrow()`, `flank()`, `promoters()`, `resize()`,
313 |   `restrict()`, `trim()`
314 | - See `?"intra-range-methods"`
315 | 
316 | Inter-range methods
317 | - Depends on other ranges in the same object
318 | - `range()`, `reduce()`, `gaps()`, `disjoin()`
319 | - `coverage()` (!)
320 | - see `?"inter-range-methods"`
321 | 
322 | Between-range methods
323 | - Functions of two (or more) range objects
324 | - `findOverlaps()`, `countOverlaps()`, ..., `%over%`, `%within%`,
325 |   `%outside%`; `union()`, `intersect()`, `setdiff()`, `punion()`,
326 |   `pintersect()`, `psetdiff()`
327 | 
328 | Example
329 | 
330 | ```{r ranges, message=FALSE}
331 | suppressPackageStartupMessages({
332 |     library(GenomicRanges)
333 | })
334 | gr <- GRanges("A", IRanges(c(10, 20, 22), width=5), "+")
335 | shift(gr, 1)                            # intra-range; 1-based coordinates!
336 | range(gr)                               # inter-range
337 | reduce(gr)                              # inter-range
338 | coverage(gr)                            # inter-range
339 | setdiff(range(gr), gr)                  # between-range; 'introns'
340 | ```
341 | 
342 | IRangesList, GRangesList
343 | - List: all elements of the same type
344 | - Many *List-aware methods, but a common 'trick': apply a vectorized
345 |   function to the unlisted representation, then re-list
346 | 
347 |         grl <- GRangesList(...)
348 |         orig_gr <- unlist(grl)                          # one flat GRanges
349 |         transformed_gr <- FUN(orig_gr)                  # vectorized work on the flat object
350 |         transformed_grl <- relist(transformed_gr, grl)  # regroup, using grl as the skeleton
351 |         
352 | Reference
353 | 
354 | - Lawrence M, Huber W, Pagès H, Aboyoun P, Carlson M, et al. (2013)
355 |   Software for Computing and Annotating Genomic Ranges. PLoS Comput
356 |   Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118
357 | 
358 | ### [GenomicAlignments][] (Aligned reads)
359 | 
360 | Classes -- GenomicRanges-like behavior
361 | 
362 | - GAlignments, GAlignmentPairs, GAlignmentsList
363 | - SummarizedExperiment
364 |   - Matrix where rows are indexed by genomic ranges, columns by a
365 |     DataFrame.
366 | 
367 | Methods
368 | 
369 | - `readGAlignments()`, `readGAlignmentsList()`
370 |   - Easy to restrict input, iterate in chunks
371 | - `summarizeOverlaps()`
372 | 
373 | Example
374 | 
375 | - Find reads supporting the junction identified above, at position
376 |   19653707 + 66M = 19653773 of chromosome 14
377 | 
378 | ```{r bam-require}
379 | suppressPackageStartupMessages({
380 |     library(GenomicRanges)
381 |     library(GenomicAlignments)
382 |     library(Rsamtools)
383 | })
384 | 
385 | ## our 'region of interest': the single base at the junction, chr14:19653773
386 | roi <- GRanges("chr14", IRanges(19653773, width=1)) 
387 | ## sample data: the first BAM file listed by the data package, opened with asMates=TRUE
388 | suppressPackageStartupMessages({
389 |     library('RNAseqData.HNRNPC.bam.chr14')
390 | })
391 | bf <- BamFile(RNAseqData.HNRNPC.bam.chr14_BAMFILES[[1]], asMates=TRUE)
392 | ## alignments (whole file), junctions summarized from them, junctions overlapping our roi
393 | paln <- readGAlignmentsList(bf)
394 | j <- summarizeJunctions(paln, with.revmap=TRUE)
395 | j_overlap <- j[j %over% roi]
396 | 
397 | ## supporting reads: elements of 'paln' indexed by the first overlapping junction's revmap
398 | paln[j_overlap$revmap[[1]]]
399 | ```
400 |   
401 | ### [VariantAnnotation][] (called variants)
402 | 
403 | Classes -- GenomicRanges-like behavior
404 | 
405 | - VCF -- 'wide'
406 | - VRanges -- 'tall'
407 | 
408 | Functions and methods
409 | 
410 | - I/O and filtering: `readVcf()`, `readGeno()`, `readInfo()`,
411 |   `readGT()`, `writeVcf()`, `filterVcf()`
412 | - Annotation: `locateVariants()` (variants overlapping ranges),
413 |   `predictCoding()`, `summarizeVariants()`
414 | - SNPs: `genotypeToSnpMatrix()`, `snpSummary()`
415 | 
416 | Example
417 | 
418 | - Read variants from a VCF file, and annotate with respect to a known
419 |   gene model
420 |   
421 | ```{r vcf, message=FALSE}
422 | ## input variants: example chr22 VCF file shipped with VariantAnnotation
423 | suppressPackageStartupMessages({
424 |     library(VariantAnnotation)
425 | })
426 | fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
427 | vcf <- readVcf(fl, "hg19")
428 | seqlevels(vcf) <- "chr22"               # UCSC-style name, presumably to match the TxDb
429 | ## known gene model (UCSC hg19 knownGene transcript database)
430 | suppressPackageStartupMessages({
431 |    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
432 | })
433 | coding <- locateVariants(rowData(vcf),  # variant ranges ...
434 |     TxDb.Hsapiens.UCSC.hg19.knownGene,  # ... located relative to the gene model ...
435 |     CodingVariants())                   # ... keeping those in coding regions
436 | head(coding)
437 | ```
438 | 
439 | Related packages
440 | 
441 | - [ensemblVEP][] 
442 |   - Forward variants to Ensembl Variant Effect Predictor
443 | - [VariantTools][], [h5vc][]
444 |   - Call variants
445 | - [VariantFiltering][]
446 |   - Filter variants using criteria such as coding consequence, MAF,
447 |     ..., inheritance model
448 | 
449 | Reference
450 | 
451 | - Obenchain, V, Lawrence, M, Carey, V, Gogarten, S, Shannon, P, and
452 |   Morgan, M. VariantAnnotation: a Bioconductor package for exploration
453 |   and annotation of genetic variants. Bioinformatics, first published
454 |   online March 28, 2014
455 |   [doi:10.1093/bioinformatics/btu168](http://bioinformatics.oxfordjournals.org/content/early/2014/04/21/bioinformatics.btu168)
456 | 
457 | ### [rtracklayer][] (Genome annotations)
458 | 
459 | - Import BED, GTF, WIG, etc
460 | - Export GRanges to BED, GTF, WIG, ...
461 | - Access UCSC genome browser
462 | 
463 | ## A sequence analysis package tour
464 | 
465 | This very open-ended topic points to some of the most prominent
466 | Bioconductor packages for sequence analysis. Use the opportunity in
467 | this lab to explore the package vignettes and help pages highlighted
468 | below; much of the material will be covered in greater detail in
469 | subsequent labs and lectures.
470 | 
471 | Basics 
472 | 
473 | - Bioconductor packages are listed on the [biocViews][] page. Each
474 |   package has 'biocViews' (tags from a controlled vocabulary)
475 |   associated with it; these can be searched to identify appropriately
476 |   tagged packages, as can the package title and author.
477 | - Each package has a 'landing page', e.g., for
478 |   [GenomicRanges][]. Visit this landing page, and note the
479 |   description, authors, and installation instructions. Packages are
480 |   often written up in the scientific literature, and if available the
481 |   corresponding citation is present on the landing page. Also on the
482 |   landing page are links to the vignettes and reference manual and, at
483 |   the bottom, an indication of cross-platform availability and
484 |   download statistics.
485 | -   A package needs to be installed once, using the instructions on the
486 |     landing page. Once installed, the package can be loaded into an R
487 |     session
488 | 
489 |     ```{r require}
490 |     suppressPackageStartupMessages({
491 |         library(GenomicRanges)          # attach the package for this session
492 |     })
493 |     ```
494 |     
495 |     and the help system queried interactively, as outlined above:
496 | 
497 |     ```{r help, eval=FALSE}
498 |       help(package="GenomicRanges")                             # package help index
499 |       vignette(package="GenomicRanges")                         # list available vignettes
500 |       vignette(package="GenomicRanges", "GenomicRangesHOWTOs")  # open a specific vignette
501 |       ?GRanges                                                  # help page for 'GRanges'
502 |     ```
503 |     
504 | Domain-specific analysis -- explore the landing pages, vignettes, and
505 | reference manuals of two or three of the following packages.
506 | 
507 | - Important packages for analysis of differential expression include
508 |   [edgeR][] and [DESeq2][]; both have excellent vignettes for
509 |   exploration. Additional research methods embodied in Bioconductor
510 |   packages can be discovered by visiting the [biocViews][] web page,
511 |   searching for the 'DifferentialExpression' view term, and narrowing
512 |   the selection by searching for 'RNA seq' and similar.
513 | - Popular ChIP-seq packages include [DiffBind][] for comparison of
514 |   peaks across samples, [ChIPQC][] for quality assessment, and
515 |   [ChIPpeakAnno][] for annotating results (e.g., discovering nearby
516 |   genes). What other ChIP-seq packages are listed on the [biocViews][]
517 |   page?
518 | - Working with called variants (VCF files) is facilitated by packages
519 |   such as [VariantAnnotation][], [VariantFiltering][], [ensemblVEP][],
520 |   and [SomaticSignatures][]; packages for calling variants include,
521 |   e.g., [h5vc][] and [VariantTools][].
522 | - Several packages identify copy number variants from sequence data,
523 |   including [cn.mops][]; from the [biocViews][] page, what other copy
524 |   number packages are available? The [CNTools][] package provides some
525 |   useful facilities for comparison of segments across samples.
526 | - Microbiome and metagenomic analysis is facilitated by packages such
527 |   as [phyloseq][] and [metagenomeSeq][].
528 | - Metabolomics, chemoinformatics, image analysis, and many other
529 |   high-throughput analysis domains are also represented in
530 |   Bioconductor; explore these via biocViews and title searches.
531 |   
532 | Working with sequences, alignments, common web file formats, and raw
533 | data; these packages rely very heavily on the [IRanges][] /
534 | [GenomicRanges][] infrastructure that we will encounter later in the
535 | course.
536 | 
537 | - The [Biostrings][] package is used to represent DNA and other
538 |   sequences, with many convenient sequence-related functions. Check
539 |   out the functions documented on the help page `?consensusMatrix`,
540 |   for instance. Also check out the [BSgenome][] package for working
541 |   with whole genome sequences, e.g., `?"getSeq,BSgenome-method"`
542 | - The [GenomicAlignments][] package is used to input reads aligned to
543 |   a reference genome. See for instance the `?readGAlignments` help
544 |   page and `vignette(package="GenomicAlignments",
545 |   "summarizeOverlaps")`
546 | - [rtracklayer][]'s `import` and `export` functions can read in many
547 |   common file types, e.g., BED, WIG, GTF, ..., in addition to querying
548 |   and navigating the UCSC genome browser. Check out the `?import` page
549 |   for basic usage.
550 | - The [ShortRead][] and [Rsamtools][] packages can be used for
551 |   lower-level access to FASTQ and BAM files, respectively. Explore the
552 |   [ShortRead vignette](http://bioconductor.org/packages/release/bioc/vignettes/ShortRead/inst/doc/Overview.pdf)
553 |   and Scalable Genomics labs to see approaches to effectively
554 |   processing the large files.
555 | 
556 | Visualization
557 | 
558 | - The [Gviz][] package provides great tools for visualizing local
559 |   genomic coordinates and associated data. 
560 | - [epivizr][] drives the [epiviz](http://epiviz.cbcb.umd.edu/) genome
561 |   browser from within R; [rtracklayer][] provides easy ways to
562 |   transfer data to and manipulate UCSC browser sessions.
563 | - Additional packages include [ggbio][], [OmicCircos][], ...
564 | 
565 | ## Lab
566 | 
567 | ### Short read quality assessment
568 | 
569 | `fastqc` is a Java program commonly used for summarizing quality of
570 | fastq files. It has a straight-forward graphical user interface. Here
571 | we will use the command-line version.
572 | 
573 | 1. From within _Rstudio_, choose 'Tools --> Shell...', or log on to
574 |    your Amazon machine instance using a Mac / linux terminal or on
575 |    Windows the PuTTY program.
576 | 
577 | 2. Run fastqc on sample fastq files, sending the output to the
578 |    `~/fastqc_reports` directory.
579 | 
580 |         fastqc fastq/*fastq --threads 8 --outdir=fastqc_reports
581 | 
582 | 3. Study the quality report and resulting on-line [documentation](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/):
583 |    In the Files tab, click on `fastqc_reports`. Click on the HTML file 
584 |    there and then click on "View in Web Browser".
585 | 
586 | `r Biocpkg("ShortRead")` provides similar functionality, but from
587 | within _R_. The following shows that _R_ can handle large data, and
588 | illustrates some of the basic ways in which one might interact with
589 | functionality provided by a _Bioconductor_ package.
590 | 
591 | ```{r ShortRead, message=FALSE}
592 | ## 1. attach ShortRead and BiocParallel
593 | suppressPackageStartupMessages({
594 |     library(ShortRead)
595 |     library(BiocParallel)
596 | })
597 | 
598 | ## 2. create a vector of full paths to the fastq files in ~/fastq
599 | fls <- dir("~/fastq", pattern="*fastq", full.names=TRUE)
600 | 
601 | ```
602 | 
603 | ```{r fakestats, eval=FALSE}
604 | ## 3. collect statistics (displayed only; this chunk is not evaluated)
605 | stats0 <- qa(fls)
606 | ```
607 | 
608 | ```{r realstats, echo=FALSE, results="hide"}
609 | data(stats0)                            # hidden chunk: load the 'stats0' data set instead
610 | ```
611 | 
612 | ```{r browseStats}
613 | ## 4. generate and browse the report
614 | if (interactive())                      # open a browser only in interactive sessions
615 |     browseURL(report(stats0))
616 | ```
617 | 
618 | Check out the qa report from all lanes
619 | 
620 | ```{r ShortRead-qa-all}
621 | data(qa_all)                            # load the 'qa_all' data set
622 | if (interactive())                      # open a browser only in interactive sessions
623 |     browseURL(report(qa_all))
624 | ```
625 | 
626 | ### Alignments (and genomic annotations)
627 | 
628 | This data is from the `r Biocexptpkg("airway")` Bioconductor
629 | experiment data package; see the
630 | [vignette](http://bioconductor.org/packages/release/data/experiment/vignettes/airway/inst/doc/airway.html)
631 | for details
632 | 
633 | Integrative Genomics Viewer
634 | 
635 | 1. Start IGV and select the "hg19" genome.
636 | 
637 | 2. The sequence names used in the reference genome differ from those
638 |    used by IGV to represent the identical genome. We need to map
639 |    between these different sequence names, following the instructions
640 |    for
641 |    [Creating a Chromosome Name Alias File](http://www.broadinstitute.org/software/igv/LoadData/#aliasfile).
642 | 
643 |    Copy the file `hg19_alias.tab` from the location specified in class
644 |    into the directory `<user_home>/igv/genomes/`. Restart IGV.
645 | 
646 | 3. Start igv.
647 | 
648 | 4. Choose hg19 from the drop-down menu at the top left of
649 |    the screen
650 | 
651 | 5. Use `File -> Load from URL` menu to load a bam file. The URLs will
652 |    be provided during class.
653 | 
654 | 6. Zoom in to a particular gene, e.g., SPARCL1, by entering the gene
655 |    symbol in the box toward the center of the browser window. Adjust
656 |    the zoom until reads come in to view, and interpret the result.
657 | 
658 | _Bioconductor_: we'll explore how to map between different types of
659 | identifiers, how to navigate genomic coordinates, and how to query BAM
660 | files for aligned reads.
661 | 
662 | 1. Attach 'Annotation' packages containing information about gene
663 |    symbols `r Biocannopkg("org.Hs.eg.db")` and genomic coordinates
664 |    (e.g., genes, exons, cds, transcripts) `r
665 |    Biocannopkg("TxDb.Hsapiens.UCSC.hg19.knownGene")`. Arrange for the
666 |    'seqlevels' (chromosome names) in the TxDb package to match those
667 |    in the BAM files.
668 | 
669 | 2. Use an appropriate `org.*` package to map from gene symbol to
670 |    Entrez gene id, and the appropriate `TxDb.*` package to retrieve
671 |    gene coordinates of the SPARCL1 gene. N.B. -- The following uses a
672 |    single gene symbol, but we could have used 1, 2, or all gene
673 |    symbols in a _vectorized_ fashion.
674 | 
675 | 3. Attach the `r Biocpkg("GenomicAlignments")` package for working
676 |    with aligned reads. Use `range()` to get the genomic coordinates
677 |    spanning the first and last exon of SPARCL1. Input paired reads
678 |    overlapping SPARCL1.
679 | 
680 | 4. What questions can you easily answer about these alignments? E.g.,
681 |    how many reads overlap this region of interest?
682 | 
683 |     ```{r setup-view, message=FALSE, warning=FALSE}
684 |     ## 1.a 'Annotation' packages
685 |     suppressPackageStartupMessages({
686 |         library(TxDb.Hsapiens.UCSC.hg19.knownGene)
687 |         library(org.Hs.eg.db)
688 |     })
689 |     
690 |     ## 1.b -- map 'seqlevels' as recorded in the TxDb file to those in the
691 |     ## BAM file; 'map' holds column V1 of the alias file, named by column V2
692 |     fl <- "~/igv/genomes/hg19_alias.tab"
693 |     map <- with(read.delim(fl, header=FALSE, stringsAsFactors=FALSE),
694 |         setNames(V1, V2))
695 |     seqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, force=TRUE) <- map
696 |     
697 |     ## 2. Symbol -> Entrez ID -> Gene coordinates (exons grouped by gene)
698 |     sym2eg <- select(org.Hs.eg.db, "SPARCL1", "ENTREZID", "SYMBOL")
699 |     exByGn <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")
700 |     sparcl1exons <- exByGn[[sym2eg$ENTREZID]]
701 |     
702 |     ## 3. Aligned reads: paired reads overlapping the span of the SPARCL1 exons
703 |     suppressPackageStartupMessages({
704 |         library(GenomicAlignments)
705 |     })
706 |     
707 |     fl <- "~/bam/SRR1039508_sorted.bam"
708 |     sparcl1gene <- range(sparcl1exons)          # span from first to last exon
709 |     param <- ScanBamParam(which=sparcl1gene)    # restrict input to that span
710 |     aln <- readGAlignmentPairs(fl, param=param)
711 |     ```
712 |     
713 | 5. As another exercise we ask how many of the reads we've input are
714 |    compatible with the known gene model. We have to find the
715 |    transcripts that belong to our gene, and then exons grouped by
716 |    transcript
717 | 
718 |     ```{r compatibleAlignments, warning=FALSE}
719 |     ## 5.a. exons-by-transcript for our gene of interest; select by name, not position
720 |     txids <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, sym2eg$ENTREZID,
721 |         "TXID", "GENEID")$TXID
722 |     exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx")[as.character(txids)]
723 |     
724 |     ## 5.b compatible alignments
725 |     hits <- findCompatibleOverlaps(query=aln, subject=exByTx)
726 |     good <- seq_along(aln) %in% queryHits(hits)     # TRUE for alignments with any hit
727 |     table(good)
728 |     ```
729 |     
730 | 6. Finally, let's go from gene model to protein coding
731 |    sequence. (a) Extract CDS regions grouped by transcript, select just
732 |    transcripts we're interested in, (b) attach and then extract the coding
733 |    sequence from the appropriate reference genome, and (c) translate the
734 |    coding sequences to proteins.
735 | 
736 |     ```{r coding-sequence, warning=FALSE}
737 |     ## reset seqlevels (undo the earlier renaming applied to the TxDb)
738 |     restoreSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene)
739 |     
740 |     ## a. cds coordinates, grouped by transcript
741 |     txids <- select(TxDb.Hsapiens.UCSC.hg19.knownGene, sym2eg$ENTREZID,
742 |         "TXID", "GENEID")$TXID
743 |     cdsByTx <- cdsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx")
744 |     cdsByTx <- cdsByTx[names(cdsByTx) %in% txids]   # match by transcript id, not position
745 |     ## b. coding sequence from relevant reference genome
746 |     suppressPackageStartupMessages({
747 |         library(BSgenome.Hsapiens.UCSC.hg19)
748 |     })
749 |     
750 |     dna <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19, cdsByTx)
751 |     protein <- translate(dna)                       # c. coding DNA -> protein
752 |     ```
753 | 
754 | ### Working with genomic ranges
755 | 
756 | Visit the "GenomicRanges HOWTOs" vignette. 
757 | 
758 | ```{r GenomicRanges-howtos, eval=FALSE}
759 | browseVignettes("GenomicRanges")        # browse the package's vignettes
760 | ```
761 | 
762 | Read section 1, and
763 | do exercises 2.2, 2.4, 2.5, 2.8, 2.12, and 2.13. Perhaps select
764 | additional topics of particular interest to you.
765 | 
766 | ## Resources
767 | 
768 | _R_ / _Bioconductor_
769 | 
770 | - [Web site][Bioconductor] -- install, learn, use, develop _R_ /
771 |   _Bioconductor_ packages
772 | - [Support](http://support.bioconductor.org) -- seek help and
773 |   guidance; also
774 |   [StackOverflow](http://stackoverflow.com/questions/tagged/r) for _R_
775 |   programming questions
776 | - [biocViews](http://bioconductor.org/packages/release/BiocViews.html)
777 |   -- discover packages
778 | - Package landing pages, e.g.,
779 |   [GenomicRanges](http://bioconductor.org/packages/release/bioc/html/GenomicRanges.html),
780 |   including title, description, authors, installation instructions,
781 |   vignettes (e.g., GenomicRanges '[How
782 |   To](http://bioconductor.org/packages/release/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesHOWTOs.pdf)'),
783 |   etc.
784 | - [Course](http://bioconductor.org/help/course-materials/) and other
785 |   [help](http://bioconductor.org/help/) material (e.g., videos, EdX
786 |   course, community blogs, ...)
787 | 
788 | Publications and presentations
789 | 
790 | - Lawrence M, Huber W, Pagès H, Aboyoun P, Carlson M, et al. (2013)
791 |   Software for Computing and Annotating Genomic Ranges. PLoS Comput
792 |   Biol 9(8): e1003118. doi:
793 |   [10.1371/journal.pcbi.1003118][GRanges.bib]
794 | 
795 | - Lawrence, M. 2014. Software for Enabling Genomic Data
796 |   Analysis. Bioc2014 conference [slides](http://bioconductor.org/help/course-materials/2014/BioC2014/Lawrence_Talk.pdf).
797 | 
798 | <!-- Bibliography -->
799 | 
800 | [R]: http://r-project.org
801 | [Bioconductor]: http://bioconductor.org
802 | [GRanges.bib]: https://doi.org/10.1371/journal.pcbi.1003118
803 | [Scalable.bib]: http://arxiv.org/abs/1409.2864
804 | [Lawrence.bioc2014.bib]:
805 |     http://bioconductor.org/help/course-materials/2014/BioC2014/Lawrence_Talk.pdf
806 | 
807 | 
808 | [AnnotationData]: http://bioconductor.org/packages/release/BiocViews.html#___AnnotationData
809 | [AnnotationDbi]: http://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html
810 | [AnnotationHub]: http://bioconductor.org/packages/release/bioc/html/AnnotationHub.html
811 | [BSgenome.Hsapiens.UCSC.hg19]: http://bioconductor.org/packages/release/data/annotation/html/BSgenome.Hsapiens.UCSC.hg19.html
812 | [BSgenome]: http://bioconductor.org/packages/release/bioc/html/BSgenome.html
813 | [Biostrings]: http://bioconductor.org/packages/release/bioc/html/Biostrings.html
814 | [Bsgenome.Hsapiens.UCSC.hg19]: http://bioconductor.org/packages/release/data/annotation/html/BSgenome.Hsapiens.UCSC.hg19.html
815 | [CNTools]: http://bioconductor.org/packages/release/bioc/html/CNTools.html
816 | [ChIPQC]: http://bioconductor.org/packages/release/bioc/html/ChIPQC.html
817 | [ChIPpeakAnno]: http://bioconductor.org/packages/release/bioc/html/ChIPpeakAnno.html
818 | [DESeq2]: http://bioconductor.org/packages/release/bioc/html/DESeq2.html
819 | [DiffBind]: http://bioconductor.org/packages/release/bioc/html/DiffBind.html
820 | [GenomicAlignments]: http://bioconductor.org/packages/release/bioc/html/GenomicAlignments.html
821 | [GenomicRanges]: http://bioconductor.org/packages/release/bioc/html/GenomicRanges.html
822 | [Homo.sapiens]: http://bioconductor.org/packages/release/data/annotation/html/Homo.sapiens.html
823 | [IRanges]: http://bioconductor.org/packages/release/bioc/html/IRanges.html
824 | [KEGGREST]: http://bioconductor.org/packages/release/bioc/html/KEGGREST.html
825 | [PSICQUIC]: http://bioconductor.org/packages/release/bioc/html/PSICQUIC.html
826 | [Rsamtools]: http://bioconductor.org/packages/release/bioc/html/Rsamtools.html
827 | [Rsubread]: http://bioconductor.org/packages/release/bioc/html/Rsubread.html
828 | [ShortRead]: http://bioconductor.org/packages/release/bioc/html/ShortRead.html
829 | [SomaticSignatures]: http://bioconductor.org/packages/release/bioc/html/SomaticSignatures.html
830 | [TxDb.Hsapiens.UCSC.hg19.knownGene]: http://bioconductor.org/packages/release/data/annotation/html/TxDb.Hsapiens.UCSC.hg19.knownGene.html
831 | [VariantAnnotation]: http://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html
832 | [VariantFiltering]: http://bioconductor.org/packages/release/bioc/html/VariantFiltering.html
833 | [VariantTools]: http://bioconductor.org/packages/release/bioc/html/VariantTools.html
834 | [biocViews]: http://bioconductor.org/packages/release/BiocViews.html#___Software
835 | [biomaRt]: http://bioconductor.org/packages/release/bioc/html/biomaRt.html
836 | [cn.mops]: http://bioconductor.org/packages/release/bioc/html/cn.mops.html
837 | [edgeR]: http://bioconductor.org/packages/release/bioc/html/edgeR.html
838 | [ensemblVEP]: http://bioconductor.org/packages/release/bioc/html/ensemblVEP.html 
839 | [h5vc]: http://bioconductor.org/packages/release/bioc/html/h5vc.html
840 | [limma]: http://bioconductor.org/packages/release/bioc/html/limma.html
841 | [metagenomeSeq]: http://bioconductor.org/packages/release/bioc/html/metagenomeSeq.html
842 | [org.Hs.eg.db]: http://bioconductor.org/packages/release/data/annotation/html/org.Hs.eg.db.html
843 | [org.Sc.sgd.db]: http://bioconductor.org/packages/release/data/annotation/html/org.Sc.sgd.db.html
844 | [phyloseq]: http://bioconductor.org/packages/release/bioc/html/phyloseq.html
845 | [rtracklayer]: http://bioconductor.org/packages/release/bioc/html/rtracklayer.html
846 | [snpStats]: http://bioconductor.org/packages/release/bioc/html/snpStats.html
847 | [Gviz]: http://bioconductor.org/packages/release/bioc/html/Gviz.html
848 | [epivizr]: http://bioconductor.org/packages/release/bioc/html/epivizr.html
849 | [ggbio]: http://bioconductor.org/packages/release/bioc/html/ggbio.html
850 | [OmicCircos]: http://bioconductor.org/packages/release/bioc/html/OmicCircos.html
851 | 


--------------------------------------------------------------------------------
/vignettes/A01_Introduction.Rmd:
--------------------------------------------------------------------------------
 1 | <!--
 2 | %\VignetteIndexEntry{01. Learning R / Bioconductor for Sequence Analysis}
 3 | %\VignettePackage{LearnBioconductor}
 4 | %\VignetteEngine{knitr::knitr}
 5 | -->
 6 | 
 7 | ```{r setup, echo=FALSE}
 8 | library(LearnBioconductor)
 9 | stopifnot(BiocInstaller::biocVersion() == "3.0")    # halt unless running Bioconductor 3.0
10 | ```
11 | 
12 | ```{r style, echo = FALSE, results = 'asis'}
13 | BiocStyle::markdown()
14 | ```
15 | 
16 | # Learning R / Bioconductor for Sequence Analysis
17 | 
18 | This package contains training material for a Fall, 2014 introductory
19 | _R_ / _Bioconductor_ course "Learning _R_ / _Bioconductor_ for Sequence
20 | Analysis", offered October 27-29, Seattle, WA.
21 | 
22 | This course is directed at beginning and intermediate users who
23 | would like an introduction to the analysis and comprehension of
24 | high-throughput sequence data using _R_ and _Bioconductor_. Day 1 focuses
25 | on learning essential background: an introduction to the _R_ programming
26 | language; central concepts for effective use of _Bioconductor_ software;
27 | and an overview of high-throughput sequence analysis work flows. Day 2
28 | emphasizes use of _Bioconductor_ for specific tasks: an RNA-seq
29 | differential expression work flow; exploratory, machine learning, and
30 | other statistical tasks; gene set enrichment; and annotation.  Day 3
31 | transitions to understanding effective approaches for managing larger
32 | challenges: strategies for working with large data, writing re-usable
33 | functions, developing reproducible reports and work flows, and
34 | visualizing results.
35 | 
36 | The course combines lectures with extensive hands-on practicals;
37 | students are required to bring a laptop with wireless internet access
38 | and a modern version of the Chrome or Safari web browser.
39 | 
40 | ## Schedule (tentative)
41 | 
42 | Day 1: Learn _R_ / _Bioconductor_
43 | 
44 | - 9:00 - 10:30 [Introduction to _R_](A01.1_IntroductionToR.html):
45 |   objects, functions, help!
46 | - 11:00 - 12:30
47 |   [Introduction to _Bioconductor_](A01.2_IntroductionToBioconductor.html):
48 |   working with packages and classes
49 | - 1:30 - 5:00 (break: 3:00 - 3:30)
50 |   [Introduction to sequence analysis](A01.3_BioconductorForSequenceAnalysis.html):
51 |   typical work flow; data types and quality assessment; essential
52 |   _Bioconductor_ packages
53 | 
54 | Day 2: Use _R_ / _Bioconductor_ 
55 | 
56 | - 9:00 - 12:30 (break: 10:30 - 11:00) An RNA-seq differential
57 |   expression work flow ([lecture](B02.1_RNASeq.html);
58 |   [practical](B02.1.1_RNASeqLab.html))
59 | - 1:30 - 2:00 [Other work flows](B02.2_CommonWorkFlows.html) (survey):
60 |   ChIP-seq, variants, copy number, epigenomics
61 | - 2:00 - 3:00 [Machine learning](B02.3_MachineLearning.html);
62 |   exploratory and other statistical analysis
63 | - 3:30 - 4:00
64 |   [Annotating genes, genomes, and variants](B02.4_Annotation.html)
65 | - 4:00 - 5:00
66 |   [Approaches to gene set enrichment](B02.5_GeneSetEnrichment.html)
67 | 
68 | Day 3: Develop Skills and Best Practices
69 | 
70 | - 9:00 - 10:30 [Working with large data](C03.1_LargeData.html)
71 | - 11:00 - 12:30
72 |   [Organizing code in functions, files, and packages](C03.2_CodeToPackages.html)
73 | - 1:30 - 3:00
74 |   [Reproducible reports and work flows](C03.3_ReproducibleResearch.html)
75 | - 3:30 - 4:30 [Visualization](C03.4_Visualization.html)
76 | - 4:30 - 5:00 Summary
77 | 


--------------------------------------------------------------------------------
/vignettes/B02.1_RNASeq.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{02.1 RNA-Seq Work Flows}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r style, echo = FALSE, results = 'asis'}
  8 | BiocStyle::markdown()
  9 | knitr::opts_chunk$set(tidy=FALSE)
 10 | ```
 11 | 
 12 | ```{r setup, echo=FALSE}
 13 | library(LearnBioconductor)
 14 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 15 | ```
 16 | 
 17 | # RNA-Seq Work Flows
 18 | 
 19 | Martin Morgan, Sonali Arora<br/>
 20 | October 28, 2014
 21 | 
 22 | ## 7-step work flow
 23 | 
 24 | 
 25 | ### 1. Experimental design
 26 | 
 27 | Keep it simple
 28 | 
 29 | - Classical experimental designs
 30 | - Time series
 31 | - Without missing values, where possible
 32 | - Intended analysis must be feasible -- can the available samples and
 33 |   hypothesis of interest be combined to formulate a testable
 34 |   statistical hypothesis?
 35 | 
 36 | Replicate
 37 | 
 38 | - Extent of replication determines nuance of biological question.
 39 | - No replication (1 sample per treatment): qualitative description
 40 |   with limited statistical options.
 41 | - 3-5 replicates per treatment: designed experimental manipulation
 42 |   with cell lines or other well-defined entities; 2-fold (?)
 43 |   change in average expression between groups.
 44 | - 10-50 replicates per treatment: population studies, e.g., cancer
 45 |   cell lines.
 46 | - 1000's of replicates: prospective studies, e.g., SNP discovery
 47 | - One resource: `r Biocpkg("RNASeqPower")`
 48 | 
 49 | Avoid confounding experimental factors with other factors
 50 | 
 51 | - Common problems: samples from one treatment all on the same flow
 52 |   cell; samples from treatment 1 processed first, treatment 2
 53 |   processed second, etc.
 54 | 
 55 | Record co-variates
 56 |        
 57 | Be aware of _batch effects_
 58 | 
 59 | - Leek et al., 2010, Nature Reviews Genetics 11
 60 |   [733-739](http://www.nature.com/nrg/journal/v11/n10/abs/nrg2825.html),
 61 |   Leek & Storey, 2007, PLoS Genet 3(9):
 62 |   [e161](https://doi.org/10.1371/journal.pgen.0030161).
 63 | - Scientific finding: pervasive batch effects
 64 | - Statistical insights: surrogate variable analysis: identify and
 65 |   build surrogate variables; remove known batch effects
 66 | - Benefits: reduce dependence, stabilize error rate estimates, and
 67 |   improve reproducibility
 68 | - _combat_ software / `r Biocpkg("sva")` _Bioconductor_ package 
 69 | 
 70 |   ![](our_figures/nrg2825-f2.jpg) 
 71 |   HapMap samples from one facility, ordered by date of processing.
 72 | 
 73 | ### 2. Wet-lab
 74 | 
 75 | Confounding factors
 76 | 
 77 | - Record or avoid
 78 | 
 79 | Artifacts of your _particular_ protocols
 80 | 
 81 | - Sequence contaminants
 82 | - Enrichment bias, e.g., non-uniform transcript representation.
 83 | - PCR artifacts -- adapter contaminants, sequence-specific
 84 |   amplification bias, ...
 85 | 
 86 | ### 3. Sequencing
 87 | 
 88 | Axes of variation
 89 | 
 90 | - Single- versus paired-end
 91 | - Length: 50-200nt
 92 | - Number of reads per sample
 93 | 
 94 | Application-specific, e.g.,
 95 | 
 96 | - ChIP-seq: short, single-end reads are usually sufficient
 97 | - RNA-seq, known genes: single- or  paired-end reads
 98 | - RNA-seq, transcripts or novel variants: paired-end reads
 99 | - Copy number: single- or paired-end reads
100 | - Structural variants: paired-end reads
101 | - Variants: depth via longer, paired-end reads
102 | - Microbiome: long paired-end reads (overlapping ends)
103 | 
104 | ### 4. Alignment
105 | 
106 | Alignment strategies
107 | 
108 | - _de novo_
109 |   - No reference genome; considerable sequencing and computational
110 |     resources
111 | - Genome
112 |   - Established reference genome
113 |   - Splice-aware aligners
114 |   - Novel transcript discovery
115 | - Transcriptome
116 |   - Established reference genome; reliable gene model
117 |   - Simple aligners
118 |   - Known gene / transcript expression
119 | 
120 | Splice-aware aligners (and _Bioconductor_ wrappers)
121 | 
122 | - [Bowtie2][] (`r Biocpkg("Rbowtie")`)
123 | - [STAR][] ([doi](https://doi.org/10.1093/bioinformatics/bts635))
124 | - [GMAP/GSNAP][] (`r Biocpkg("gmapR")`)
125 | - subread ([doi](https://doi.org/10.1093/nar/gkt214))
126 |   (`r Biocpkg("Rsubread")`)
127 | - Systematic evaluation (Engstrom et al., 2013,
128 |   [doi](https://doi.org/10.1038/nmeth.2722))
129 | 
130 | ### (5a. Bowtie2 / tophat / Cufflinks / Cuffdiff)
131 | 
132 | - [tophat][] uses [Bowtie2][] to perform basic single- and paired-end
133 |   alignments, then uses algorithms to place difficult-to-align reads
134 |   near to their well-aligned mates.
135 | - [Cufflinks][] ([doi](https://doi.org/10.1038/nprot.2012.016))
136 |   takes _tophat_ output and estimates existing and novel transcript
137 |   abundance.
138 |   [How Cufflinks Works](http://cufflinks.cbcb.umd.edu/howitworks.html)
139 | - [Cuffdiff][] assesses statistical significance of estimated
140 |   abundances between experimental groups
141 | 
142 | ### 5. Reduction to 'count tables'
143 | 
144 | - Use known gene model to count aligned reads overlapping regions of
145 |   interest / gene models
146 | - Gene model can be public (e.g., UCSC, NCBI, ENSEMBL) or _ad hoc_ (gff file)
147 | - `GenomicAlignments::summarizeOverlaps()`
148 | - [HTSeq](http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html),
149 |   [htseq-count](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html)
150 | 
151 | ### Step 6. Analysis
152 | 
153 | Summarization
154 | 
155 | - Counts _per se_, rather than a summary (RPKM, FPKM, ...), are
156 |   relevant for analysis
157 |   - For a given gene, larger counts imply more information; RPKM etc.,
158 |     treat all estimates as equally informative.
159 |   - Comparison is across samples at _each_ region of interest; all
160 |     samples have the same region of interest, so modulo library size
161 |     there is no need to correct for, e.g., gene length or mapability.
162 | 
163 | Normalization
164 | 
165 | - Libraries differ in size (total counted reads per sample) for
166 |   un-interesting reasons; we need to account for differences in
167 |   library size in statistical analysis.
168 | - Total number of counted reads per sample is _not_ a good estimate of
169 |   library size. It is un-necessarily influenced by regions with large
170 |   counts, and can introduce bias and correlation across
171 |   genes. Instead, use a robust measure of library size that takes
172 |   account of skew in the distribution of counts (simplest: trimmed
173 |   geometric mean; more advanced / appropriate encountered in the lab).
174 | - Library size (total number of counted reads) differs between
175 |   samples, and should be included _as a statistical offset_ in
176 |   analysis of differential expression, rather than 'dividing by' the
177 |   library size early in an analysis.
178 | 
179 | Appropriate error model
180 | 
181 | - Count data is _not_ distributed normally or as a Poisson process,
182 |   but rather as negative binomial. 
183 | - Result of combining Poisson ('shot' noise, i.e., within-sample
184 |   technical and sampling variation in read counts) with variation
185 |   between biological samples.
186 | - A negative binomial model requires estimation of an additional
187 |   parameter ('dispersion'), which is estimated poorly in small
188 |   samples.
189 | - Basic strategy is to moderate per-gene estimates with more robust
190 |   local estimates derived from genes with similar expression values (a
191 |   little more on borrowing information is provided below).
192 | 
193 | Pre-filtering
194 | 
195 | - Naively, a statistical test (e.g., t-test) could be applied to each
196 |   row of a counts table. However, we have relatively few samples
197 |   (10's) and very many comparisons (10,000's) so a naive approach is
198 |   likely to be very underpowered, resulting in a very high _false
199 |   discovery rate_
200 | - A simple approach is to perform fewer tests by removing regions that
201 |   could not possibly result in statistical significance, regardless of
202 |   hypothesis under consideration.
203 | - Example: a region with 0 counts in all samples could not possibly be
204 |   significant regardless of hypothesis, so exclude from further
205 |   analysis.
206 | - Basic approaches: 'K over A'-style filter -- require a minimum of A
207 |   (normalized) read counts in at least K samples. Variance filter,
208 |   e.g., IQR (inter-quartile range) provides a robust estimate of
209 |   variability; can be used to rank and discard least-varying regions.
210 | - More nuanced approaches: `r Biocpkg("edgeR")` vignette; work flow
211 |   today.
212 | 
213 | Borrowing information
214 | 
215 | - Why does low statistical power elevate false discovery rate?
216 | - One way of developing intuition is to recognize a t-test (for
217 |   example) as a ratio of variances. The numerator is
218 |   treatment-specific, but the denominator is a measure of overall
219 |   variability.
220 | - Variances are measured with uncertainty; over- or under-estimating
221 |   the denominator variance has an asymmetric effect on a t-statistic
222 |   or similar ratio, with an underestimate _inflating_ the statistic
223 |   more dramatically than an overestimate deflates the statistic. Hence
224 |   elevated false discovery rate.
225 | - Under the typical null hypothesis used in microarray or RNA-seq
226 |   experiments, each gene may respond differently to the treatment
227 |   (numerator variance) but the overall variability of a gene is
228 |   the same, at least for genes with similar average expression
229 | - The strategy is to estimate the denominator variance as the
230 |   within-group variance for the gene, _moderated_ by the average
231 |   within-group variance across all genes.
232 | - This strategy exploits the fact that the same experimental design
233 |   has been applied to all genes assayed, and is effective at
234 |   moderating false discovery rate.
235 | 
236 | ### Step 7. Comprehension
237 | 
238 | Placing differentially expressed regions in context
239 | 
240 | - Gene names associated with genomic ranges
241 | - Gene set enrichment and similar analysis
242 | - Proximity to regulatory marks
243 | - Integrate with other analyses, e.g., methylation, copy number,
244 |   variants, ...
245 |   
246 |   ![Copy number / expression QC](our_figures/copy_number_QC_2.png)
247 |   Correlation between genomic copy number and mRNA expression
248 |   identified 38 mis-labeled samples in the TCGA ovarian cancer
249 |   Affymetrix microarray dataset.
250 | 
251 | ## Lab
252 | 
253 | [The lab](B02.1.1_RNASeqLab.html) is based on a modified version of the
254 | RNA-seq work flow developed by Michael Love, Simon Anders, Wolfgang
255 | Huber.
256 | 
257 | 
258 | [Bowtie2]: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
259 | [tophat]: http://ccb.jhu.edu/software/tophat/index.shtml 
260 | [Cufflinks]: http://cufflinks.cbcb.umd.edu/
261 | [Cuffdiff]: http://cufflinks.cbcb.umd.edu/
262 | [RSEM]: http://deweylab.biostat.wisc.edu/rsem/
263 | [STAR]: https://github.com/alexdobin/STAR
264 | [GMAP/GSNAP]: http://research-pub.gene.com/gmap/
265 | 


--------------------------------------------------------------------------------
/vignettes/B02.2.3_CopyNumber.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{02.2.1 Copy Number}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | options(error=traceback)
  9 | library(LearnBioconductor)
 10 | set.seed(123L)
 11 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 12 | ```
 13 | 
 14 | ```{r style, echo = FALSE, results = 'asis'}
 15 | BiocStyle::markdown()
 16 | knitr::opts_chunk$set(tidy=FALSE)
 17 | ```
 18 | 
 19 | # Copy number work flow
 20 | 
 21 | Sonali Arora, Martin Morgan<br/>
 22 | October 28, 2014
 23 | 
 24 | Copy Number Variation (CNV's) refers to the duplication or deletion of
 25 | DNA segments larger than 1 kb. CNV's are structural variations in the
 26 | genome which range in length between 50 bp and 1 Mbp. They are
 27 | widespread among humans - on an average 12 CNVs exist per individual
 28 | in comparison to the reference genome. They have also been shown to
 29 | play a role in diseases such as autism, breast cancer, obesity,
 30 | Alzheimer’s disease and schizophrenia among other diseases.
 31 | 
 32 | ## Experimental design
 33 | 
 34 | Like any other genomic analysis, before we start a copy number
 35 | analysis, we need to consider experimental design. Here, we highlight
 36 | two specific pointers that one needs to keep in mind while designing a
 37 | copy number analysis.
 38 | 
 39 | **Tumor and normal samples** Are we planning to just find copy number
 40 | profile in individuals? For example, how does the copy number profile
 41 | for a region evolve over a certain period of time? (Here we are
 42 | comparing the copy number profile to a reference genome)
 43 | 
 44 | Are we planning to compare copy number profiles from tumor vs normal
 45 | profiles?  For example, we may be trying to find out if copy number
 46 | changes are responsible for a certain form of cancer and want to find
 47 | the exact genomic region against which a treatment can be developed.
 48 | 
 49 | There are different packages and different functions inside the same
 50 | package which handle CNV for tumor and normal samples or CNV in
 51 | samples.
 52 | 
 53 | **Germ line versus somatic CNV** Germ line CNV are relatively short
 54 | (a few bp to a few Mbp) copy number changes that the individual
 55 | inherits from one of the two parental gametes and thus are typically
 56 | present in 100% of cells.
 57 | 
 58 | Somatic CNV (often called CNA where A stands for alterations or aberration) are
 59 | copy number changes of any size and amount (from a few bases to whole 
 60 | chromosomes) that happen (and often carry on happening) in cancer cells. 
 61 | Cancer cells can be aneuploid (that means they are largely triploid, 
 62 | tetraploid or even haploid) and can have high focal amplifications 
 63 | (some regions could have many copies: it is not unusual to have 8-12 copies for 
 64 | some regions). Furthermore, because tumor samples are typically an admixture 
 65 | of normal and cancer cells, the tumor purity is unknown and variable. 
 66 | 
 67 | Different algorithms make different assumptions while handling somatic
 68 | or germ line CNV. Typically, germ line CNV callers can assume:
 69 | 
 70 | - The genome is largely diploid.
 71 | - The sample is pure and homogeneous.
 72 | - Any gain or loss should be 50% more or 50% less coverage.
 73 | 
 74 | For these reasons, the algorithms can focus more on associating
 75 | _p_-values for each call; it is possible to estimate false positive
 76 | and false negative rates.
 77 | 
 78 | Somatic CNA callers cannot make any of the assumptions above, or if
 79 | they do, they have limited scope.
 80 |   
 81 | ## Sequencing technology
 82 | 
 83 | Some key questions when thinking about sequencing technology to use
 84 | include:
 85 | 
 86 | +   What kind of sequencing data are we working with? 
 87 | 
 88 |     Is it array CGH data, SNP array or next generation sequencing
 89 |     data? For example, the `r Biocpkg("CGHbase")` detects CNV's in
 90 |     array CGH data whereas the `r Biocpkg("seqCNA")` among others
 91 |     detects CNV's in high-throughput sequencing data.
 92 |   
 93 | +   Amount of genome being sequenced - whole genome vs exome?
 94 | 
 95 |     Are we looking for copy number across the whole genome or are we
 96 |     looking for copy number only in exomes? Again different packages
 97 |     handle different kind of sequencing data. For example, The
 98 |     _Bioconductor_ package `r Biocpkg("exomeCopy")` detects CNV in
 99 |     exome sequencing data whereas the _Bioconductor_ package 
100 |     `r Biocpkg("cn.mops")` detects CNV in whole genome sequencing data.
101 |   
102 | +   Which platform was the sequencing done on?
103 | 
104 |     Many packages detect CNV in platform-specific data. 
105 |     For example, the `r Biocpkg("crlmm")` detects CNV's in Affymetrix SNP 5.0 
106 |     and 6.0 and Illumina arrays whereas the `r Biocpkg("CopyNumber450K")` 
107 |     detects them in Illumina 450k methylation microarrays. 
108 |   
109 | +   What is the coverage of sequenced data?
110 | 
111 |     Most packages work well with high coverage sequencing data, but
112 |     some packages are designed to work well with low coverage data. It
113 |     is best to recognize how coverage of our data will affect the
114 |     choice of the package we use for our analysis at an early stage.
115 |     
116 | ## Copy number analysis algorithm?
117 |     
118 | Since statistics plays a huge role in copy number analysis, we should
119 | also spend some time in thoroughly understanding the underlying
120 | algorithm of the _R_ package being used. A few questions to consider
121 | while choosing a package would be -
122 | 
123 | 1. How is our chosen package binning and counting reads?
124 | 
125 | 2. Is any pre-processing required from our end? Is it trimming aligned
126 |    reads internally?
127 | 
128 | 3. What segmentation algorithm is being used ? For example, does the
129 |    package use Circular Binary Segmentation, HMM based methods etc.
130 | 
131 | 4. How efficiently can it handle big data? Do I need additional
132 |    computational resources to run the analysis? Does the function run
133 |    in parallel?
134 |   
135 | ## Available resources in _Bioconductor_ 
136 | 
137 | _Bioconductor_ currently has about 41 packages for Copy Number
138 | Analysis.  To find these, one can visit the
139 | [biocViews](http://bioconductor.org/packages/devel/BiocViews.html#___Software)
140 | page and type "CopyNumberVariation" in the "Autocomplete biocViews
141 | search".
142 | 
143 | ## Workflow using _cn.mops_
144 | 
145 | Let's work through a small example to illustrate how straight-forward a
146 | copy number analysis can be once you've figured out all the
147 | logistics. We will also find the genes that lie within the detected
148 | copy number regions.
149 | 
150 | For this analysis, I chose the `r Biocpkg("cn.mops")` package as it
151 | helps us with
152 | 
153 | - Detecting germ-line CNVs
154 | - Works well with low coverage data 
155 | - Handles both single copy number analysis and tumor vs normal copy
156 |   number analysis
157 | - Uses parallel processing internally so we get fast computation
158 | - Handles whole genome sequencing data
159 | - Supports _GenomicRanges_ infrastructure which allows easy workflows
160 |   with other Bioconductor packages.
161 | 
162 | We start by downloading relevant files, if necessary
163 | 
164 | ```{r cvn-setup-1, echo=FALSE}
165 | destdir <- "~/bigdata"        # directory holding the downloaded BAM files
166 | if (!file.exists(destdir))    # create the directory only when it is missing
167 |     dir.create(destdir)
168 | ```
169 | 
170 | ```{r cnv-setup-2, message=FALSE}
171 | ## set path/to/download/directory, e.g.,
172 | ## destdir <- "~/bam/copynumber"
173 | stopifnot(file.exists(destdir))
174 | 
175 | bamFiles <- file.path(destdir, c("tumorA.chr4.bam", "normalA.chr4.bam"))
176 | urls <- paste0("http://s3.amazonaws.com/copy-number-analysis/",
177 |                basename(bamFiles))
178 | ## binary files: mode="wb" avoids corrupted downloads on Windows
179 | for (i in seq_along(bamFiles))
180 |     if (!file.exists(bamFiles[i]))
181 |         for (ext in c("", ".bai"))    # the BAM file, then its index
182 |             download.file(paste0(urls[i], ext), paste0(bamFiles[i], ext),
183 |                           mode="wb")
184 | ```
185 | 
186 | The main work flow 1) loads the library; 2) counts reads; 3)
187 | normalizes counts; 4) detects CNVs; and 5) visualizes results.
188 | 
189 | ```{r cnv-workflow, message=FALSE}
190 | ## 1. Load the cn.mops package
191 | suppressPackageStartupMessages({
192 |     library(cn.mops)
193 | })
194 | 
195 | ## 2. We can bin and count the reads
196 | reads_gr <- getReadCountsFromBAM(BAMFiles = bamFiles,
197 |     sampleNames = c("tumor", "normal"),
198 |     refSeqName = "chr4", WL = 10000, mode = "unpaired")
199 | 
200 | ## 3. Normalization
201 | ## We need a special normalization because the tumor has many large CNVs
202 | X <- normalizeGenome(reads_gr, normType="poisson")
203 | 
204 | ## 4. Detect cnv's
205 | ref_analysis <- referencecn.mops(X[,1], X[,2],
206 |      norm=0, 
207 |      I = c(0.025, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 8, 16, 32, 64), 
208 |      classes = paste0("CN", c(0:8, 16, 32, 64, 128)),
209 |      segAlgorithm="DNAcopy")
210 | resCNMOPS <- calcIntegerCopyNumbers(ref_analysis)
211 | 
212 | ## 5. Visualize the cnv's
213 | segplot(resCNMOPS)
214 | ```
215 | 
216 | Here the x-axis represents the genomic position and the y-axis
217 | represents the log ratio of read counts and copy number call of each
218 | segment (red)
219 | 
220 | ```{r cnv-regions}
221 | human_cn <- cnvr(resCNMOPS)    # CNV regions found by the analysis above
222 | human_cn
223 | ```
224 | 
225 | To find the genes that lie in these copy number regions, we will use
226 | the _TranscriptDb_ object for hg19
227 | 
228 | ```{r cnv-annotate-txdb, message=FALSE}
229 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
230 | ## subset to work with only chr4
231 | txdb <- keepSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene, "chr4")
232 | genes0 <- genes(txdb)
233 | ## 'unlist' so that each range is associated with a single gene identifier
234 | idx <- rep(seq_along(genes0), elementLengths(genes0$gene_id))
235 | genes <- granges(genes0)[idx]
236 | genes$gene_id <- unlist(genes0$gene_id)
237 | ```
238 | 
239 | Next we will use `findOverlaps()` to assign gene identifiers to cnv
240 | regions.
241 | 
242 | ```{r cnv-annotate}
243 | olaps <- findOverlaps(genes, human_cn, type="within")  # genes inside a region
244 | idx <- factor(subjectHits(olaps), levels=seq_len(subjectLength(olaps)))
245 | human_cn$gene_ids <- splitAsList(genes$gene_id[queryHits(olaps)], idx)
246 | human_cn
247 | ```
248 | 
249 | ## Session info
250 | 
251 | The packages and versions used in this work flow are as follows:
252 | 
253 | ```{r cvn-session-info}
254 | restoreSeqlevels(TxDb.Hsapiens.UCSC.hg19.knownGene)  # undo keepSeqlevels()
255 | sessionInfo()
256 | ```
257 | 


--------------------------------------------------------------------------------
/vignettes/B02.2_CommonWorkFlows.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{02.2 Other Sequence Analysis Work Flows}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()
 14 | knitr::opts_chunk$set(tidy=FALSE)
 15 | ```
 16 | 
 17 | # Common Sequence Analysis Work Flows
 18 | 
 19 | Martin Morgan, Sonali Arora<br/>
 20 | October 28, 2014
 21 | 
 22 | ## RNA-seq
 23 | 
 24 | See the [lecture notes](B02.1_RNASeq.html) and [lab](B02.1.1_RNASeqLab.html).
 25 | 
 26 | RNA-seq differential expression of known _genes_
 27 | 
 28 | - Simplest scenario
 29 | - Experimental design: simple, replicated; track covariates and be
 30 |   aware of batch effects
 31 | - Sequencing: moderate length and number of reads; single or
 32 |   paired-end (though probably paired-end).
 33 | - Alignment: basic splice-aware aligner, e.g., _Bowtie2_,
 34 |   _STAR_. Viable _Bioconductor_ approaches: `r Biocpkg("Rsubread")`,
 35 |   `r Biocpkg("Rbowtie")` (especially via the `r Biocpkg("QuasR")`
 36 |   package).
 37 | - Reduction: `GenomicAlignments::summarizeOverlaps()` or external tools,
 38 |   using gene model from `TxDb.*` package or GFF / GTF files. End
 39 |   result: matrix of counts.
 40 | - Analysis: `r Biocpkg("DESeq2")`, `r Biocpkg("edgeR")`, and
 41 |   additional software.
 42 |   
 43 | RNA-seq differential expression of known and novel _transcripts_
 44 | 
 45 | - Popular non-_R_ work flow: _Bowtie2_, _tophat_, _cufflinks_, _cuffdiff_.
 46 | - _Bioconductor_ options
 47 |     
 48 |     - `r Biocpkg("DEXSeq")`: differential _exon_ use.
 49 |     - `Rsubread::subjunc()` for aligning without requiring known gene models.
 50 |     - `r Biocpkg("cummeRbund")`: working with _cufflinks_ output.
 51 | 
 52 | Single-cell expression
 53 | 
 54 | - `r Biocpkg("monocle")`
 55 | 
 56 | ## ChIP-seq
 57 | 
 58 | See my recent
 59 | [slides](http://bioconductor.org/help/course-materials/2014/CSAMA2014/4_Thursday/lectures/ChIPSeq_slides.pdf)
 60 | outlining ChIP-seq and relevant _Bioconductor_ software.
 61 | 
 62 | - Experimental design / wet lab: important to effectively enrich
 63 |   genomic DNA via ChIP, otherwise hard to distinguish signal peaks from background
 64 | - Sequencing: moderate length and number of single-end reads very adequate.
 65 | - Alignment: Basic aligners sufficient
 66 | - Reduction
 67 | 
 68 |     - External software; many tools depending on application, e.g., _MACS_.
 69 |     - Product: BED and / or WIG files of called peaks
 70 | 
 71 | - Analysis & Comprehension
 72 | 
 73 |     - `r Biocpkg("ChIPQC")` for quality control.
 74 |     - `r Biocpkg("rtracklayer")` to input BED and WIG files to
 75 |       standard _Bioconductor_ data structures.
 76 |     - `r Biocpkg("ChIPpeakAnno")`, `r Biocpkg("ChIPXpress")` for
 77 |       annotating peaks in relation to genes.
 78 |     - `r Biocpkg("DiffBind")` to assess differential representation of
 79 |       peaks in a designed experiment.
 80 |     - `r Biocpkg("AnnotationHub")` for accessing (some)
 81 |       consortium-level summary data.
 82 | 
 83 | ## Copy Number
 84 | 
 85 | See the [Copy Number Workflow](./B02.2.3_CopyNumber.html) document.
 86 | 
 87 | ## Variants
 88 | 
 89 | See Michael Lawrence's variant calling with
 90 | [VariantTools](http://bioconductor.org/help/course-materials/2014/BioC2014/Lawrence_Tutorial.pdf)
 91 | and Val Obenchain's manipulation and annotation of called variants with
 92 | [VariantAnnotation](http://bioconductor.org/help/workflows/variants/).
 93 | 
 94 | - Sequencing: requires high-quality reads with high per-nucleotide
 95 |   depth of coverage -- longer, paired-end sequencing.
 96 | - Alignment: requires effective aligners; _BWA_, _GMAP_, ...
 97 | 
 98 |     - `r Biocpkg("gmapR")` wraps the GMAP aligner in _R_.
 99 |     
100 | - Reduction: typically to VCF files summarizing variants and / or
101 |   population-level variation. _GATK_ and other non-_R_ tools commonly
102 |   used.
103 |   
104 |     - `r Biocpkg("VariantTools")` includes facilities for calling
105 |       variants.
106 |     - `r Biocpkg("h5vc")` targets a different intermediate step:
107 |       summarize base counts at each position in the genome; use this
108 |       as a starting point for calling variants, and to evaluate false
109 |       positives, etc.
110 |       
111 | - Analysis & comprehension
112 | 
113 |     - `r Biocpkg("VariantAnnotation")`, `r Biocpkg("ensemblVEP")` for
114 |       querying / inputting VCF files, and for annotation of variants
115 |       ("is this a coding variant?", etc.).
116 |     - `r Biocpkg("SomaticSignatures")` for working with somatic
117 |       signatures of single-nucleotide variants.
118 | 
119 | ## Epigenomics
120 | 
121 | See the short
122 | [introduction](http://bioconductor.org/help/course-materials/2014/Epigenomics/MethylationArrays.html)
123 | and
124 | [lab](http://bioconductor.org/help/course-materials/2014/Epigenomics/MethylationArrays-lab.html)
125 | centered around Illumina 450k methylation arrays and the `r Biocpkg("minfi")` package.
126 | 
127 | - Analysis & comprehension: `r Biocpkg("bsseq")`, `r Biocpkg("BiSeq")`
128 |   for processing and analysis; `r Biocpkg("bumphunter")` as basic tool
129 |   for identifying CpG features.
130 | 
131 | ## Microbiome
132 | 
133 | - Experimental design: typically population-level surveys with
134 |   moderate numbers (10's-100's) of samples.
135 | - Wet lab & sequencing: often target phylogenetically-informative
136 |   genes, requiring longer (overlapping) paired-end reads. Many
137 |   existing studies used 454 technology, which has a different
138 |   sequencing error model than Illumina (e.g., homopolymers are a
139 |   common error, instead of trailing nucleotide quality deterioration).
140 | - Reduction: Pre-processing (e.g., knitting together overlapping
141 |   paired-end reads) and taxonomic classification / placement in
142 |   third-party software, e.g., _QIIME_, _pplacer_. End result: count
143 |   table summarizing representation of distinct taxa in each sample.
144 |   
145 |     - `r Biocpkg("rRDP")` provides an _R_ / _Bioconductor_ interface
146 |       to the RDP classifier.
147 | 
148 | - Analysis: _R_ / _Bioconductor_ and many insights from microarray /
149 |   RNA-seq analysis well suited to count table, but common pipelines
150 |   have re- or dis-invented the wheel.
151 |   
152 |     - `r Biocpkg("phyloseq")` provides very nice tools for general
153 |       analysis.
154 | 


--------------------------------------------------------------------------------
/vignettes/B02.3_MachineLearning.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{02.3 Machine Learning}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r style, echo = FALSE, results = 'asis'}
  8 | BiocStyle::markdown()
  9 | knitr::opts_chunk$set(tidy=FALSE)
 10 | ```
 11 | 
 12 | ```{r setup, echo=FALSE}
 13 | library(LearnBioconductor)
 14 | library(xtable)
 15 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 16 | ```
 17 | 
 18 | # Machine Learning 
 19 | 
 20 | Sonali Arora <br/>
 21 | 28 October, 2014
 22 | 
 23 | ## Introduction 
 24 | 
 25 | Statistical Learning plays an important role in Genomics. A few examples of 
 26 | learning problems would be - 
 27 | * Identify the risk factors (genes) for prostate cancer based on gene
 28 |   expression data
 29 | * Predict the chances of breast cancer survival in a patient. 
 30 | * Identify patterns of gene expression among different sub types of
 31 |   breast cancer
 32 | 
 33 | Typically, we have an outcome measurement, usually quantitative (such as gene
 34 | expression) or categorical (such as breast cancer /no breast cancer), that we 
 35 | wish to predict based on a set of features (such as diet and clinical 
 36 | measurements). We have a training set of data, in which we observe the outcome 
 37 | and feature measurements for a set of objects (such as people). With this data
 38 | we build a prediction model, or a learner, which will enable us to predict
 39 | the outcome for new unseen objects. A good learner is one that accurately
 40 | predicts such an outcome. This is called _supervised_ learning because the 
 41 | outcome variable guides the learning process. In the _unsupervised_ learning 
 42 | problem, we have no measurements of the outcome and observe only the features. 
 43 | Our task is to describe how the data are organized or clustered. 
 44 | 
 45 | ## Machine Learning with Bioconductor : Resources Overview 
 46 | 
 47 | Bioconductor houses a lot of R packages which provide machine learning tools for 
 48 | different kinds of Genomic Data analysis. It has had many releases since its 
 49 | inception. The latest cutting edge development happens in the "devel" version. 
 50 | One can find R packages specially catered to Machine Learning for Genomic 
 51 | Analysis by visiting the following link and using
 52 | [biocViews](http://www.bioconductor.org/packages/devel/BiocViews.html#___Software).
 53 | 
 54 | For example, one can type "Clustering" in the "Autocomplete biocViews search" 
 55 | box and find that there are 89 packages in Bioconductor which provide 
 56 | "Clustering" functionality for Genomic Data. On the same page, under 
 57 | "StatisticalMethod" one can find R packages for other machine learning 
 58 | techniques -
 59 | 
 60 | ```{r echo=FALSE, results='asis'}
 61 | biocView_df <- data.frame(technique = c("Bayesian", "Classification",
 62 |     "Clustering", "DecisionTree", "NeuralNetwork", "SupportVectorMachines",
 63 |     "DimensionReduction", "HiddenMarkovModel", "Regression", "PrincipalComponent"),
 64 |     packages = c(15L, 64L, 89L, 7L, 1L, 1L, 2L, 4L, 7L, 4L))  # packages per term
 65 | print(xtable(biocView_df), type = "html", comment = FALSE)
 66 | ```
 67 | 
 68 | ## Data for this Lab 
 69 | 
 70 | We will use two data sets in this lab - one for supervised learning and one for 
 71 | unsupervised learning. 
 72 | 
 73 | For **Unsupervised learning**, we will use the NCI60 cancer cell 
 74 | microarray data which contains 6830 gene expression measurements of 64 cancer 
 75 | cell  lines. We dropped the sub types which contain only 1 sample ( there were 
 76 | about 6 of them) and created a SummarizedExperiment object.
 77 | 
 78 | The *SummarizedExperiment* class is a matrix-like container where rows 
 79 | represent ranges of interest (as a GRanges or GRangesList-class) and columns 
 80 | represent samples (with sample data summarized as a DataFrame-class). A 
 81 | SummarizedExperiment contains one or more assays, each represented by a 
 82 | matrix-like object of numeric or other mode.
 83 | 
 84 | ![Summarized Experiment](our_figures/SummarizedExperiment.png)
 85 | 
 86 | This object has been made available to you, you can simply read it in using 
 87 | 
 88 | ```{r message=FALSE}
 89 | library(GenomicRanges)         # presumably provides assay() / colData() -- TODO confirm
 90 | sefile <- system.file("extdata", "NCI60.Rda", package="LearnBioconductor")
 91 | load(sefile)                   # expected to define the NCI60 object used below
 92 | nci60data <- t(assay(NCI60))   # transpose so that samples (cell lines) are rows
 93 | ncilabels <- colData(NCI60)    # per-sample annotation holding the cancer-type labels
 94 | ```
 95 | The gene expression data is stored in "assay" whereas the labels are stored in 
 96 | colData.
 97 | 
 98 | For **Supervised learning**, we will use cervical count data from the
 99 | Bioconductor package, `r Biocpkg("MLSeq")`. This data set contains
100 | expressions of 714 miRNA's of human samples. There are 29 tumor and 29
101 | non-tumor cervical samples. For learning purposes, we can treat these
102 | as two separate groups and run various classification algorithms.
103 | 
104 | ```{r message=FALSE}
105 | library(MLSeq)
106 | filepath <- system.file("extdata", "cervical.txt", package = "MLSeq")
107 | cervical <- read.table(filepath, header = TRUE)  # miRNA counts, one column per sample
108 | 
109 | ```
110 | 
111 | ### Exercise 
112 | 
113 | Download the NCI60 [data](http://statweb.stanford.edu/~tibs/ElemStatLearn/)
114 | from the given url and create a SummarizedExperiment
115 | object filtering out subtypes which contain only 1 sample. 
116 | 
117 | ## Unsupervised Learning
118 | 
119 | Unsupervised Learning is a set of statistical tools intended for the setting
120 | in which we have only a set of features $X_1$, $X_2$, ....,$X_p$ measured 
121 | on 'n' observations. We are primarily interested in discovering interesting 
122 | things on the measurement $X_1$, $X_2$, ....,$X_p$
123 | 
124 | Unsupervised Learning is often performed as a part of Exploratory Data Analysis. 
125 | These tools help us to get a good idea about the data set. Unlike a supervised
126 | learning problem, where we can use prediction to gain some confidence about our
127 | learning algorithm, there is no way to check our model. The learning algorithm
128 | is thus, aptly named "unsupervised".
129 | 
130 | Nevertheless, it offers great insights for our given problem. For Example, in 
131 | case of our NCI60 data set, we may want to look for subgroups among our cancer 
132 | samples or among genes to better understand how the disease works.
133 | 
134 | ### PCA 
135 | 
136 | One of the classic things that we are told as soon as we get our hands on data is to 
137 | make a scatter plot and visualize the data. With big data, this is not always 
138 | very easy to do. For example, in our NCI60 data set we would have about 
139 | ( 6830 C 2) = 6830(6830-1)/2= 23321035 scatter plots ! It is certainly not 
140 | possible to look at all of them and since they each contain only a small amount
141 | of the total information present in the  data set, none of them will be 
142 | informative. Thus, we need a low dimensional representation of data which 
143 | captures as much variation as possible.
144 | 
145 | *Principal components* allow us to summarize the data with a smaller number of 
146 | variables that explain the variability in the data set. The process by which 
147 | these principal components are computed and the usage of these components in 
148 | understanding the data is known as *Principal Component Analysis*.  
149 | 
150 | Let us use PCA to visualize the  NCI60 data set. We first perform PCA on the 
151 | data after scaling the variables (genes) to have standard deviation one and 
152 | plot the first few principal components in order to visualize the data. As you 
153 | can see from the following figure, with big data, biplot() is not very 
154 | informative.
155 | 
156 | ```{r}
157 | pcaRes <- prcomp(nci60data, scale. = TRUE)  # scale each gene to unit variance
158 | biplot(pcaRes)
159 | ```
160 | So lets look at the first 3 components and see if we can gain some interesting 
161 | insights from them.
162 | 
163 | ```{r fig.width=12}
164 | # flatten the sample annotation into a character vector of cancer-type labels
165 | labs <- as.character(unlist(as.list(ncilabels)))
166 | 
167 | cellColor <- function(vec)
168 | {   # Map labels to rainbow colors; returns per-element colors, palette and labels.
169 |     grp <- factor(vec)  # `cols` is indexed by sorted level, so labels must be too
170 |     cols <- rainbow(nlevels(grp))
171 |     colvec <- cols[as.integer(grp)]
172 |     list(colvec=colvec, cols=cols, labels=levels(grp))  # labels[i] pairs with cols[i]
173 | }
174 | 
175 | colres <- cellColor(labs)          # colors for the cancer-type labels
176 | par(mfrow = c(1, 2))               # two panels side by side
177 | # left panel: first vs second principal component
178 | plot(pcaRes$x[, c(1, 2)], pch = 19, col = colres$colvec, xlab = "z1", ylab = "z2")
179 | legend("bottomright", bty = "n", cex = 0.80,
180 |        legend = colres$labels, text.col = colres$cols)
181 | # right panel: first vs third principal component
182 | plot(pcaRes$x[, c(1, 3)], pch = 19, col = colres$colvec, xlab = "z1", ylab = "z3")
183 | legend("topright", bty = "n", cex = 0.80,
184 |        legend = colres$labels, text.col = colres$cols)
185 | 
186 | par(mfrow = c(1, 1))               # back to a single panel
187 | 
188 | ```
189 | 
190 | Overall, we can conclude that the cell lines corresponding to a single cancer 
191 | type tend to have similar values on the first few principal component score 
192 | vectors. This indicates that cell lines from the same cancer type tend to have
193 | similar gene expression levels. 
194 | 
195 | ### Clustering observations
196 | 
197 | In hierarchical clustering, we start by defining some dissimilarity measure 
198 | between each pair of observations (like Euclidean distance). We start at the 
199 | bottom of the dendrogram, each of the n observations are considered as a 
200 | cluster, the two clusters that are most similar to each other are fused together
201 | so now we have n-1 clusters. Next the two clusters that are most similar are
202 | fused so that there are n-2 clusters. This algorithm proceeds iteratively until 
203 | all samples belong to one single cluster and the dendrogram is complete. 
204 | 
205 | We define the dissimilarity between two clusters or two groups of observations
206 | using Linkage. There are four common types of linkage - "average", "complete",
207 | "single", "centroid". Assuming we have two clusters A and B, then - 
208 | 
209 | 1. _Complete_ refers to recording the *largest* of all pairwise dissimilarities 
210 | between observations in cluster A and observations in Cluster B. 
211 | 2. _Single_ refers to recording the *smallest* of all pairwise dissimilarities 
212 | between observations in cluster A and observations in Cluster B.
213 | 3. _Average_ refers to recording the *average* of all pairwise dissimilarities 
214 | between observations in cluster A and observations in Cluster B.
215 | 4. _Centroid_ refers to the dissimilarity between the centroid for cluster A and 
216 | the centroid for cluster B. 
217 | 
218 | Usually, the dissimilarity measure is the Euclidean Distance. 
219 | 
220 | Let's cluster the observations using complete linkage, with Euclidean distance 
221 | as the dissimilarity measure.
222 | 
223 | ```{r fig.width=12 , message=FALSE}
224 | library(dendextend)
225 | # dist() defaults to Euclidean distance and hclust() to complete linkage
226 | sdata <- scale(nci60data)
227 | d <- dist(sdata)
228 | labs <- as.character(unlist(as.list(ncilabels)))
229 | comp_clust <- hclust(d)
230 | dend <- as.dendrogram(comp_clust)
231 | leaves <- labs[order.dendrogram(dend)]   # labels arranged in dendrogram leaf order
232 | labels_colors(dend, labels=TRUE) <- cellColor(leaves)$colvec
233 | labels(dend) <- leaves                   # relabel the leaves with the cancer types
234 | plot(dend, main ="Clustering using Complete Linkage")
235 | ```
236 | 
237 | ### Exercise 
238 | 
239 | 1. Perform hierarchical clustering using average and single linkage. 
240 | 2. Interpret the difference in the dendrograms. 
241 | 3. Can you observe some patterns from these dendrograms? (hint: use cutree)
242 | 
243 | **Solutions:**
244 | 
245 | 1. The plots can be made with the following code -  
246 | ```{r fig.width=12, fig.height=6}
247 | plot(hclust(d, method="average"), labels= labs,   # average-linkage dendrogram
248 |      main ="Clustering using Average Linkage" , xlab="", ylab="" )
249 | plot(hclust(d, method="single"), labels= labs,    # single-linkage dendrogram
250 |      main ="Clustering using Single Linkage" , xlab="", ylab="" )
251 | ```
252 |   
253 |   
254 | 2. Briefly, complete linkage provides maximal inter cluster dissimilarity, single
255 | linkage results in minimal inter cluster dissimilarity and average results in 
256 | mean inter cluster dissimilarity. We see that the results are affected by the 
257 | choice of the linkage. Single linkage tends to yield trailing clusters while 
258 | complete and average linkage lead to more balanced and attractive clusters.  
259 | 
260 | 3. For our example, we see that the cell lines within a single cancer cell type do
261 | not cluster together. But if we consider complete linkage and cut the tree into
262 | 4 clusters (we tried different numbers to observe patterns) we observe some clear
263 | patterns like the leukemia cell lines fall in cluster 2 and the breast cancer
264 | cell lines spread out. 
265 | ```{r}
266 | hc <- cutree(comp_clust, k = 4)   # split into 4 clusters (k), not at a height (h)
267 | table(hc, labs)
268 | ```
269 | 
270 | ## Supervised Learning 
271 | 
272 | In supervised learning, along with the features $X_1$, $X_2$, ....,$X_p$, we 
273 | also have a response Y measured on the same n observations. The goal is then
274 | to predict Y using $X_1$, $X_2$, ....,$X_p$ for new observations. 
275 | 
276 | ### A simple example using knn
277 | 
278 | For the cervical data, we know that the first 29 are non-Tumor samples whereas
279 | the last 29 are Tumor samples. We will code these as 0 and 1 respectively. 
280 | 
281 | ```{r}
282 | class <- data.frame(condition = factor(rep(0:1, each = 29)))  # 0 = non-tumor, 1 = tumor
283 | ```
284 | 
285 | Lets look at one of the most basic supervised learning techniques 
286 | **k-Nearest Neighbor** and see what all goes into building a simple model with 
287 | it. For the sake of simplicity, we will use only 2 predictors (so that we can 
288 | represent the data in 2 dimensional space)
289 | 
290 | ```{r}
291 | # samples as rows; keep only the first two columns as predictors
292 | data <- t(cervical)[, 1:2]
293 | df <- cbind(data, class)
294 | names(df) <- c("x1", "x2", "y")
295 | rownames(df) <- NULL
296 | head(df)
297 | ```
298 | 
299 | This is how the data looks - 
300 | ```{r fig.width=12}
301 | plot(df[["x1"]], df[["x2"]], xlab = "x1", ylab = "x2",
302 |      col = ifelse(df[["y"]] == "1", "red", "blue"),   # red = class 1, blue = class 0
303 |      main = "data representation for knn")
304 | ```
305 | 
306 | Given an observation $x_0$ and a positive integer, K, the KNN classifier first 
307 | identifies K points in the training data that are closest to $x_0$, represented
308 | by $N_0$. It estimates the conditional probability for class j as a fraction of
309 | $N_0$ and applies Bayes rule to classify the test observation to the class
310 | with the largest probability. 
311 | More concretely, if k=3 and there are 2 observations belonging to class 1 and 1
312 | observation belonging to class 2, then the test observation is assigned to
313 | class 1. 
314 | 
315 | ![knn_figure](our_figures/knn.png)
316 | 
317 | For all supervised experiments it's a good idea to hold out some data as 
318 | _Training Data_ and build a model with this data. We can then test the  built 
319 | model using the left over data (_Test Data_) to gain confidence in our model. 
320 | We will randomly sample 20% of our data and use that as a test set. The 
321 | remaining 80% of the data will be used as training data.
322 | 
323 | ```{r }
324 | set.seed(9)                      # fixed seed so the split is reproducible
325 | nTest <- ceiling(0.2 * ncol(cervical))
326 | ind <- sample(ncol(cervical), size = nTest, replace = FALSE)
327 | 
328 | # training set: every sample (column) not drawn above; counts shifted by +1
329 | cervical.train <- as.matrix(cervical[, -ind] + 1)
330 | classtr <- data.frame(condition = class[-ind, ])
331 | 
332 | # test set: the sampled columns, shifted the same way
333 | cervical.test <- as.matrix(cervical[, ind] + 1)
334 | classts <- data.frame(condition = class[ind, ])
335 | ```
336 | 
337 | Training set error is the proportion of mistakes made if we apply our model to 
338 | the training data and Test set error is the proportion of mistakes made when 
339 | we apply our model to test data. 
340 | 
341 | For different neighbors , let us calculate the training error and test error 
342 | using KNN. 
343 | 
344 | ```{r message=FALSE}
345 | library(class)
346 | 
347 | newknn <- function( testset, trainset, testclass, trainclass, k)
348 | {
349 |     # Error rates of a k-nearest-neighbour classifier: leave-one-out error
350 |     # on the training set (knn.cv) and error on the held-out test set (knn).
351 |     errRate <- function(pred, truth)
352 |         sum(!mapply(identical, as.character(pred), truth)) / length(truth)
353 | 
354 |     cvPred <- knn.cv(trainset, trainclass, k=k)
355 |     testPred <- knn(trainset, testset, trainclass, k=k)
356 | 
357 |     c(train_fit=errRate(cvPred, trainclass),
358 |       test_fit=errRate(testPred, testclass))
359 | }
360 | 
361 | # knn() expects samples in rows, so transpose the count matrices and labels
362 | trainset <- t(cervical.train); testset <- t(cervical.test)
363 | trainclass <- t(classtr); testclass <- t(classts)
364 | 
365 | klist <- 1:15                       # neighbourhood sizes to try
366 | ans <- lapply(klist, function(nn)
367 |     newknn(testset, trainset, testclass, trainclass, k = nn))
368 | 
369 | resdf <- do.call(rbind, ans)        # one row per k: train_fit, test_fit
370 | rownames(resdf) <- NULL
371 | plot(klist, resdf[, "train_fit"], type = "b", col = "blue", ylim = range(resdf),
372 |     xlab = "No of neighbors", ylab = "Training and Test Error",
373 |     main = "k Nearest Neighbors for Cervical Data")
374 | points(klist, resdf[, "test_fit"], type = "b", col = "red")
375 | legend("bottomright", bty = "n", text.col = c("blue", "red"),
376 |     legend = c("Training error", "Test error"))
377 | 
378 | ```
379 | 
380 | Another important concept in machine learning is **cross validation**. Since 
381 | samples are often scarce, it is often not possible to set aside a validation set
382 | and then use it to assess the performance of our prediction model. So we use 
383 | cross validation to train a better model. We start by dividing the training data
384 | randomly into n equal parts. The learning method is fit to n-1 parts of the 
385 | data, and the prediction error is computed on the remaining part. This is done 
386 | in turn for each 1/n parts of the data, and finally the n prediction error 
387 | estimates are averaged.
388 | 
389 | For example, K-fold cross validation where K=5
390 | 
391 | ![cv_figure](our_figures/cross_validation.png)
392 | 
393 | As you can see, computation for this very simple learner can become quite 
394 | complicated.
395 | 
396 | ### Fast classification using Bioconductor. 
397 | 
398 | MLSeq aims to make computation less complicated for a user and
399 | allows one to learn a model using various classifiers with one single function. 
400 | 
401 | The main function of this package is classify which requires data in the form of 
402 | a DESeqDataSet instance. The DESeqDataSet is a subclass of SummarizedExperiment,
403 | used to store the input values, intermediate calculations and results of an 
404 | analysis of differential expression.
405 | 
406 | So lets create DESeqDataSet object for both the training and test set, and run 
407 | DESeq on it. 
408 | 
409 | ```{r}
410 | # training set: wrap counts and class labels, then run DESeq on the result
411 | cervical.trainS4 <- DESeq(DESeqDataSetFromMatrix(countData = cervical.train,
412 |         colData = classtr, design = ~condition), fitType = "local")
413 | 
414 | # test set: same construction
415 | cervical.testS4 <- DESeq(DESeqDataSetFromMatrix(countData = cervical.test,
416 |         colData = classts, design = ~condition), fitType = "local")
417 | 
418 | ```
419 | Classify using Support Vector Machines. 
420 | 
421 | ```{r}
422 | svm <- classify(data = cervical.trainS4, method = "svm", normalize = "deseq",
423 |     deseqTransform = "vst", cv = 5, rpt = 3, ref = "1")  # presumably 5 folds x 3 repeats
424 | svm
425 | ```
426 | 
427 | It returns an object of class 'MLSeq' and we observe that it successfully
428 | fitted a model with 97.8% accuracy. We can access the slots of this S4 object by
429 | ```{r}
430 | getSlots("MLSeq")   # slot names and their classes for the MLSeq class
431 | ```
432 | And also, ask about the model trained. 
433 | 
434 | ```{r}
435 | trained(svm)        # details of the model trained by classify()
436 | ```
437 | 
438 | We can predict the class labels of our test data using `predictClassify()`
439 | 
440 | ```{r}
441 | pred.svm <- predictClassify(svm, cervical.testS4)
442 | table(pred.svm, relevel(cervical.testS4$condition, ref = 2))  # level 2 listed first
443 | ```
444 | 
445 | The other classification methods available are 'randomforest', 'cart' and 
446 | 'bagsvm'.
447 | 
448 | ### Exercise:
449 | 
450 | Train the same training data and test data using randomForest.
451 | 
452 | **Solutions:**
453 | 
454 | ```{r}
455 | rf <- classify(data = cervical.trainS4, method = "randomforest",
456 |         normalize = "deseq", deseqTransform = "vst", cv = 5, rpt = 3, ref = "1")
457 | trained(rf)
458 | pred.rf <- predictClassify(rf, cervical.testS4)
459 | table(pred.rf, relevel(cervical.testS4$condition, ref = 2))  # level 2 listed first
460 | ```
461 | 
462 | ## SessionInfo
463 | 
464 | ```{r}
465 | sessionInfo()   # record the R and package versions used to build this document
466 | ```
467 | 
468 | 
469 | 


--------------------------------------------------------------------------------
/vignettes/B02.4_Annotation.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{02.4 Annotating Genes, Genomes, and Variants}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")  # abort unless Bioconductor 3.0
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()  # emit BiocStyle styling (chunk uses results='asis')
 14 | ```
 15 | 
 16 | # Annotating Genes, Genomes, and Variants
 17 | 
 18 | Martin Morgan<br/>
 19 | October 28, 2014
 20 | 
 21 | ```{r pkgs, eval=TRUE, echo=FALSE, warning=FALSE, message=FALSE}
 22 | options(max.print=1000)            # limit how much is printed for large objects
 23 | suppressPackageStartupMessages({   # attach the packages without startup chatter
 24 |     library(org.Hs.eg.db)
 25 |     library(TxDb.Hsapiens.UCSC.hg19.knownGene)
 26 |     library(BSgenome.Hsapiens.UCSC.hg19)
 27 |     library(GenomicRanges)
 28 |     library(biomaRt)
 29 |     library(rtracklayer)
 30 | })
 31 | ```
 32 | 
 33 | ## Gene annotation
 34 | 
 35 | ### Data packages
 36 | 
 37 | Organism-level ('org') packages contain mappings between a central
 38 | identifier (e.g., Entrez gene ids) and other identifiers (e.g. GenBank
 39 | or Uniprot accession number, RefSeq id, etc.).  The name of an org
 40 | package is always of the form `org.<Sp>.<id>.db`
 41 | (e.g. [org.Sc.sgd.db][]) where `<Sp>` is a 2-letter abbreviation of
 42 | the organism (e.g. `Sc` for *Saccharomyces cerevisiae*) and `<id>` is
 43 | an abbreviation (in lower-case) describing the type of central
 44 | identifier (e.g. `sgd` for gene identifiers assigned by the
 45 | *Saccharomyces* Genome Database, or `eg` for Entrez gene ids).  The
 46 | "How to use the '.db' annotation packages" vignette in the
 47 | [AnnotationDbi][] package (org packages are only one type of ".db"
 48 | annotation packages) is a key reference.  The '.db' and most other
 49 | Bioconductor annotation packages are updated every 6 months.
 50 | 
 51 | Annotation packages usually contain an object named after the package
 52 | itself.  These objects are collectively called `AnnotationDb` objects,
 53 | with more specific classes named `OrgDb`, `ChipDb` or `TranscriptDb`
 54 | objects.  Methods that can be applied to these objects include
 55 | `columns()`, `keys()`, `keytypes()` and `select()`.  Common operations
 56 | for retrieving annotations are summarized in the table.
 57 | 
 58 | | Category   | Function                              | Description                                                      |
 59 | |------------|---------------------------------------|------------------------------------------------------------------|
 60 | | Discover   | `columns()`                           | List the kinds of columns that can be returned                   |
 61 | |            | `keytypes()`                          | List columns that can be used as keys                            |
 62 | |            | `keys()`                              | List values that can be expected for a given keytype             |
 63 | |            | `select()`                            | Retrieve annotations matching `keys`, `keytype` and `columns`    |
 64 | | Manipulate | `setdiff()`, `union()`, `intersect()` | Operations on sets                                               |
 65 | |            | `duplicated()`, `unique()`            | Mark or remove duplicates                                        |
 66 | |            | `%in%`,  `match()`        | Find matches                                                     |
 67 | |            | `any()`, `all()`                      | Are any `TRUE`?  Are all?                                        |
 68 | |            | `merge()`                             | Combine two different `data.frame`s based on shared keys         |
 69 | | `GRanges*` | `transcripts()`, `exons()`, `cds()`   | Features (transcripts, exons, coding sequence) as `GRanges`.     |
 70 | |            | `transcriptsBy()` , `exonsBy()`       | Features group by  gene, transcript, etc., as `GRangesList`.     |
 71 | |            | `cdsBy()`                             |                                                                  |
 72 | 
 73 | **Exercise**: This exercise illustrates basic use of the `select()`
 74 | interface to annotation packages.
 75 | 
 76 | 1. What is the name of the org package for *Homo sapiens*?  Load it.
 77 |    Display the `OrgDb` object for the [org.Hs.eg.db][] package.  Use
 78 |    the `columns()` method to discover which sorts of annotations can
 79 |    be extracted from it.
 80 | 2. Use the `keys()` method to extract ENSEMBL identifiers and then
 81 |    pass those keys in to the `select()` method in such a way that you
 82 |    extract the SYMBOL (gene symbol) and GENENAME information for
 83 |    each. Use the following ENSEMBL ids.
 84 | 
 85 | ```{r select-setup}
 86 | ensids <- c("ENSG00000130720", "ENSG00000103257", "ENSG00000156414", 
 87 |             "ENSG00000144644", "ENSG00000159307", "ENSG00000144485")  # ENSEMBL ids for the exercise
 88 | ```
 89 | 
 90 | **Solution** The `OrgDb` object is named `org.Hs.eg.db`.
 91 | ```{r select}
 92 | library(org.Hs.eg.db)
 93 | keytypes(org.Hs.eg.db)             # columns that can be used as keys
 94 | columns(org.Hs.eg.db)              # kinds of columns that can be returned
 95 | cols <- c("SYMBOL", "GENENAME")    # annotations to retrieve for each id
 96 | select(org.Hs.eg.db, keys=ensids, columns=cols, keytype="ENSEMBL")
 97 | ```
 98 | 
 99 | ### Internet resources
100 | 
101 | A short summary of select Bioconductor packages enabling web-based
102 | queries is in following Table.
103 | 
104 | | Package                                             | Description                               |
105 | |-----------------------------------------------------|-------------------------------------------|
106 | | [AnnotationHub][]                                   | Ensembl, Encode, dbSNP, UCSC data objects |
107 | | [biomaRt](http://biomart.org)                       | Ensembl and other annotations             |
108 | | [PSICQUIC](https://code.google.com/p/psicquic)      | Protein interactions                      |
109 | | [uniprot.ws](http://uniprot.org)                    | Protein annotations                       |
110 | | [KEGGREST](http://www.genome.jp/kegg)               | KEGG pathways                             |
111 | | [SRAdb](http://www.ncbi.nlm.nih.gov/sra)            | Sequencing experiments.                   |
112 | | [rtracklayer](http://genome.ucsc.edu)               | genome tracks.                            |
113 | | [GEOquery](http://www.ncbi.nlm.nih.gov/geo/)        | Array and other data                      |
114 | | [ArrayExpress](http://www.ebi.ac.uk/arrayexpress/)  | Array and other data                      |
115 | 
116 | *Using biomaRt*
117 | 
118 | The [biomaRt][] package offers access to the online
119 | [biomart](http://www.biomart.org) resource. This consists of several
120 | data base resources, referred to as 'marts'.  Each mart allows access
121 | to multiple data sets; the [biomaRt][] package provides methods for
122 | mart and data set discovery, and a standard method `getBM()` to
123 | retrieve data.
124 | 
125 | *Exercise*
126 | 
127 | 1. Load the [biomaRt][] package and list the available marts.  Choose
128 |    the *ensembl* mart and list the datasets for that mart.  Set up a
129 |    mart to use the *ensembl* mart and the *hsapiens gene ensembl*
130 |    dataset.
131 | 2. A [biomaRt][] dataset can be accessed via `getBM()`. In addition to
132 |    the mart to be accessed, this function takes filters and attributes
133 |    as arguments.  Use `listFilters()`, `filterOptions()` and
134 |    `listAttributes()` to discover values for these arguments.  Call
135 |    `getBM()` using filters and attributes of your choosing.
136 | 
137 | *Solution*
138 | ```{r biomaRt1, eval=FALSE, results="hide"}
139 | ## NEEDS INTERNET ACCESS !! (chunk is marked eval=FALSE, so it is not run)
140 | library(biomaRt)
141 | head(listMarts(), 3)                      ## first 3 available marts
142 | head(listDatasets(useMart("ensembl")), 3) ## first 3 datasets in the ensembl mart
143 | ensembl <-                                ## fully specified mart
144 |     useMart("ensembl", dataset = "hsapiens_gene_ensembl")
145 | 
146 | head(listFilters(ensembl), 3)             ## first 3 available filters
147 | myFilter <- "chromosome_name"
148 | substr(filterOptions(myFilter, ensembl), 1, 50) ## first 50 chars of the filter's options
149 | myValues <- c("21", "22")
150 | head(listAttributes(ensembl), 3)          ## first 3 available attributes
151 | myAttributes <- c("ensembl_gene_id","chromosome_name")
152 | 
153 | ## assemble and query the mart: gene ids filtered to chromosome names 21 and 22
154 | res <- getBM(attributes =  myAttributes, filters =  myFilter,
155 |              values =  myValues, mart = ensembl)
156 | ```
157 | 
158 | *Exercise*
159 | 
160 | As an optional exercise, annotate the genes that are differentially
161 | expressed in the DESeq2 laboratory, e.g., find the `GENENAME`
162 | associated with the five most differentially expressed genes. Do these
163 | make biological sense? Can you `merge()` the annotation results with
164 | the 'top table' results to provide a statistically and biologically
165 | informative summary?
166 | 
167 | ## Genome annotation
168 | 
169 | There are a diversity of packages and classes available for
170 | representing large genomes. Several include:
171 | 
172 | - 'TxDb.*' For transcript and other genome / coordinate annotation.
173 | - [BSgenome][] For whole-genome representation. See
174 |   `available.genomes()` for pre-packaged genomes, and the vignette
175 |   'How to forge a BSgenome data package' in the [BSgenome][] package.
176 | - [Homo.sapiens][] For integrating 'TxDb*' and 'org.*' packages.
177 | - 'SNPlocs.*' For model organism SNP locations derived from dbSNP.
178 | - `FaFile()` ([Rsamtools][]) for accessing indexed FASTA files.
179 | - 'SIFT.*', 'PolyPhen', 'ensemblVEP' Variant effect scores.
180 | 
181 | ### Transcript annotation packages
182 | 
183 | Genome-centric packages are very useful for annotations involving
184 | genomic coordinates. It is straight-forward, for instance, to discover
185 | the coordinates of coding sequences in regions of interest, and from
186 | these retrieve corresponding DNA or protein coding sequences. Other
187 | examples of the types of operations that are easy to perform with
188 | genome-centric annotations include defining regions of interest for
189 | counting aligned reads in RNA-seq experiments and retrieving DNA
190 | sequences underlying regions of interest in ChIP-seq analysis, e.g.,
191 | for motif characterization.
192 | 
193 | *Exercise*
194 | 
195 | This exercise uses annotation resources to go from a gene symbol
196 | 'BRCA1' through to the genomic coordinates of each transcript
197 | associated with the gene, and finally to the DNA sequences of the
198 | transcripts.
199 | 
200 | 1. Use the [org.Hs.eg.db][] package to map from the gene symbol
201 |    'BRCA1' to its Entrez identifier. Do this using the `select`
202 |    command.
203 | 2. Use the [TxDb.Hsapiens.UCSC.hg19.knownGene][] package to retrieve
204 |    the transcript names (`TXNAME`) corresponding to the BRCA1 Entrez
205 |    identifier. (The 'org\*' packages are based on information from
206 |    NCBI, where Entrez identifiers are labeled ENTREZID; the 'TxDb*'
207 |    package we are using is from UCSC, where Entrez identifiers are
208 |    labeled GENEID).
209 | 3. Use the `cdsBy()` function to retrieve the genomic coordinates of
210 |    all coding sequences grouped by transcript, and select the
211 |    transcripts corresponding to the identifiers we're interested
212 |    in. The coding sequences are returned as a `GRangesList`, where
213 |    each element of the list is a `GRanges` object representing the
214 |    exons in the coding sequence. As a sanity check, ensure that the
215 |    sum of the widths of the exons in each coding sequence is evenly
216 |    divisible by 3 (the R 'modulus' operator `%%` returns the remainder
217 |    of the division of one number by another, and might be helpful in
218 |    this case).
219 | 
220 | 4. Visualize the transcripts in genomic coordinates using the [Gviz][]
221 |    package to construct an `AnnotationTrack`, and plotting it using
222 |    `plotTracks()`.
223 | 
224 | 5. Use the [BSgenome.Hsapiens.UCSC.hg19][] package and
225 |    `extractTranscriptSeqs()` function to extract the DNA sequence of
226 |    each transcript.
227 | 
228 | 
229 | *Solution*
230 | 
231 | Retrieve the Entrez identifier corresponding to the BRCA1 gene symbol
232 | 
233 | ```{r symbol-to-entrez}
234 | library(org.Hs.eg.db)
235 | eid <- select(org.Hs.eg.db, "BRCA1", "ENTREZID", "SYMBOL")[["ENTREZID"]]
236 | ```
237 | 
238 | Map from Entrez gene identifier to transcript name
239 | 
240 | ```{r entrez-to-tx, message=FALSE}
241 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
242 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene   # shorter alias for the TxDb object
243 | txid <- select(txdb, eid, "TXNAME", "GENEID")[["TXNAME"]]  # Entrez ids are GENEID here
244 | ```
245 | 
246 | Retrieve all coding sequences grouped by transcript, and select those
247 | matching the transcript ids of interest, verifying that each coding
248 | sequence width is a multiple of 3
249 | 
250 | ```{r tx-to-cds-coords}
251 | cds <- cdsBy(txdb, by="tx", use.names=TRUE)
252 | brca1cds <- cds[names(cds) %in% txid]
253 | class(brca1cds)
254 | length(brca1cds)
255 | brca1cds[[1]]                           # exons in cds
256 | cdswidth <- width(brca1cds)             # width of each exon
257 | all((sum(cdswidth) %% 3) == 0)          # sum within cds, modulus 3
258 | ```
259 | 
260 | Visualize the BRCA1 transcripts using [Gviz][] (this package has an
261 | excellent vignette, `vignette("Gviz")`)
262 | 
263 | ```{r Gviz, message=FALSE}
264 | library(Gviz)                               # fail immediately if Gviz is not installed
265 | anno <- AnnotationTrack(brca1cds)           # one track holding the BRCA1 coding sequences
266 | plotTracks(list(GenomeAxisTrack(), anno))   # genome axis above the annotation track
267 | ```
268 | 
269 | Extract the coding sequences of each transcript
270 | 
271 | ```{r cds-to-seq}
272 | library(BSgenome.Hsapiens.UCSC.hg19)
273 | genome <- BSgenome.Hsapiens.UCSC.hg19
274 | tx_seq <- extractTranscriptSeqs(genome, brca1cds)
275 | tx_seq
276 | ```
277 | 
278 | Intron coordinates can be identified by first calculating the range of
279 | the genome (from the start of the first exon to the end of the last
280 | exon) covered by each transcript, and then taking the (algebraic) set
281 | difference between this and the genomic coordinates covered by each
282 | exon
283 | 
284 | ```{r introns}
285 | introns <- psetdiff(range(brca1cds), brca1cds)
286 | ```
287 | 
288 | Retrieve the intronic sequences with `getSeq()` (these are *not*
289 | assembled, the way that `extractTranscriptSeqs()` assembles exon
290 | sequences into mature transcripts); note that introns start and end
291 | with the appropriate donor and acceptor site sequences, respectively.
292 | 
293 | ```{r intron-seqs}
294 | seq <- getSeq(genome, introns)
295 | names(seq)
296 | seq[["uc010whl.2"]]                     # 21 introns
297 | ```
298 | 
299 | ### [rtracklayer][]
300 | 
301 | The [rtracklayer][] package allows us to query the UCSC genome
302 | browser, as well as providing `import()` and
303 | `export()` functions for common annotation file formats like
304 | GFF, GTF, and BED.
305 | 
306 | *Exercise*
307 | 
308 | Here we use [rtracklayer][] to retrieve estrogen receptor
309 | binding sites identified across cell lines in the ENCODE project. We
310 | focus on binding sites in the vicinity of a particularly interesting
311 | region of the genome.
312 | 
313 | 1. Define our region of interest by creating a `GRanges` instance with
314 |    appropriate genomic coordinates. Our region corresponds to 10Mb up-
315 |    and down-stream of a particular gene.
316 | 2. Create a session for the UCSC genome browser
317 | 3. Query the UCSC genome browser for ENCODE estrogen receptor
318 |    ERalpha$_a$ transcription marks; identifying the appropriate track,
319 |    table, and transcription factor requires biological knowledge and
320 |    detective work.
321 | 4. Visualize the location of the binding sites and their scores;
322 |    annotate the mid-point of the region of interest.
323 | 
324 | *Solution*
325 | 
326 | Define the region of interest
327 | 
328 | ```{r rtracklayer-roi}
329 | library(GenomicRanges)
330 | roi <- GRanges("chr10", IRanges(92106877, 112106876, names="ENSG00000099194"))
331 | ``` 
332 | 
333 | Create a session
334 | 
335 | ```{r rtracklayer-session}
336 | library(rtracklayer) 
337 | session <- browserSession()
338 | ``` 
339 | 
340 | Query the UCSC for a particular track, table, and transcription
341 | factor, in our region of interest
342 | 
343 | ```{r rtracklayer-marks}
344 | trackName <- "wgEncodeRegTfbsClusteredV2"
345 | tableName <- "wgEncodeRegTfbsClusteredV2"
346 | trFactor <- "ERalpha_a"
347 | ucscTable <- getTable(ucscTableQuery(session, track=trackName,
348 |     range=roi, table=tableName, name=trFactor))
349 | ``` 
350 | 
351 | Visualize the result
352 | 
353 | ```{r rtracklayer-plot, fig.height=3}
354 | plot(score ~ chromStart, ucscTable, pch="+")
355 | abline(v=start(roi) + (end(roi) - start(roi) + 1) / 2, col="blue")
356 | ``` 
357 | 
358 | ## Variants
359 | 
360 | Follow the
361 | [Variants](http://bioconductor.org/help/workflows/variants/) work
362 | flow.
363 | 
364 | [AnnotationHub]: http://bioconductor.org/packages/release/bioc/html/AnnotationHub.html
365 | [BSgenome]: http://bioconductor.org/packages/release/bioc/html/BSgenome.html
366 | [BSgenome.Hsapiens.UCSC.hg19]: http://bioconductor.org/packages/release/data/annotation/html/BSgenome.Hsapiens.UCSC.hg19.html
367 | [Homo.sapiens]: http://bioconductor.org/packages/release/data/annotation/html/Homo.sapiens.html
368 | [Rsamtools]: http://bioconductor.org/packages/release/bioc/html/Rsamtools.html
369 | [TxDb.Hsapiens.UCSC.hg19.knownGene]: http://bioconductor.org/packages/release/data/annotation/html/TxDb.Hsapiens.UCSC.hg19.knownGene.html
370 | [biomaRt]: http://bioconductor.org/packages/release/bioc/html/biomaRt.html
371 | [org.Hs.eg.db]: http://bioconductor.org/packages/release/data/annotation/html/org.Hs.eg.db.html
372 | [org.Sc.sgd.db]: http://bioconductor.org/packages/release/data/annotation/html/org.Sc.sgd.db.html
373 | [rtracklayer]: http://bioconductor.org/packages/release/bioc/html/rtracklayer.html
374 | 


--------------------------------------------------------------------------------
/vignettes/B02.5_GeneSetEnrichment.Rmd:
--------------------------------------------------------------------------------
 1 | <!--
 2 | %\VignetteIndexEntry{02.5 Gene Set Enrichment}
 3 | %\VignettePackage{LearnBioconductor}
 4 | %\VignetteEngine{knitr::knitr}
 5 | -->
 6 | 
 7 | ```{r setup, echo=FALSE}
 8 | library(LearnBioconductor)
 9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
10 | ```
11 | 
12 | ```{r style, echo = FALSE, results = 'asis'}
13 | BiocStyle::markdown()
14 | ```
15 | 
16 | # Gene Set Enrichment and Pathway Analysis
17 | 
18 | Martin Morgan<br/>
19 | October 28, 2014
20 | 
21 | My overview of approaches to [gene set
22 | enrichment](http://bioconductor.org/help/course-materials/2013/EMBOBGI/GeneSetEnrichment.pdf),
23 | with a sketch of relevant _Bioconductor_ packages.


--------------------------------------------------------------------------------
/vignettes/C03.1_LargeData.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{03.1 Working with Large Data}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()
 14 | ```
 15 | 
 16 | # Working with Large Data
 17 | 
 18 | Martin Morgan<br/>
 19 | October 29, 2014
 20 | 
 21 | ## Scalable computing
 22 | 
 23 | 1. Efficient _R_ code
 24 |    - Vectorize!
 25 |    - Reuse others' work -- `r Biocpkg("DESeq2")`,
 26 |      `r Biocpkg("GenomicRanges")`, `r Biocpkg("Biostrings")`, ...,
 27 |      `r CRANpkg("dplyr")`, `r CRANpkg("data.table")`, `r CRANpkg("Rcpp")`
 28 |    - Useful tools: `system.time()`, `Rprof()`, `r CRANpkg("microbenchmark")`
 29 |    - More detail in
 30 |      [deadly sins](http://bioconductor.org/help/course-materials/2014/CSAMA2014/1_Monday/labs/IntermediateR.html#efficient-code)
 31 |      of a previous course.
 32 | 2. Iteration
 33 |    - Chunk-wise
 34 |    - `open()`, read chunk(s), `close()`.
 35 |    - e.g., `yieldSize` argument to `Rsamtools::BamFile()`
 36 | 3. Restriction
 37 |    - Limit to columns and / or rows of interest
 38 |    - Exploit domain-specific formats, e.g., BAM files and
 39 |      `Rsamtools::ScanBamParam()`
 40 |    - Use a data base
 41 | 4. Sampling
 42 |    - Iterate through large data, retaining a manageable sample, e.g.,
 43 |      `ShortRead::FastqSampler()`
 44 | 5. Parallel evaluation
 45 |    - **After** writing efficient code
 46 |    - Typically, `lapply()`-like operations
 47 |    - Cores on a single machine ('easy'); clusters (more tedious);
 48 |      clouds
 49 | 
 50 | ## Parallel evaluation in _Bioconductor_
 51 | 
 52 | - [BiocParallel][] -- `bplapply()` for `lapply()`-like functions,
 53 |   increasingly used by package developers to provide easy, standard
 54 |   way of gaining parallel evaluation.
 55 | - [GenomicFiles][] -- Framework for working on groups of files,
 56 |   ranges, or ranges x files
 57 | - Bioconductor
 58 |   [AMI](http://bioconductor.org/help/bioconductor-cloud-ami/) (Amazon
 59 |   Machine Image) including pre-configured StarCluster.
 60 | 
 61 | ## Lab
 62 | 
 63 | ### Efficient code
 64 | 
 65 | Write the following as a function. Use `system.time()` to explore how
 66 | long this takes to execute as `n` increases from 100 to 10000. Use
 67 | `identical()` and `r CRANpkg("microbenchmark")` to compare the
 68 | alternatives `f1()`, `f2()`, and `f3()` for both correctness and
 69 | performance. What strategies are these functions
 70 | using?
 71 | 
 72 | ```{r benchmark}
 73 | f0 <- function(n) {
 74 |     ## inefficient!
 75 |     ans <- numeric()
 76 |     for (i in seq_len(n))
 77 |         ans <- c(ans, exp(i))
 78 |     ans
 79 | }
 80 | 
 81 | f1 <- function(n) {
 82 |     ans <- numeric(n)
 83 |     for (i in seq_len(n))
 84 |         ans[[i]] <- exp(i)
 85 |     ans
 86 | }
 87 | 
 88 | f2 <- function(n)
 89 |     sapply(seq_len(n), exp)
 90 | 
 91 | f3 <- function(n)
 92 |     exp(seq_len(n))
 93 | ```
 94 | 
 95 | ### Sleeping serially and in parallel
 96 | 
 97 | `fun()` sleeps for 1 second, then returns `i`. Evaluated serially for 8 values, e.g., `f0(8)`, this takes about 8 seconds.
 98 | 
 99 | ```{r parallel-sleep}
100 | library(BiocParallel)
101 | 
102 | fun <- function(i) {
103 |     Sys.sleep(1)
104 |     i
105 | }
106 | 
107 | ## serial
108 | f0 <- function(n)
109 |     lapply(seq_len(n), fun)
110 | 
111 | ## parallel
112 | f1 <- function(n)
113 |     bplapply(seq_len(n), fun)
114 | ```
115 | 
116 | ### Counting overlaps -- our own version
117 | 
118 | Regions of interest, named like the chromosomes in the bam file.
119 | 
120 | ```{r count-overlaps-roi, eval=FALSE}
121 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
122 | exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx")
123 | 
124 | map0 <- read.delim("~/igv/genomes/hg19_alias.tab", header=FALSE, 
125 |     stringsAsFactors=FALSE)
126 | map <- setNames(map0$V1, map0$V2)
127 | seqlevels(exByTx, force=TRUE) <- map
128 | ```
129 | 
130 | A function to iterate through a bam file
131 | 
132 | ```{r count-overlaps, eval=FALSE}
133 | count1 <- function(filename, roi) {
134 |     ## Count the reads in BAM file 'filename' that fall within each element
135 |     ## of 'roi', reading the file in chunks; returns a vector of counts.
136 |     bf <- BamFile(filename, yieldSize=1000000)   # chunks of 1e6 records
137 |     open(bf)
138 |     on.exit(close(bf))              # close the file even if an error occurs
139 |     n <- 0                          # number of reads examined
140 |     count <- integer(length(roi))   # running count of reads overlapping roi
141 |     names(count) <- names(roi)
142 | 
143 |     ## read in and count chunks of data, until done
144 |     repeat {
145 |         ## input
146 |         aln <- readGAlignments(bf)   # input next chunk
147 |         if (length(aln) == 0)        # stopping condition
148 |             break
149 |         n <- n + length(aln)         # how are we doing?
150 |         message(n)
151 | 
152 |         ## overlaps
153 |         olaps <- findOverlaps(aln, roi, type="within", ignore.strand=TRUE)
154 |         count <- count + tabulate(subjectHits(olaps), subjectLength(olaps))
155 |     }
156 | 
157 |     ## return the counts, named like 'roi'; the BAM file is closed by the
158 |     ## on.exit() handler registered above
159 |     count
160 | }
161 | ```
162 | 
163 | In action
164 | 
165 | ```{r count-overlaps-doit, eval=FALSE}
166 | filename <- "~/bam/SRR1039508_sorted.bam"
167 | count <- count1(filename, exByTx)
168 | ```
169 | 
170 | Parallelize
171 | 
172 | ```{r count-overlaps-parallel, eval=FALSE}
173 | library(BiocParallel)
174 | 
175 | ## all bam files (full paths), named by sample: file name minus '_sorted.bam'
176 | filenames <- dir("~/bam", pattern="\\.bam$", full.names=TRUE)
177 | names(filenames) <- sub("_sorted\\.bam$", "", basename(filenames))
178 | 
179 | ## iterate: one call to count1() per file, then bind results as columns
180 | counts <- bplapply(filenames, count1, exByTx)
181 | counts <- simplify2array(counts)
182 | head(counts)
183 | ```
184 | 
185 | ## Resources
186 | 
187 | - Lawrence, M, and Morgan, M. 2014. Scalable Genomics with R and
188 |   Bioconductor. Statistical Science 2014, Vol. 29, No. 2,
189 |   214-226. http://arxiv.org/abs/1409.2864v1
190 | 
191 | [BiocParallel]: http://bioconductor.org/packages/release/bioc/html/BiocParallel.html
192 | [GenomicFiles]: http://bioconductor.org/packages/release/bioc/html/GenomicFiles.html
193 | 
194 | 


--------------------------------------------------------------------------------
/vignettes/C03.2_CodeToPackages.Rmd:
--------------------------------------------------------------------------------
  1 | <!--
  2 | %\VignetteIndexEntry{03.2 Organizing Code in Functions, Files, and Packages}
  3 | %\VignettePackage{LearnBioconductor}
  4 | %\VignetteEngine{knitr::knitr}
  5 | -->
  6 | 
  7 | ```{r setup, echo=FALSE}
  8 | library(LearnBioconductor)
  9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 10 | ```
 11 | 
 12 | ```{r style, echo = FALSE, results = 'asis'}
 13 | BiocStyle::markdown()
 14 | ```
 15 | 
 16 | # Organizing Code in Functions, Files, and Packages
 17 | 
 18 | Martin Morgan<br/>
 19 | October 29, 2014
 20 | 
 21 | ## Programming
 22 | 
 23 | _R_ is a programming language
 24 | 
 25 | - Your own functions
 26 | 
 27 |   ```{r fun}
 28 |   fun <- function(n) {
 29 |       x <- rnorm(n)
 30 |       y <- x + rnorm(n, sd=.5)
 31 |       lm(y ~ x)
 32 |   }
 33 |   ```
 34 | 
 35 |     - Check out the _RStudio_ function wizard!
 36 | 
 37 | - Iteration
 38 | 
 39 |   ```{r iteration, eval=FALSE}
 40 |   ## 'side effects'
 41 |   for (i in 1:10)
 42 |       message(i)
 43 | 
 44 |   ## result for subsequent computation
 45 |   sapply(1:10, function(i) {
 46 |       sum(rnorm(1000))
 47 |   })
 48 |   ```
 49 |   
 50 |     - Other iteration paradigms: `lapply()`, `apply()`, `mapply() / Map()`.
 51 | 
 52 | - Conditional execution
 53 | 
 54 |   ```{r conditional, eval=FALSE}
 55 |   sapply(1:10, function(i) {
 56 |       if (sum(rnorm(1000)) > 0) {     # sum of 1000 random normal deviates
 57 |           "hi!"
 58 |       } else {
 59 |           "low!"
 60 |       }
 61 |   })
 62 |   ```
 63 | 
 64 | ## From script to package
 65 | 
 66 | Scripts contain a combination of data and transformations. Often the
 67 | transformations are idiosyncratic, and rely heavily on functions
 68 | provided by various packages. Sometimes the script contains a useful
 69 | chunk of code that could be reused in different places. Examples we've
 70 | encountered in this course might include GC-content of DNA sequences
 71 | (or is there a function for that already? check out `r
 72 | Biocpkg("Biostrings")`!) and creating a simple 'map' from one type of
 73 | annotation to another.
 74 | 
 75 | It is easy and beneficial to create a package. 
 76 | 
 77 | - Only need to think carefully about how to implement the function once.
 78 | - Code in the package is reused -- if you correct an error, then all
 79 |   your scripts automatically benefit.
 80 | - Share with your lab mates / colleagues for consistent results, and
 81 |   with the wider community for fame and glory.
 82 |   
 83 | What is an _R_ package?
 84 | 
 85 | - Simple directory structure of required files. Minimal:
 86 | 
 87 |         MyPackage
 88 |         |-- DESCRIPTION (title, author, version, etc.)
 89 |         |-- NAMESPACE (imports / exports)
 90 |         |-- R (R function definitions)
 91 |             |-- gc.R
 92 |             |-- annoHelper.R
 93 |         |-- man (help pages)
 94 |             |-- MyPackage.Rd
 95 |             |-- gc.Rd
 96 |             |-- annoHelper.Rd
 97 |   
 98 | Check out the _RStudio_ package wizard!
 99 | 
100 | ## Lab
101 | 
102 | ### GC-content
103 | 
104 | Write a function that takes a `DNAStringSet` and returns the GC content.
105 | 
106 | Modify the function using a conditional statement to work whether
107 | provided a `DNAString` or a `DNAStringSet`. Test the function.
108 | 
109 | Save the function in a file on your AMI.
110 | 
111 | ### Annotation-helper
112 | 
113 | Write a function that takes as its argument Ensembl gene identifiers
114 | (like the `rownames()` of the _SummarizedExperiment_ object in the
115 | RNASeq vignette yesterday) and uses the `select()` method and
116 | annotation package `r Biocannopkg("org.Hs.eg.db")` to return a named
117 | character vector, where the names of the vector are the Ensembl
118 | identifiers and the values are the corresponding gene SYMBOLs. Adopt
119 | some simple-to-implement policy for handling Ensembl identifiers that
120 | map to more than one gene symbol. Save this function to another file.
121 | 
122 | ### A package
123 | 
124 | Use the _RStudio_ wizard to create a package from the files containing
125 | your GC-content and annotation-helper functions.
126 | 


--------------------------------------------------------------------------------
/vignettes/C03.3_ReproducibleResearch.Rmd:
--------------------------------------------------------------------------------
 1 | <!--
 2 | %\VignetteIndexEntry{03.3 Reproducible Research}
 3 | %\VignettePackage{LearnBioconductor}
 4 | %\VignetteEngine{knitr::knitr}
 5 | -->
 6 | 
 7 | ```{r setup, echo=FALSE}
 8 | library(LearnBioconductor)
 9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
10 | ```
11 | 
12 | ```{r style, echo = FALSE, results = 'asis'}
13 | BiocStyle::markdown()
14 | ```
15 | 
16 | # Reproducible Research
17 | 
18 | Martin Morgan<br/>
19 | October 29, 2014
20 | 
21 | ## Importance of reproducibility
22 | 
23 | A [cautionary tale](http://bioconductor.org/help/course-materials/2013/EMBOBGI/reproducible-research.pdf) 
24 | 
25 | ## Facilities enabling reproducible analysis in _R_
26 | 
27 | Basics -- quickly becomes unsatisfactory
28 | 
29 | - 'Old-school' text scripts
30 | - Comments describing approaches
31 | - Package versions
32 | 
33 | Vignettes -- 'literate programming'
34 | 
35 | - Markdown
36 | - _R_ markdown; weaving and tangling
37 | - _LaTeX_
38 | 
39 | ## More advanced considerations
40 |     
41 | Version control
42 | 
43 | - Alternative (better!) to using cryptic file names to indicate
44 |   different versions
45 | - Create a 'repository', add / edit files, 'commit' changes
46 | - Easily review 'diff'erences, restore previous versions, ...
47 | - _RStudio_
48 | 
49 |     - File --> New Project --> New Directory --> Empty Project. Click
50 |       'Create git repository'.
51 |     - File --> New File --> R Script. Edit. 
52 |     - 'Git' icon on menu bar --> Commit 
53 | 
54 | Packages
55 | 
56 | - The _RStudio_ 'wizard'
57 | - Benefits
58 | 
59 |     - Code re-use standardizes analysis
60 |     - No need to copy / paste code
61 |     - Easy to share with colleagues (work group, company, world)
62 | 
63 | ## Lab
64 | 
65 | Produce a short 'vignette' summarizing your RNA-seq work
66 | yesterday. The code chunks might be along the lines of
67 | 
68 | ```{r workflow-code-chunks, eval=FALSE}
69 | library(airway)
70 | library(DESeq2)
71 | 
72 | data(airway)
73 | 
74 | dds <- DESeqDataSet(airway, design = ~ cell + dex)
75 | dds$dex <- relevel(dds$dex, "untrt")
76 | dds <- DESeq(dds)
77 | res <- results(dds)
78 | ```
79 | 
80 | Embed this in textual description with relevant descriptive
81 | information (title, author, date) as well as text describing nuances
82 | of each step, plus figures or tables of your choosing to illustrate
83 | relevant aspects of the experimental design or results.
84 | 
85 | Render this as an HTML document to share with your colleagues at home.
86 | 


--------------------------------------------------------------------------------
/vignettes/C03.4_Visualization.Rmd:
--------------------------------------------------------------------------------
 1 | <!--
 2 | %\VignetteIndexEntry{03.4 Visualization}
 3 | %\VignettePackage{LearnBioconductor}
 4 | %\VignetteEngine{knitr::knitr}
 5 | -->
 6 | 
 7 | ```{r setup, echo=FALSE}
 8 | library(LearnBioconductor)
 9 | stopifnot(BiocInstaller::biocVersion() == "3.0")
10 | ```
11 | 
12 | ```{r style, echo = FALSE, results = 'asis'}
13 | BiocStyle::markdown()
14 | ```
15 | 
16 | # Visualization
17 | 
18 | Martin Morgan<br/>
19 | October 29, 2014
20 | 
21 | [Three
22 | approaches](http://bioconductor.org/help/course-materials/2014/summerx/Visualization-slides.pdf)
23 | 
24 | - 'base' graphics
25 | - _lattice_
26 | - `r CRANpkg("ggplot2")`
27 |   
28 |   - A neat _ggplot2_ example:
29 |     [slide 19+](http://bioconductor.org/help/course-materials/2014/CSAMA2014/2_Tuesday/lectures/Visualization_in_Statistical_Genomics-Carey.pdf)
30 |     from Vince Carey's presentation.
31 | 
32 | 
33 | Genome-centric
34 | 
35 | - `r Biocpkg("Gviz")` for visualizing genomic regions
36 | - `r Biocpkg("ggbio")` provides extensive options
37 | 
38 | Interactive visualization
39 | 
40 | - `r Biocpkg("rtracklayer")` for visualizing genomic ranges and
41 |   managing UCSC genome browser sessions.
42 | - `r Biocpkg('epivizr')` for interactive display
43 | - `r CRANpkg("shiny")` [web](http://shiny.rstudio.com) for interactive
44 |   apps.
45 | 
46 | Reports
47 | 
48 | - `r Biocpkg("ReportingTools")` for easy report templates
49 | 
50 | And...
51 | 
52 | - `r CRANpkg("RColorBrewer")` and
53 |   [web site](http://colorbrewer2.org/): helps choose sensible color
54 |   schemes.
55 | - [Accidental aRt](http://accidental-art.tumblr.com/archive)
56 | - [stick figure](https://github.com/EconometricsBySimulation/R-Graphics/blob/master/Stick-Figures/draw.stick.R)
57 | 
58 | ## Lab
59 | 
60 | ### Gviz / ggbio
61 | 
62 | Work through
63 | [vignette section 2](http://bioconductor.org/packages/release/bioc/vignettes/Gviz/inst/doc/Gviz.pdf)
64 | of the `r Biocpkg('Gviz')` package, and then the 'Plotting fold changes
65 | in genomic space' section of the [RNASeq lab](B02.1.1_RNASeqLab.html).
66 | 
67 | Peruse the 
68 | [vignette](http://bioconductor.org/packages/release/bioc/vignettes/ggbio/inst/doc/ggbio.pdf) of the `r Biocpkg('ggbio')` package.
69 | 
70 | Run the following _DESeq2_ work flow to arrive at a top-table; coerce
71 | the result to a `data.frame`.
72 | 
73 | ```{r ggplot-setup, eval=FALSE}
74 | library(DESeq2)
75 | library(airway)
76 | data(airway)
77 | se = airway
78 | dds <- DESeqDataSet(se, design = ~ cell + dex)
79 | dds$dex <- relevel(dds$dex, "untrt")
80 | dds <- DESeq(dds)
81 | res <- results(dds)
82 | resdf <- as.data.frame(res)
83 | ```
84 | 
85 | A 'volcano plot' shows the relationship between P-value and log fold
86 | change. Here's a basic volcano plot using base graphics; create a
87 | volcano plot using _ggplot2_ and / or _lattice_.
88 | 
89 | ```{r volcano, eval=FALSE}
90 | plot(-log10(padj) ~ log2FoldChange, resdf)
91 | ```
92 | 
93 | 


--------------------------------------------------------------------------------
/vignettes/D04.1_InstallIGV.Rmd:
--------------------------------------------------------------------------------
 1 | <!--
 2 | %\VignetteIndexEntry{04.1 Appendix: Install IGV}
 3 | %\VignettePackage{LearnBioconductor}
 4 | %\VignetteEngine{knitr::knitr}
 5 | -->
 6 | 
 7 | ```{r setup, echo=FALSE}
 8 | stopifnot(BiocInstaller::biocVersion() == "3.0")
 9 | ```
10 | 
11 | ```{r style, echo = FALSE, results = 'asis'}
12 | BiocStyle::markdown()
13 | ```
14 | ## Appendix 1: Installing and using IGV
15 | 
16 | _NOTE_: All instructions in this document should be performed
17 | **on your laptop**, not on the RStudio Server AMI.
18 | 
19 | * We'll first create a directory called "igv" in your
20 |   home directory. You can determine your home directory by issuing
21 |   the following command in a Terminal window on Linux or Mac:
22 | 
23 |     echo $HOME
24 | 
25 |   On Windows, open a command window by clicking Start, then Run, then 
26 |   typing `cmd` and pressing Enter. In the window, type
27 | 
28 |     echo %USERPROFILE%
29 | 
30 | * So go to the indicated directory, and then issue the command
31 |   
32 |     mkdir igv
33 | 
34 |   That will create an `igv` directory in your home directory.
35 | 
36 | * Copy the file [hg19_alias.tab](hg19_alias.tab) to this
37 | directory. 
38 | This is a simple tab-delimited file that maps between the sequence
39 | names used by the alignment, and the sequence names known to IGV.
40 | 
41 | * Install Java if you haven't already. Go to 
42 |   [https://www.java.com/en/](https://www.java.com/en/) and
43 |   click on Free Java Download. Download and install.
44 | 
45 | * Download IGV from [https://www.broadinstitute.org/igv/download](https://www.broadinstitute.org/igv/download). There are several
46 | ways to do this; the easiest is to click on one of the Java Web Start
47 | links. Make sure you have more memory available than what is listed below the Launch button. You may need to change the security settings on your
48 | computer to allow IGV to launch.
49 | 
50 | * With IGV running, let's change the default genome. It's currently
51 |   set to Human hg18, so click on the dropdown in the upper left
52 |   that says "Human hg18". Click on "More..." and then in the
53 |   `Filter` box, start typing `hg19`. `Human hg19` will then show
54 |   up in the box and you can double-click on it. We're now using
55 |   Human hg19 as our default genome.
56 | 
57 | * You can now open bam files in IGV. Here are the URLs of 
58 |   BAM files for this class. You can open them by clicking on
59 |   IGV's `File` menu, then clicking on `Load From URL...`.
60 |   You can paste one of these URLs into the box and click OK:
61 | 
62 | <pre>
63 | 
64 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039508_sorted.bam
65 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039509_sorted.bam
66 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039512_sorted.bam
67 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039513_sorted.bam
68 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039516_sorted.bam
69 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039517_sorted.bam
70 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039520_sorted.bam
71 |     http://s3-us-west-2.amazonaws.com/oct2014bamfiles/SRR1039521_sorted.bam
72 |   
73 | </pre>
74 | 
75 |   You may see a warning that the bam file does not contain any sequence names which match the current genome. You can ignore this (click OK).
76 | 
77 | * Zoom in to a particular gene, e.g., SPARCL1, by entering the gene
78 |    symbol in the box toward the top center of the browser window. Then click `Go`. Adjust
79 |    the zoom until reads come in to view, and interpret the result.
80 | 
81 | 


--------------------------------------------------------------------------------
/vignettes/hg19_alias.tab:
--------------------------------------------------------------------------------
 1 | gi|224384768|gb|CM000663.1| chr1
 2 | gi|224384767|gb|CM000664.1| chr2
 3 | gi|224384766|gb|CM000665.1| chr3
 4 | gi|224384765|gb|CM000666.1| chr4
 5 | gi|224384764|gb|CM000667.1| chr5
 6 | gi|224384763|gb|CM000668.1| chr6
 7 | gi|224384762|gb|CM000669.1| chr7
 8 | gi|224384761|gb|CM000670.1| chr8
 9 | gi|224384760|gb|CM000671.1| chr9
10 | gi|224384759|gb|CM000672.1| chr10
11 | gi|224384758|gb|CM000673.1| chr11
12 | gi|224384757|gb|CM000674.1| chr12
13 | gi|224384756|gb|CM000675.1| chr13
14 | gi|224384755|gb|CM000676.1| chr14
15 | gi|224384754|gb|CM000677.1| chr15
16 | gi|224384753|gb|CM000678.1| chr16
17 | gi|224384752|gb|CM000679.1| chr17
18 | gi|224384751|gb|CM000680.1| chr18
19 | gi|224384750|gb|CM000681.1| chr19
20 | gi|224384749|gb|CM000682.1| chr20
21 | gi|224384748|gb|CM000683.1| chr21
22 | gi|224384747|gb|CM000684.1| chr22
23 | 


--------------------------------------------------------------------------------
/vignettes/our_figures/GRanges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/GRanges.png


--------------------------------------------------------------------------------
/vignettes/our_figures/GRangesList.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/GRangesList.png


--------------------------------------------------------------------------------
/vignettes/our_figures/RangeOperations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/RangeOperations.png


--------------------------------------------------------------------------------
/vignettes/our_figures/SequencingEcosystem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/SequencingEcosystem.png


--------------------------------------------------------------------------------
/vignettes/our_figures/SequencingEcosystem_no_bioc_pkgs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/SequencingEcosystem_no_bioc_pkgs.png


--------------------------------------------------------------------------------
/vignettes/our_figures/Solexa-bridge-pcr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/Solexa-bridge-pcr.jpg


--------------------------------------------------------------------------------
/vignettes/our_figures/SummarizedExperiment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/SummarizedExperiment.png


--------------------------------------------------------------------------------
/vignettes/our_figures/copy_number_QC_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/copy_number_QC_2.png


--------------------------------------------------------------------------------
/vignettes/our_figures/cross_validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/cross_validation.png


--------------------------------------------------------------------------------
/vignettes/our_figures/knn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/knn.png


--------------------------------------------------------------------------------
/vignettes/our_figures/nrg2825-f2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Bioconductor/LearnBioconductor/f1f94b661f33cc911ac06bfffc823c6690fc84f7/vignettes/our_figures/nrg2825-f2.jpg


--------------------------------------------------------------------------------