├── .gitignore
├── .md
├── LICENSE
├── README.md
├── Rscripts
├── cheung.R
├── cheungSubset.R
├── cltmovie.R
├── dilution.R
├── gsea.R
├── limma_quiz.R
├── make_two_tables_for_class.R
├── microarraydata-eda-lab.R
└── read_tcga_meth.R
├── advinference
├── eda_for_highthroughput.Rmd
├── inference_for_highthroughput.Rmd
├── intro_to_highthroughput_data.Rmd
├── multiple_testing.Rmd
├── quick_Bioc_intro.Rmd
└── storage
│ ├── confounding.Rmd
│ ├── crossvalidation.Rmd
│ ├── distance_lecture.Rmd
│ ├── heatmaps.Rmd
│ ├── hierarchical_modeling.R
│ ├── justsvd_duplicate_content.txt
│ ├── modeling.Rmd
│ ├── multtest.Rmd
│ ├── pca_svd.Rmd
│ ├── prediction.Rmd
│ ├── sva.Rmd
│ ├── svacombat.Rmd
│ └── transformations.Rmd
├── batch
├── adjusting_with_factor_analysis.Rmd
├── adjusting_with_linear_models.Rmd
├── confounding.Rmd
├── eda_with_pca.Rmd
├── factor_analysis.Rmd
└── intro_to_batch_effects.Rmd
├── bioc
├── EDA_plots_for_NGS.Rmd
├── HPCami.Rmd
├── aaFinalSummary.Rmd
├── anno4liftover.Rmd
├── annoCheat.Rmd
├── archWk4basic.Rmd
├── background.Rmd
├── biocparallel.Rmd
├── biological_versus_technical_var.Rmd
├── biostrings.Rmd
├── c1.all.v4.0.entrez.gmt
├── eset.Rmd
├── eset_sumexp.Rmd
├── gene_set_analysis.Rmd
├── gene_set_analysis_in_R.Rmd
├── ggbioNote.Rmd
├── grangesERBSExample.Rmd
├── importBed.Rmd
├── import_NGS.Rmd
├── inference_with_bioc.Rmd
├── installing_Bioconductor_finding_help.Rmd
├── iranges_granges.Rmd
├── moreGR.Rmd
├── normalization.Rmd
├── operateGRanges.Rmd
├── read_counting.Rmd
├── reading_microarray_data.Rmd
├── seq4motif.Rmd
├── storage
│ ├── EDA_plots_for_microarray.Rmd
│ ├── GEOquery.Rmd
│ ├── anno1refbuilds.Rmd
│ ├── anno2Biostrings.Rmd
│ ├── anno3GRanges.Rmd
│ ├── anno4liftover.Rmd
│ ├── anno5genes.Rmd
│ ├── annoPhen.Rmd
│ ├── basic_Bioconductor_infrastructure.Rmd
│ ├── basic_inference_microarray.Rmd
│ ├── chromComp.Rmd
│ ├── chromIntro.Rmd
│ ├── confounding.Rmd
│ ├── mapping_features.Rmd
│ ├── probeSearch.Rmd
│ ├── svacombat.Rmd
│ ├── using_limma.Rmd
│ └── using_limma_old_no_comments.Rmd
├── tophat.md
├── using_limma.Rmd
└── visualizing_NGS.Rmd
├── biocadv_6x
├── bioc2_HPCami.Rmd
├── bioc2_externData.Rmd
├── bioc2_ggbio.Rmd
├── bioc2_gvfeat.Rmd
├── bioc2_hybstor.Rmd
├── bioc2_integExamps.Rmd
├── bioc2_nosql.Rmd
├── bioc2_ov.Rmd
├── bioc2_parallel.Rmd
├── bioc2_rainfall.Rmd
├── bioc2_repro1.Rmd
├── bioc2_rpacks.Rmd
├── bioc2_shiny.Rmd
├── bioc2_vizNGS.Rmd
├── bioc2_vizOv.Rmd
├── esHclust.Rmd
├── finalViz.Rmd
├── multiOOM.Rmd
└── tcga.Rmd
├── biocintro_5x
├── WhatWeMeas.Rmd
├── bioc1_align.Rmd
├── bioc1_annoCheat.Rmd
├── bioc1_annoOverview.Rmd
├── bioc1_btvari.Rmd
├── bioc1_geneset_1.Rmd
├── bioc1_grangeOps.Rmd
├── bioc1_igranges.Rmd
├── bioc1_liftOver.Rmd
├── bioc1_limma.Rmd
├── bioc1_mgt_gsd.Rmd
├── bioc1_multibed.Rmd
├── bioc1_roast.Rmd
├── bioc1_summex.Rmd
├── bioc1_t_mult.Rmd
├── biointro.Rmd
├── biomotiv.Rmd
├── dataman2017.Rmd
├── dataman2019.Rmd
├── dataman2022.Rmd
└── optalign.Rmd
├── chipseq
├── ChIPseq.Rmd
├── ChIPseq_quiz.R
└── MACS.txt
├── eda
├── exploratory_data_analysis.Rmd
└── plots_to_avoid.Rmd
├── example.Rmd
├── footnotes.R
├── highdim
├── PCA.Rmd
├── distance.Rmd
├── images
│ └── handmade
│ │ ├── Heatmap.png
│ │ ├── SVD1.png
│ │ ├── SVD2.png
│ │ └── animals.png
├── mds.Rmd
├── pca_motivation.Rmd
├── projections.Rmd
├── rotations.Rmd
└── svd.Rmd
├── inference
├── R_refresher.Rmd
├── association_tests.Rmd
├── clt_and_t-distribution.Rmd
├── clt_in_practice.Rmd
├── confidence_intervals.Rmd
├── monte_carlo.Rmd
├── permutation_tests.Rmd
├── populations_and_samples.Rmd
├── power_calculations.Rmd
├── random_variables.Rmd
└── t-tests_in_practice.Rmd
├── intro
├── dplyr_intro.Rmd
├── dplyr_tutorial.Rmd
├── getting_started.Rmd
├── github.Rmd
├── introduction.Rmd
├── math_notation.Rmd
└── system_files.Rmd
├── linear
├── collinearity.Rmd
├── expressing_design_formula.Rmd
├── interactions_and_contrasts.Rmd
├── linear_models_going_further.Rmd
├── linear_models_in_practice.Rmd
├── linear_models_intro.Rmd
├── qr_and_regression.Rmd
└── standard_errors.Rmd
├── list_libs.sh
├── makefile
├── matrixalg
├── intro_using_regression.Rmd
├── matrix_algebra_examples.Rmd
├── matrix_notation.Rmd
└── matrix_operations.Rmd
├── methyl
├── epiviz.Rmd
├── inference_for_DNAmeth.Rmd
├── methylation.Rmd
└── minfi.Rmd
├── ml
├── clustering_and_heatmaps.Rmd
├── conditional_expectation.Rmd
├── crossvalidation.Rmd
├── machine_learning.Rmd
└── smoothing.Rmd
├── modeling
├── bayes-gif.R
├── bayes.Rmd
├── hierarchical_models.Rmd
└── modeling.Rmd
├── renaming_map.md
├── rnaseq
├── airway_sample_table.csv
├── fastq.md
├── genome_align_STAR.md
├── r_bioc_links.md
├── rnaseq_exon_usage.Rmd
├── rnaseq_gene_level.Rmd
├── rnaseq_isoform_cummerbund.Rmd
├── rnaseq_pkgs.R
├── storage
│ ├── RNAseq_quiz.R
│ └── cufflinks.txt
└── trancsript_align_RSEM.md
├── robust
├── ranktest.Rmd
└── robust_summaries.Rmd
└── variants
├── SNP.Rmd
└── SNP_quiz.R
/.gitignore:
--------------------------------------------------------------------------------
1 | footnotes.md
2 | */figure
3 | */cache
4 | */.cache
5 | */*.html
6 | */*.md
7 | */*.RData
8 | */*.txt
9 | */*.csv
10 | */*.tsv
11 | */*.tab
12 | */*.bam
13 | */*.bai
14 | */*.fasta
15 | */*.fai
16 | */*.bgz
17 | */*.tbi
18 | */.DS_Store
19 | */.Rhistory
20 | *_exercises
21 | *.Rproj
22 | .Rproj.user
23 |
--------------------------------------------------------------------------------
/.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/.md
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 Rafael Irizarry and Michael Love
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### Data Analysis for the Life Sciences
2 |
3 | #### NEWS:
4 |
5 | September 16, 2015 : We are reorganizing the labs here for the new courses launching this Fall.
6 | We have decided to drop the `course1` style of directory structure, as the number of courses
7 | is still in flux. We are now using a modular structure. See `renaming_map.md` for how courses
8 | were remapped to new names.
9 |
10 | #### Book versions
11 |
12 | Compiled versions of this document as HTML can be found here:
13 |
14 | http://genomicsclass.github.io/book/
15 |
16 | The ePub version of this document can be found on *Leanpub*:
17 |
18 | https://leanpub.com/dataanalysisforthelifesciences/
19 |
20 | #### Pull requests and issues
21 |
22 | We greatly appreciate all of our readers who contribute pull requests!
23 |
24 | If you want to contribute through pull request, please first clone a *new version* of the repo. If you have a version of the repo from 2014, it will contain some large data objects, which accidentally snuck in, and we won't be able to accept your pull request.
25 |
26 | Please do not add an issue which says "I couldn't knit the Rmd". This is nearly always because users are missing one or more of the libraries and datasets used within (we do not re-install libraries in each Rmd script as this would slow down our compilation of the book material). You will find the missing library if you step through the Rmd one chunk at a time.
27 |
--------------------------------------------------------------------------------
/Rscripts/cheung.R:
--------------------------------------------------------------------------------
1 | library(GEOquery)
2 | gse <- getGEO("GSE5859")
3 | pd<-pData(gse[[1]])
4 | library(affy)
5 | filenames<-file.path("GSE5859",basename(as.character(pd[,38])))
6 | e=justRMA(filenames=filenames)
7 | dates<-vector("character",ncol(exprs(e)))
8 | for(i in seq(along=dates)){
9 | tmp<-affyio::read.celfile.header(filenames[i],info="full")
10 | dates[i]<-strsplit(tmp$ScanDate,"\ ")[[1]][1]
11 | }
12 | dates<-as.Date(dates,"%m/%d/%y")
13 |
14 | ###ethnicity info obtained from Jeff Leek
15 | eth <- readLines("fulldata_reprecent.txt",n=1)
16 | eth=strsplit(eth,"\ ")[[1]]
17 | eth=t(sapply(strsplit(eth,"_"),function(x) x))
18 |
19 | gmnames<-gsub("_rep[12]","",as.character(pd[,1]))
20 |
21 | eth2<-eth[match(gmnames,eth[,2]),1]
22 | eth2[is.na(eth2)]<-"HAN" ##from LA, checked here
23 | ##http://ccr.coriell.org/Sections/Search/Advanced_Search.aspx?PgId=175
24 |
25 |
26 | pd=data.frame(ethnicity=eth2,date=dates,filename=I(basename(filenames)))
27 | pData(e)<-pd
28 | save(e,file="GSE5859.rda")
29 |
--------------------------------------------------------------------------------
/Rscripts/cheungSubset.R:
--------------------------------------------------------------------------------
1 | library(Biobase)
2 | library(GSE5859)
3 | library(hgfocus.db) ##get the gene chromosome
4 | data(GSE5859)
5 | annot <- select(hgfocus.db, keys=featureNames(e), keytype="PROBEID",
6 | columns=c("CHR", "CHRLOC", "SYMBOL"))[,-4]
7 | ##for genes with multiples, pick one
8 | annot <-annot[match(featureNames(e),annot$PROBEID),]
9 | annot$CHR <- ifelse(is.na(annot$CHR),NA,paste0("chr",annot$CHR))
10 | y<- colMeans(exprs(e)[which(annot$CHR=="chrY"),])
11 | sex <- ifelse(y<4.5,"F","M")
12 |
13 | sampleInfo <- pData(e)
14 | sampleInfo$group <- ifelse(sex=="F",1,0)
15 |
16 | batch <- format(pData(e)$date,"%y%m")
17 | ind<-which(batch%in%c("0506","0510"))
18 | set.seed(1)
19 | N <- 12; N1 <-3; M<-12; M1<-9
20 | ind <- c(sample(which(batch=="0506" & sex=="F"),N1),
21 | sample(which(batch=="0510" & sex=="F"),N-N1),
22 | sample(which(batch=="0506" & sex=="M"),M1),
23 | sample(which(batch=="0510" & sex=="M"),M-M1))
24 |
25 | geneExpression <- exprs(e)[,ind]
26 | sampleInfo <- sampleInfo[ind,]
27 | geneAnnotation <- annot
28 |
29 | save(geneExpression,sampleInfo,geneAnnotation,file="GSE5859Subset.rda")
30 |
--------------------------------------------------------------------------------
/Rscripts/cltmovie.R:
--------------------------------------------------------------------------------
1 | dat<-read.csv("http://www.biostat.jhsph.edu/bstcourse/bio751/data/USheights_subsample.csv")
2 |
3 | library(animation)
4 | saveGIF({
5 | set.seed(1)
6 | N=10
7 | L<-1000
8 | means<-vector("numeric",L)
9 | LIM=seq(69.28-4,69.28+4,0.33/sqrt(N)*sqrt(10))
10 | LIM2=seq(69.28-4,69.28+4,0.1)
11 | for(i in 1:L){
12 | means[i]<-mean(sample(dat$Height[dat$Gender==1],N))
13 | if(i%%20==1){
14 | dd=dnorm(LIM2,mean(dat$Height[dat$Gender==1]),sd(dat$Height[dat$Gender==1])/sqrt(N))
15 | tmp=hist(means[1:i],ylim=c(0,150),xlim=range(LIM),breaks=LIM,freq=TRUE,xlab="average height",ylab="Density",main=paste0("N=",N))
16 | k=sum(tmp$counts)/sum(dd)*length(dd)/length(tmp$counts) ##this is a normalizing constant to assure same are on plot
17 | lines(LIM2,dd*k,type="l",col=2,lwd=2)
18 | }
19 | }
20 | },'clt10.gif', interval = .05)
21 |
22 |
23 |
--------------------------------------------------------------------------------
/Rscripts/dilution.R:
--------------------------------------------------------------------------------
1 | library(affy) ## script: read the raw CEL files, RMA-process them and save the result as 'dilution'
2 | fns <- list.celfiles(path="CEL",full=TRUE) ## CEL file paths, including the "CEL/" directory prefix
3 | pData <- read.table("dilution_pdata.txt",header=TRUE) ## sample table; must have a 'filename' column
4 | stopifnot(all(pData$filename == sub("CEL\\/(.+)\\.cel","\\1",fns))) ## sub() strips directory and extension so names can be compared; the former grep() call never compared them
5 | celData <- ReadAffy(filenames=fns,phenoData=pData,verbose=TRUE) ## rows of pData are assumed to be in the same order as fns (checked above)
6 | dilution <- rma(celData,verbose=TRUE)
7 | save(dilution, file="dilution.RData")
8 |
--------------------------------------------------------------------------------
/Rscripts/gsea.R:
--------------------------------------------------------------------------------
1 | tab <- read.delim("gseacelfiles/reannotate_select_cal.gct",as.is=TRUE,skip=2)
2 | library(affy)
3 | fns <- list.celfiles(path="gseacelfiles")
4 |
5 | sns <- gsub("\\.CEL\\.gz","",fns)
6 | tmp<-strsplit(names(tab)[-c(1,2)],"\\.")
7 | sns2 <- sapply(tmp,function(x)x[2])
8 | tmp2 <- data.frame(t(sapply(tmp,function(x) strsplit(x[1],"_")[[1]])))
9 | filenames <- fns[match(sns2,sns)]
10 | ab<- ReadAffy(filenames=filenames,celfile.path="gseacelfiles")
11 | e<-rma(ab)
12 | dates<-vector("character",ncol(exprs(e)))
13 | for(i in seq(along=dates)){
14 | tmp<-affyio::read.celfile.header(file.path("gseacelfiles",filenames[i]),info="full")
15 | dates[i]<-strsplit(tmp$ScanDate,"\ ")[[1]][1]
16 | }
17 | dates<-as.Date(dates,"%m/%d/%y")
18 |
19 | tmp2$dates <- dates
20 |
21 | pData(e)<-tmp2
22 |
23 |
24 | save(e,file="gsea.rda")
25 |
26 | ##adding MAS 5.0
27 | library(simpleaffy)
28 | m <- justMAS(ab)
29 | pData(m)<-tmp2
30 | save(m,file="gseamas5.rda")
31 |
--------------------------------------------------------------------------------
/Rscripts/limma_quiz.R:
--------------------------------------------------------------------------------
1 | # biocLite("gaschYHS")
2 | library(gaschYHS)
3 | data(gaschYHS)
4 | e <- gaschYHS
5 | head(pData(e)[,c("time","status")],12)
6 |
7 | e <- e[,1:8]
8 | e <- e[!apply(exprs(e), 1, anyNA),]
9 |
10 | # question 1
11 |
12 | condition <- factor(rep(1:2,c(5,3)))
13 | design <- model.matrix(~ condition)
14 | library(limma)
15 | fit <- lmFit(e, design)
16 | fit <- eBayes(fit)
17 | (tt <- topTable(fit, coef=2))
18 | tt["YDR171W",]
19 |
20 | # question 2
21 |
22 | e = e[,1:5]
23 | time <- pData(e)$time
24 | design <- model.matrix(~ time + I(time^2))
25 |
26 | fit <- lmFit(e, design)
27 | fit <- eBayes(fit)
28 | (tt <- topTable(fit, coef=2:3))
29 | tt["YGR211W",]
30 |
31 |
--------------------------------------------------------------------------------
/Rscripts/make_two_tables_for_class.R:
--------------------------------------------------------------------------------
1 | library(minfi)
2 | datadir="/home/bst/other/hcorrada/methyl/exps/tcga/raw_data/colon"
3 |
4 | clinicalDir=file.path(datadir,"Clinical/Biotab")
5 | sample_tab=read.delim(file.path(clinicalDir,"biospecimen_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE)
6 | keep=sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal")
7 | sample_tab=sample_tab[keep,]
8 |
9 | patient_id=unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-")))
10 |
11 | tumor_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"]
12 | normal_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Solid Tissue Normal"]
13 |
14 | # read tumor data
15 | tumor_tab=read.delim(file.path(clinicalDir,"biospecimen_tumor_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE)
16 |
17 | ###make it slightly harder by changing a name
18 | write.csv(tumor_tab,file="tumor_tab.csv",row.names=FALSE)
19 | write.csv(sample_tab,file="sample_tab.csv",row.names=FALSE)
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/Rscripts/microarraydata-eda-lab.R:
--------------------------------------------------------------------------------
1 | # biocLite("SpikeIn")
2 | library(SpikeIn)
3 | data("SpikeIn95")
4 | ##from previous data exploration we know that array 55 is bad. we pick
5 | ##two groups with same 52-55 and 56-59
6 | int=pm(SpikeIn95)[,52:59] ###int for intensity
7 | spikeInDesign=pData(SpikeIn95)[52:59,]
8 |
9 | cdfname <- getCdfInfo(SpikeIn95)
10 | psets <- as.list(cdfname)
11 | psets <- psets[order(names(psets))]
12 | index <- unlist(sapply(psets, function(x) x[, 1]), use.names = FALSE)
13 | locations <- indices2xy(index,cdf="hgu95acdf")
14 |
15 | save(int,spikeInDesign,locations,file="spikeInSubset.rda")
16 |
--------------------------------------------------------------------------------
/Rscripts/read_tcga_meth.R:
--------------------------------------------------------------------------------
1 | library(minfi)
2 | ## DOwnload
3 | ## colon/DNA_Methylation/JHU_USC__HumanMethylation450/Level_1/
4 | ## from tcga and put in datadir
5 | datadir="/datadir"
6 |
7 |
8 | clinicalDir=file.path(datadir,"Clinical/Biotab")
9 | sample_tab=read.delim(file.path(clinicalDir,"biospecimen_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE)
10 | keep=sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal")
11 | sample_tab=sample_tab[keep,]
12 |
13 | patient_id=unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-")))
14 |
15 | tumor_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"]
16 | normal_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Solid Tissue Normal"]
17 |
18 | # read tumor data
19 | tumor_tab=read.delim(file.path(clinicalDir,"biospecimen_tumor_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE)
20 |
21 | tab=merge(sample_tab, tumor_tab, by="bcr_sample_uuid", suffixes=c(".sample",".tumor"),all.x=TRUE)
22 |
23 | # read normal data
24 | normal_tab=read.delim(file.path(clinicalDir,"biospecimen_normal_control_coad.txt"),sep="\t",stringsAsFactors=FALSE)
25 | tab=merge(tab, normal_tab, by="bcr_sample_uuid", suffixes=c(".tumor",".normal"),all.x=TRUE)
26 |
27 | tab$bcr_patient_barcode=tab$bcr_patient_barcode.tumor
28 |
29 | ii=is.na(tab$bcr_patient_barcode)
30 | tab$bcr_patient_barcode[ii]=tab$bcr_patient_barcode.normal[ii]
31 |
32 | # read patient data
33 | patient_tab=read.delim(file.path(clinicalDir,"clinical_patient_coad.txt"),sep="\t",stringsAsFactors=FALSE)
34 | names(patient_tab)=paste("patient",names(patient_tab),sep=".")
35 | tmp=merge(tab,patient_tab,by.x="bcr_patient_barcode",by.y="patient.bcr_patient_barcode",all.x=TRUE,suffixes=c(".sample",".patient"))
36 | tab=tmp
37 |
38 | # read meth metadata
39 | methMetaDir=file.path(datadir,"METADATA/JHU_USC__HumanMethylation450")
40 | methMeta_tab=read.delim(file.path(methMetaDir,"jhu-usc.edu_COAD.HumanMethylation450.1.4.0.sdrf.txt"),sep="\t",stringsAsFactors=FALSE)
41 |
42 | sample_barcode=sapply(strsplit(methMeta_tab$Comment..TCGA.Barcode.,split="-"),function(x) paste(x[1:4],collapse="-"))
43 | m=match(tab$bcr_sample_barcode,sample_barcode)
44 | tab$Basename=gsub("_Grn\\.idat","",methMeta_tab$Array.Data.File[m])
45 | tab=tab[!is.na(tab$Basename),]
46 |
47 | basedir=file.path(datadir,"DNA_Methylation/JHU_USC__HumanMethylation450/Level_1")
48 | tab$Basename=file.path(basedir,tab$Basename) ## full path prefix of each array (without the _Grn/_Red suffix)
49 | keep=file.exists(paste(tab$Basename,"_Grn.idat",sep="")) ## TRUE where the green-channel IDAT is actually on disk
50 | colon_targets=tab[keep,] ## apply the mask: only samples whose IDAT exists can be read later on
51 | objs=grep("tab",ls(),value=TRUE)
52 | rm(list=objs)
53 | objs=grep("dir",ls(),value=TRUE,ignore=TRUE)
54 | rm(list=objs)
55 |
56 | nms=names(colon_targets)
57 | targets=colon_targets[nms]
58 |
59 | targets$Status=factor(ifelse(targets$sample_type=="Primary Tumor","cancer","normal"),levels=c("normal","cancer"))
60 | targets$Tissue=tolower(targets$patient.tumor_tissue_site)
61 | targets$Sex=targets$patient.gender
62 |
63 | datadir="rdas"; dir.create(datadir,showWarnings=FALSE) ## make sure the output directory exists before saving
64 | save(list=intersect(c("targets","colon_targets","breast_targets","lung_targets"),ls()),file=file.path(datadir,"targets.rda")) ## breast/lung tables are not created by this script; save only the objects that exist
65 |
66 | ##read raw data
67 | rgset <- read.450k(targets$Basename,verbose=TRUE)
68 | pData(rgset)<-targets
69 |
70 | #normalize with illumina default
71 | mset1<-preprocessIllumina(rgset)
72 | mset1<-mapToGenome(mset1)
73 | meth <- getBeta(mset1,type="Illumina")
74 | gr <- granges(mset1)
75 | pd <- pData(mset1)
76 |
77 | save(meth,gr,pd,file="coloncancermeth.rda")
78 |
--------------------------------------------------------------------------------
/advinference/quick_Bioc_intro.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Advanced Statistics for the Life Sciences"
3 | author: "Rafa"
4 | date: "January 31, 2015"
5 | output: html_document
6 | layout: page
7 | ---
8 |
9 | ```{r options, echo=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
12 | ```
13 |
14 | # Installing Bioconductor
15 |
16 | Many of the datasets we will use in this chapter require packages made available via the Bioconductor project. Bioconductor is similar to CRAN but uses a different set of functions for downloads. It also includes many more data packages as well as _annotation_ packages that store information about either high-throughput products or information about molecular endpoints such as genes. We will need some of these packages in this chapter. Here we show how to install the Biobase package.
17 |
18 | ```{r,eval=FALSE}
19 | source("http://bioconductor.org/biocLite.R")
20 | biocLite("Biobase")
21 | ```
22 |
23 | You can install a suite of recommended packages by simply typing `biocLite()`
24 |
25 | # Data organized in three tables
26 |
27 | One of the great advantages of using Bioconductor for high throughput data is that it provides object classes specifically designed to keep high throughput data organized. Below we show an example of how the three tables that are needed to conduct data analysis are available from Bioconductor data objects. For example for gene expression we can use the ExpressionSet object.
28 |
29 | ```{r,message=FALSE}
30 | library(Biobase)
31 | ##can be installed like this: devtools::install_github("genomicsclass/GSE5859")
32 | library(GSE5859)
33 | data(GSE5859)
34 | class(e)
35 | ```
36 |
37 |
38 | These objects were originally designed for gene expression data so the methods to extract the high throughput measurements have related names:
39 | ```{r}
40 | dat <- exprs(e)
41 | dim(dat)
42 | ```
43 |
44 | The information about samples is also stored in this object and the functions to create it try to guarantee that the columns of `exprs(e)` match the rows of the sample information table. `pData` is used as shorthand for _phenotype_ data:
45 |
46 |
47 | ```{r}
48 | sampleInfo <- pData(e)
49 | dim(sampleInfo)
50 | head(sampleInfo)
51 | ```
52 |
53 | A final table, which we will cover in much more detail in the Bioconductor chapter, is a table that describes the rows, in this case genes. Because each product will have a different table, these have already been created in Bioconductor. Because there are certain products that are widely used, Bioconductor makes databases available from which you can extract this information. This way, every object does not have to carry around this information:
54 |
55 | ```{r}
56 | library(hgfocus.db)
57 | annot <- select(hgfocus.db, keys=featureNames(e), keytype="PROBEID", columns=c("CHR", "CHRLOC", "SYMBOL"))
58 | ##pick one
59 | annot <-annot[match(featureNames(e),annot$PROBEID),]
60 | head(annot)
61 | dim(annot)
62 | ```
63 |
--------------------------------------------------------------------------------
/advinference/storage/crossvalidation.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Cross-validation
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | In this lab, we will explore a method for picking parameters in a
12 | prediction / machine learning task, which is called
13 | *cross-validation*.
14 |
15 | Suppose we have a prediction algorithm which is going to predict the
16 | class of some observations using a number of features. For example, we
17 | will use the gene expression values to predict the tissue type in our
18 | tissues gene expression dataset.
19 |
20 | If this algorithm has a parameter which controls the behavior, we
21 | might pick the value of this parameter which minimizes the
22 | classification error. However, trying to classify the same
23 | observations as we use to *train* the model can be misleading.
24 | In lecture, we saw that for K-nearest neighbors, using k=1 will always
25 | give 0 classification error in the training set (because we use the
26 | single observation to classify itself). Instead, it's better to pick
27 | the parameter using the algorithm's performance on a set of
28 | observations which the algorithm has never seen, a *test* set.
29 |
30 | Cross-validation is simply a method which splits the data into a
31 | number of *folds*. If we have N folds, then the algorithm typically
32 | trains on (N-1) of the folds, and tests the algorithm's performance on
33 | the left-out single fold. This is then repeated N times until each
34 | fold has been used as a *test* set.
35 |
36 | Let's load in the tissue gene expression dataset:
37 |
38 | ```{r}
39 | # library(devtools)
40 | # install_github("dagdata","genomicsclass")
41 | library(dagdata)
42 | data(tissuesGeneExpression)
43 | library(Biobase)
44 | rownames(tab) <- tab$filename
45 | t <- ExpressionSet(e, AnnotatedDataFrame(tab))
46 | t$Tissue <- factor(t$Tissue)
47 | colnames(t) <- paste0(t$Tissue, seq_len(ncol(t)))
48 | ```
49 |
50 | Let's drop one of the tissues which doesn't have many samples:
51 |
52 | ```{r}
53 | library(class)
54 | table(t$Tissue)
55 | t <- t[,t$Tissue != "placenta"]
56 | t$Tissue <- droplevels(t$Tissue)
57 | table(t$Tissue)
58 | x <- t(exprs(t))
59 | ```
60 |
61 | We will use the `createFolds` function from the `caret`
62 | package to make 5 folds of the data, which are
63 | balanced over the tissues. Don't be confused that the
64 | `createFolds` function uses the same letter 'k' as the k in
65 | K-nearest neighbors. These 'k' are unrelated.
66 | The caret function `createFolds` is
67 | asking for how many folds to create, the 'N' from above. The `knn`
68 | function is asking how many closest observations to use to classify
69 | the test observations.
70 |
71 | ```{r}
72 | # install.packages("caret")
73 | library(caret)
74 | set.seed(1)
75 | idx <- createFolds(t$Tissue, k=5)
76 | sapply(idx, function(i) table(t$Tissue[i]))
77 | ```
78 |
79 | Now we can try out the K-nearest neighbors method on a single fold:
80 |
81 | ```{r}
82 | pred <- knn(train = x[ -idx[[1]], ], test = x[ idx[[1]], ], cl=t$Tissue[ -idx[[1]] ], k=5)
83 | table(true=t$Tissue[ idx[[1]] ], pred)
84 | ```
85 |
86 | As the prediction is looking too good in the space of all the genes,
87 | let's make it more difficult for the K-nearest neighbors algorithm.
88 | We will use a reduced dimension representation of the dataset, using
89 | the *multi-dimensional scaling* algorithm used in the previous section.
90 |
91 | ```{r}
92 | xsmall <- cmdscale(dist(x))
93 | ```
94 |
95 | Now we will create a loop, which tries out each value of k from 1 to
96 | 12, and runs the K-nearest neighbors algorithm on each fold. We then
97 | ask for the proportion of errors for each fold, and report the average
98 | from the 5 cross-validation folds:
99 |
100 | ```{r}
101 | set.seed(1)
102 | ks <- 1:12
103 | res <- sapply(ks, function(k) {
104 | # try out each version of k from 1 to 12
105 |
106 | res.k <- sapply(seq_along(idx), function(i) {
107 | # loop over each of the 5 cross-validation folds
108 |
109 | # predict the held-out samples using k nearest neighbors
110 | pred <- knn(train = xsmall[ -idx[[i]], ],
111 | test = xsmall[ idx[[i]], ],
112 | cl = t$Tissue[ -idx[[i]] ], k = k)
113 |
114 | # the ratio of misclassified samples
115 | mean(t$Tissue[ idx[[i]] ] != pred)
116 | })
117 |
118 | # average over the 5 folds
119 | mean(res.k)
120 | })
121 | ```
122 |
123 | Now we can plot the mean misclassification rate for each value of k:
124 |
125 | ```{r}
126 | plot(ks, res, type="o")
127 | ```
128 |
--------------------------------------------------------------------------------
/advinference/storage/heatmaps.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Dimension reduction and heatmaps
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Dimension reduction
12 |
13 | We start loading the tissue gene expression dataset:
14 |
15 | ```{r}
16 | # library(devtools)
17 | # install_github("dagdata","genomicsclass")
18 | library(dagdata)
19 | data(tissuesGeneExpression)
20 | library(Biobase)
21 | rownames(tab) <- tab$filename
22 | t <- ExpressionSet(e, AnnotatedDataFrame(tab))
23 | t$Tissue <- factor(t$Tissue)
24 | colnames(t) <- paste0(t$Tissue, seq_len(ncol(t)))
25 | ```
26 |
27 | As we noticed in the end of the clustering section, we weren't able to
28 | *see* why the k-means algorithm defined a certain set of clusters
29 | using only the first two genes.
30 |
31 | ```{r}
32 | x <- t(exprs(t))
33 | km <- kmeans(x, centers=3)
34 | library(rafalib)
35 | mypar()
36 | plot(x[,1], x[,2], col=km$cluster, pch=16)
37 | ```
38 |
39 | Instead of the first two genes, let's use the *multi-dimensional
40 | scaling* algorithm which Rafa introduced in the lectures. This is a
41 | projection from the space of all genes to a two dimensional space,
42 | which mostly preserves the inter-sample distances. The `cmdscale`
43 | function in R takes a distance object and returns a matrix which has
44 | two dimensions (columns) for each sample.
45 |
46 | ```{r}
47 | mds <- cmdscale(dist(x))
48 | plot(mds, col=km$cluster, pch=16)
49 | ```
50 |
51 | We can also plot the names of the tissues with the color of the cluster.
52 |
53 | ```{r}
54 | plot(mds, type="n")
55 | text(mds, colnames(t), col=km$cluster)
56 | ```
57 |
58 | ...or the names of the tissues with the color of the tissue.
59 |
60 | ```{r}
61 | plot(mds, type="n")
62 | text(mds, colnames(t), col=as.fumeric(t$Tissue))
63 | ```
64 |
65 | ## Heatmaps
66 |
67 | Heatmaps are useful plots for visualizing the expression values for a
68 | subset of genes over all the samples. The *dendrogram* on top and on
69 | the side is a hierarchical clustering as we saw before. First we will
70 | use the `heatmap` available in base R. First define a color palette.
71 |
72 | ```{r}
73 | # install.packages("RColorBrewer")
74 | library(RColorBrewer)
75 | hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
76 | ```
77 |
78 | Now, pick the genes with the top variance over all samples:
79 |
80 | ```{r}
81 | library(genefilter)
82 | rv <- rowVars(exprs(t))
83 | idx <- order(-rv)[1:40]
84 | ```
85 |
86 | Now we can plot a heatmap of these genes:
87 |
88 | ```{r}
89 | heatmap(exprs(t)[idx,], col=hmcol)
90 | ```
91 |
92 | The `heatmap.2` function in the `gplots` package on CRAN is a bit more
93 | customizable, and stretches to fill the window. Here we add colors to
94 | indicate the tissue on the top:
95 |
96 | ```{r}
97 | # install.packages("gplots")
98 | library(gplots)
99 | cols <- brewer.pal(8, "Dark2")[t$Tissue] # index the Dark2 colors by tissue level; palette(value) would return the *previous* palette and change the global one
100 | cbind(colnames(t),cols) # show which color was assigned to each sample
101 | heatmap.2(exprs(t)[idx,], trace="none", ColSideColors=cols, col=hmcol)
102 | ```
103 |
104 |
--------------------------------------------------------------------------------
/advinference/storage/hierarchical_modeling.R:
--------------------------------------------------------------------------------
1 | # The following script produces the plots seen in the hierarchical modeling lecture.
2 | # These are also produced in the using_limma.Rmd file.
3 |
4 | # biocLite("SpikeInSubset")
5 | library(SpikeInSubset)
6 | data(rma95)
7 | library(genefilter)
8 | fac <- factor(rep(1:2,each=3))
9 | tt <- rowttests(exprs(rma95),fac)
10 | mask <- with(tt, abs(dm) < .2 & p.value < .01)
11 | spike <- rownames(rma95) %in% colnames(pData(rma95))
12 | cols <- ifelse(mask,"red",ifelse(spike,"dodgerblue","black"))
13 |
14 | with(tt, plot(-dm, -log10(p.value), cex=.8, pch=16,
15 | xlim=c(-1,1), ylim=c(0,5),
16 | xlab="difference in means",
17 | col=cols))
18 | abline(h=2,v=c(-.2,.2), lty=2)
19 |
20 | tt$s <- apply(exprs(rma95), 1, function(row) sqrt(.5 * (var(row[1:3]) + var(row[4:6]))))
21 | with(tt, plot(s, -log10(p.value), cex=.8, pch=16,
22 | log="x",xlab="estimate of standard deviation",
23 | col=cols))
24 |
25 | library(limma)
26 | fit <- lmFit(rma95, model.matrix(~ fac))
27 | ebfit <- ebayes(fit)
28 | limmares <- data.frame(dm=coef(fit)[,"fac2"], p.value=ebfit$p.value[,"fac2"])
29 | with(limmares, plot(dm, -log10(p.value),cex=.8, pch=16,
30 | col=cols,xlab="difference in means",
31 | xlim=c(-1,1), ylim=c(0,5)))
32 | abline(h=2,v=c(-.2,.2), lty=2)
33 |
34 |
35 | # Connect each selected gene's variance estimate before (tt$s^2) and after (ebfit$s2.post) the empirical Bayes fit.
36 | n <- 40
37 | qs <- seq(from=0,to=.2,length.out=n)
38 | # index of the first gene whose variance falls in each bin of qs; empty bins give NA and are dropped below
39 | idx <- match(seq_len(n), as.integer(cut(tt$s^2,qs)))
40 | idx <- idx[!is.na(idx)]
41 | par(mar=c(5,5,2,2))
42 | plot(1,1,xlim=c(0,.21),ylim=c(0,1),type="n",xlab="variance estimates",ylab="",yaxt="n")
43 | axis(2,at=c(.1,.9),c("before","after"),las=2)
44 | segments((tt$s^2)[idx],.1,ebfit$s2.post[idx],.9) # scalar heights: idx can be shorter than n, so rep(.,n) would recycle
45 |
46 |
--------------------------------------------------------------------------------
/advinference/storage/justsvd_duplicate_content.txt:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Principal component analysis and Singular value decomposition
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | knitr::opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) # knitr:: prefix: this file never calls library(knitr)
8 | ```
9 |
10 | We have measurements for $m$ genes and $n$ samples in a matrix $Y_{m\times n}$. Suppose we
11 | suspect that a batch effect is responsible for most of the variability. We know that some samples fall in one batch and the rest in another, but we don't know which sample is in which batch. Can we discover the batch? If we assume that many genes will have a different average in one batch compared to the other, then we can quantify this problem as searching for the separation that makes many of these differences in average large. To simplify and illustrate further, assume $n/2$ samples are in one batch and $n/2$ in the other, but we don't know which. Can we find the separation?
12 |
13 | Assume the gene in row $i$ is affected by batch. Then
14 | $$
15 | (Y_{i1}, \dots, Y_{in}) (v_1,\dots,v_n)' = \sum_{j=1}^n v_j Y_{ij}
16 | $$
17 | with each $v_j$ either $1/(n/2)$ or $-1/(n/2)$ will give us the average difference between the two batches for gene $i$, call it $m_i$. Because we think the batch affects many genes, we want to find the vector $v=(v_1,\dots,v_n)$ that maximizes the variance of $m_1,\dots,m_m$ (one value per gene).
18 |
19 | There is actually a nice mathematical result that can help us find this vector. In fact, if we let $v$ be any vector with standard deviation 1, then the $v$ that maximizes the variance of $Y_i v$ is called the first _principal component_ direction or eigen vector. The vector of "differences" $Y_i v$, $i=1,\dots,m$, is the first principal component, and below we will refer to this direction as $v_1$.
20 |
21 | Now, suppose we think there is more unwanted variability affecting several genes. We can subtract the first principal component from $Y_{m\times n}$, giving $r_{m\times n}=Y_{m \times n} - Y_{m \times n} v_1 v_1'$; we can then find the vector $v_2$ that results in the most variable vector $r_{m\times n} v_2$. We continue this way until we obtain $n$ eigen vectors $V_{n\times n} = (v_1,\dots, v_n)$.
22 |
23 | ## Singular value decomposition (SVD)
24 |
25 | The SVD is a very powerful mathematical result that gives us an algorithm to write a matrix in the following way:
26 |
27 | $$
28 | Y_{m\times n} = U_{m\times n} D_{n \times n} V'_{n \times n}
29 | $$
30 |
31 | Here $V$ is the matrix whose columns are the eigen vectors defined above. The matrices $U$ and $V$ are _orthogonal_, meaning that
32 | $U_i'U_i=1$ and $U_i'U_j=0$ for $i \neq j$, where $U_i$ and $U_j$ are the $i$th and $j$th columns of $U$ (and similarly for $V$).
33 |
34 | Notice this matrix:
35 | $$
36 | Y_{m\times n} V = U_{m \times n} D_{n\times n}
37 | $$
38 | has the principal components as columns, and the standard deviation of the $i$th principal component is proportional to $D_{i,i}$, because the sum of squares of the $i$th column is $D_{i,i}^2$:
39 | $$
40 | (Y_{m\times n} V)'(Y_{m\times n} V) = D_{n\times n} U'_{m\times n} U_{m\times n} D_{n\times n} = D^2_{n\times n}
41 | $$
42 |
43 | ## Example
44 | Let's consider a simple example. Suppose we have the heights of identical twin pairs in an $m\times 2$ matrix. We are asked to
45 |
46 | ```{r}
47 | library(MASS)
48 | set.seed(1)
49 | y <- mvrnorm(1000,c(0,0),3^2*matrix(c(1,.9,.9,1),2,2)) # 1000 bivariate normal pairs, correlation 0.9, variance 9
50 | rafalib::mypar(1,1) # rafalib:: prefix: this file never calls library(rafalib)
51 | plot(y,xlab="Twin 1 (inches away from avg)",ylab="Twin 2 (inches away from avg)")
52 | ```
53 |
54 |
55 | Transmitting the two heights seems inefficient given how correlated they are. If we transmit the principal components instead, we save money. Let's see how:
56 |
57 | ```{r}
58 | s=svd(y)
59 | plot(s$u[,1]*s$d[1],s$u[,2]*s$d[2],ylim=range(s$u[,1]*s$d[1]),xlab="First PC",ylab="Second PC")
60 | ```
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/advinference/storage/multtest.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Multiple testing
4 | ---
5 |
6 | The following code reproduces the images in the Multiple testing lecture.
7 |
8 | ```{r options, echo=FALSE}
9 | library(knitr)
10 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
11 | ```
12 |
13 | First we make a function for drawing sections of a uniform distribution.
14 |
15 | ```{r}
16 | # Draw a box of height y over [0,1] on a blank plot, with [0,a] shaded red and [a,1] shaded grey.
17 | # When boxes=TRUE, vertical dividers are added at 0, a, 2a, ...; xmax sets the right edge of the x axis.
18 | drawU <- function(a,y,boxes=FALSE,xmax=1) {
19 |   plot(1,1,type="n",xlim=c(0,xmax),ylim=c(0,1.2*y),bty="L",xlab="p",ylab="",las=1)
20 |   lines(c(0,1,1,0,0),c(0,0,y,y,0))
21 |   polygon(c(0,a,a,0),c(0,0,y,y),col=rgb(1,0,0,.5))
22 |   polygon(c(a,1,1,a),c(0,0,y,y),col=rgb(0,0,0,.1))
23 |   nbox <- 1/a
24 |   if (boxes) segments(0:nbox/nbox, 0, 0:nbox/nbox, y)
25 | }
26 | ```
27 |
28 | Here we draw a uniform distribution, and show how many p-values we expect at different cutoffs.
29 |
30 | ```{r}
31 | drawU(.3,1)
32 | drawU(1/20,1)
33 | drawU(1/20,20000/20,TRUE)
34 | drawU(1/100,20000/100,TRUE)
35 | drawU(1/1000,20000/1000,TRUE,.01)
36 | ```
37 |
38 | The distribution of p-values using a z-score.
39 |
40 | ```{r}
41 | z <- rnorm(100)
42 | brks <- 0:20/20
43 | hist(pnorm(z),col="grey",main="",xlab="p",breaks=brks)
44 | ```
45 |
46 | The same as above, but now with many more z-scores. Now the distribution looks more uniform.
47 |
48 | ```{r}
49 | z <- rnorm(10000)
50 | hist(pnorm(z),col="grey",main="",xlab="p",breaks=brks)
51 | ```
52 |
53 | What happens if we spike in 500 small z-scores?
54 |
55 | ```{r}
56 | z <- c(rnorm(10000), rep(-3.72,500))
57 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=brks)
58 | abline(h=10000/20,col="red",lwd=4,lty=3)
59 | abline(h=h$counts[1],col="blue",lwd=4,lty=3)
60 | ```
61 |
62 | Now, making the bins in the histogram smaller, i.e. looking for a smaller p-value threshold:
63 |
64 | ```{r}
65 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=0:50/50)
66 | abline(h=10000/50,col="red",lwd=4,lty=3)
67 | abline(h=h$counts[1],col="blue",lwd=4,lty=3)
68 | ```
69 |
70 | Even smaller...
71 |
72 | ```{r}
73 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=0:100/100)
74 | abline(h=10000/100,col="red",lwd=4,lty=3)
75 | abline(h=h$counts[1],col="blue",lwd=4,lty=3)
76 | ```
77 |
78 | Even smaller...
79 |
80 | ```{r}
81 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=0:10000/10000)
82 | abline(h=10000/10000,col="red",lwd=4,lty=3)
83 | abline(h=sum(h$counts[1]),col="blue",lwd=4,lty=3)
84 | ```
85 |
86 | This visualizes the [Benjamini Hochberg method](#foot).
87 |
88 | ```{r}
89 | set.seed(1)
90 | pvals <- c(runif(90),runif(10,0,.001))
91 | plot(sort(pvals),xlab="i",ylab="p-value",ylim=c(0,1))
92 | abline(0, .05/length(pvals))
93 | legend("top",expression(slope~alpha/m))
94 | ```
95 |
96 | ```{r}
97 | plot(sort(pvals),xlab="i",ylab="p-value",ylim=c(0,.03),xlim=c(0,13))
98 | abline(0, .05/length(pvals))
99 | legend("top",expression(slope~alpha/m))
100 | ```
101 |
102 | ## Footnotes
103 |
104 | Yoav Benjamini and Yosef Hochberg, "Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing". Journal of the Royal Statistical Society. 1995.
105 |
106 |
--------------------------------------------------------------------------------
/advinference/storage/pca_svd.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Running PCA and SVD in R
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | In this unit, we will show how to perform principal component analysis (PCA) and singular value decomposition (SVD) in R, and how the two are related to each other. We will use the tissue gene expression dataset from the week 5 lectures and labs.
12 |
13 | ```{r}
14 | # library(devtools)
15 | # install_github("dagdata","genomicsclass")
16 | library(dagdata)
17 | data(tissuesGeneExpression)
18 | library(rafalib)
19 | group <- as.fumeric(tab$Tissue)
20 | ```
21 |
22 |
23 | First, the typical principal component analysis on the samples would be to transpose the data such that the samples are rows of the data matrix. The `prcomp` function can be used to return the principal components and other variables.
24 |
25 | ```{r}
26 | x <- t(e)
27 | pc <- prcomp(x)
28 | # ?prcomp
29 | names(pc)
30 | plot(pc$x[,1], pc$x[,2], col=group, main="PCA", xlab="PC1", ylab="PC2")
31 | ```
32 |
33 | This PCA is equivalent to performing the SVD on the centered data, where the centering occurs on the columns (here genes). We can use the `sweep` function to perform arbitrary operations on the rows and columns of a matrix. The second argument specifies we want to operate on the columns (1 would be used for rows), and the third and fourth arguments specify that we want to subtract the column means.
34 |
35 | ```{r}
36 | cx <- sweep(x, 2, colMeans(x), "-")
37 | sv <- svd(cx)
38 | names(sv)
39 | plot(sv$u[,1], sv$u[,2], col=group, main="SVD", xlab="U1", ylab="U2")
40 | ```
41 |
42 | So the columns of U from the SVD correspond to the principal components `x` in the PCA, up to scale: `pc$x` equals U with each column multiplied by the matching diagonal element of D. Furthermore, the matrix V from the SVD is equivalent to the `rotation` matrix returned by `prcomp`.
43 |
44 | ```{r}
45 | sv$v[1:5,1:5]
46 | pc$rotation[1:5,1:5]
47 | ```
48 |
49 | The diagonal elements of D from the SVD are proportional to the standard deviations returned by PCA. The difference is that the standard deviations from `prcomp` are sample standard deviations (`prcomp` returns unbiased estimates of sample variance, so with the $n / (n - 1)$ correction). The elements of D are formed by taking the sum of the squares of the principal components but not dividing by the sample size.
50 |
51 | ```{r}
52 | head(sv$d^2)
53 | head(pc$sdev^2)
54 | head(sv$d^2 / (ncol(e) - 1))
55 | ```
56 |
57 | By dividing the variances by the sum, we get a plot of the ratio of variance explained by each principal component.
58 |
59 | ```{r}
60 | plot(sv$d^2 / sum(sv$d^2), xlim=c(0,15), type="b", pch=16,
61 | xlab="principal components",
62 | ylab="variance explained")
63 | plot(sv$d^2 / sum(sv$d^2), type="b", pch=16,
64 | xlab="principal components",
65 | ylab="variance explained")
66 | ```
67 |
68 | Note that not centering the data before running `svd` results in a slightly different plot:
69 |
70 | ```{r}
71 | svNoCenter <- svd(x)
72 | plot(pc$x[,1], pc$x[,2], col=group, main="PCA", xlab="PC1", ylab="PC2")
73 | points(0,0,pch=3,cex=4,lwd=4)
74 | plot(svNoCenter$u[,1], svNoCenter$u[,2], col=group, main="SVD not centered", xlab="U1", ylab="U2")
75 | ```
76 |
77 | # SVD on (genes vs samples) and (samples vs genes)
78 |
79 | Finally, we show that the SVD on the data matrix where samples are columns -- as used in the Surrogate Variable Analysis SVA -- is equivalent to the SVD on the data matrix where the samples are rows, if no centering has been done.
80 |
81 |
82 | ```{r}
83 | sv2 <- svd(t(e))
84 | plot(sv2$u[,1], sv2$u[,2], col=group, main="samples vs genes (typical PCA)", xlab="U1", ylab="U2")
85 | sv1 <- svd(e)
86 | plot(sv1$v[,1], sv1$v[,2], col=group, main="genes vs samples (SVA)", xlab="V1", ylab="V2")
87 | ```
88 |
89 | The question of which direction to center depends on what the focus of the analysis is. For comparing sample distances, as in the typical PCA plot, the rows are samples and the genes are centered. For finding genes which contribute to batch, as in the SVA model, the rows are genes and the samples are centered.
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/advinference/storage/sva.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Batch adjustment
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | To illustrate how we can adjust for batch effects using statistical methods, we will create a data example in which the outcome of interest is confounded with batch, but not completely. We will also select an outcome for which we have an expectation of what genes should be differentially expressed. Namely, we make sex the outcome of interest and expect genes on the Y chromosome to be differentially expressed. Note that we may also see genes from the X chromosome as differentially expressed, as some escape X inactivation.
12 |
13 | We start by finding the genes on the Y chromosome.
14 | ```{r}
15 | library(rafalib)
16 | library(GSE5859Subset)
17 | data(GSE5859Subset)
18 | y <- geneExpression-rowMeans(geneExpression)
19 | ```
20 |
21 |
22 | To illustrate the confounding we will pick some genes to show in a heatmap plot. We pick all Y chromosome genes, some genes that we see correlate with batch, and then some randomly selected genes.
23 | ```{r}
24 | ind1 <- which(geneAnnotation$CHR=="chrY") ##real differences
25 | month <- factor(format(sampleInfo$date,"%m"))
26 | tt<-genefilter::rowttests(y,month)
27 | ind2 <- setdiff(c(order(tt$dm)[1:25],order(-tt$dm)[1:25]),ind1)
28 | ###now pick at random from rest:
29 | set.seed(1)
30 | ind0 <- setdiff(sample(seq(along=tt$dm),50),c(ind2,ind1))
31 | geneindex<-c(ind2,ind0,ind1)
32 |
33 | mat<-geneExpression[geneindex,]
34 | mat <- mat-rowMeans(mat)
35 | ```
36 |
37 | Here is the data for the selected genes:
38 | ```{r}
39 | icolors <- rev(RColorBrewer::brewer.pal(11,"RdYlBu")) # explicit namespace: this file never calls library(RColorBrewer)
40 | mypar(1,1)
41 | image(t(mat),xaxt="n",yaxt="n",col=icolors)
42 | ```
43 |
44 | So what follows is like the analysis we would do in practice. We don't know there is a batch and we are interested in finding genes that are different between males and females. We start by computing t-statistics and p-values comparing males and females. We use histograms to notice the problem introduced by the batch.
45 |
46 | The batch effect adjustment methods are best described with linear models, so we start by writing down the linear model for this particular case:
47 |
48 |
49 |
50 | ## SVA
51 |
52 | ```{r}
53 | library(sva)
54 | library(limma)
55 | sex <- sampleInfo$group
56 | mod <- model.matrix(~sex)
57 | cind <- order( as.Date(sampleInfo$date) ) # samples in date order, used for every x axis below
58 | dates <- gsub("2005-","",sampleInfo$date)
59 | weights <- rep(1,nrow(y)) # start with equal gene weights; replaced at the end of each pass
60 | for(b in 1:5){
61 |   mypar2(1,1)
62 |   par(mar = c(4.1, 5.1, 3.5, 2.1))
63 |   layout(matrix(c(1:3),nrow=1),widths=c(5,1.5,5))
64 |   image(1:ncol(mat),1:nrow(mat),t(mat[,cind]*weights[geneindex]),xaxt="n",yaxt="n",col=icolors,xlab="",ylab="")
65 |   axis(side=1,seq(along=dates),dates[cind],las=2)
66 |   abline(v=12.5)
67 |   # left panel above: selected genes scaled by the current weights; below: refit sva with B=b
68 |   # and rebuild the gene weights from its posterior probabilities (pprob.gam and pprob.b)
69 |   svafit <- sva(y,mod,B=b,n.sv=5)
70 |   weights <- svafit$pprob.gam*(1-svafit$pprob.b)
71 |   ## Weighted SVD
72 |   surrogate <- svd( y*weights)$v[,1]
73 |   # middle panel: weights of the selected genes; right panel: first right singular vector by sample
74 |   image(matrix(weights[geneindex],nrow=1),xaxt="n",yaxt="n",col=RColorBrewer::brewer.pal(9,"Blues"))
75 |   plot(surrogate[cind],bg=sex[cind]+1,pch=21,xlab="",xaxt="n",ylab="Surrogate variable",ylim=c(-.5,.5),cex=1.5)
76 |   axis(side=1,seq(along=dates),dates[cind],las=2)
77 |   abline(v=12.5)
78 |   text(1,0.5,"June")
79 |   text(13.5,0.5,"Oct")
80 |   legend("bottomright",c("0","1"),col=c(1,2),pch=16)
81 | }
82 | ```
83 |
84 |
85 | ```{r}
86 | lmfit <- lmFit(dat,svaX) # NOTE(review): `dat` and `svaX` are not defined anywhere in this file — TODO confirm where they come from
87 | tt<-lmfit$coef[,2]*sqrt(lmfit$df.residual)/(2*lmfit$sigma) # per-gene statistic built from the second coefficient (presumably sex — TODO confirm)
88 | mypar(1,2)
89 | pval<-2*(1-pt(abs(tt),lmfit$df.residual[1])) # two-sided p-values from a t distribution
90 | hist(pval[!chr%in%c("chrX","chrY")],xlab="p-values",ylim=HLIM,main="") # NOTE(review): `chr` and `HLIM` are also undefined in this file — TODO confirm
91 | hist(pval[chr%in%c("chrY")],nc=20,xlab="p-value",ylim=c(0,9),main="")
92 | ```
93 |
94 | Decompose the data
95 | ```{r}
96 | Batch<- lmfit$coef[geneindex,3:7]%*%t(svaX[,3:7]) # fitted contribution of design columns 3-7 (presumably the surrogate variables — TODO confirm; `svaX` is not defined in this file)
97 | Signal<-lmfit$coef[geneindex,1:2]%*%t(svaX[,1:2]) # fitted contribution of design columns 1-2
98 | error <- dat[geneindex,]-Signal-Batch # what remains after removing both fitted parts; NOTE(review): `dat` is not defined in this file
99 | ##demean for plot
100 | Signal <-Signal-rowMeans(Signal)
101 | mat <- dat[geneindex,]-rowMeans(dat[geneindex,])
102 | mypar(1,4,mar = c(2.75, 4.5, 2.6, 1.1))
103 | image(t(mat),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n") # four panels on a shared color scale: data, Signal, Batch, error
104 | image(t(Signal),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n")
105 | image(t(Batch),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n")
106 | image(t(error),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n")
107 | ```
108 |
109 | ## Footnotes
110 |
111 | ### Principal Components Analysis (PCA)
112 |
113 | Jolliffe, Ian. Principal component analysis. John Wiley & Sons, Ltd, 2005.
114 |
115 | Dunteman, George H. Principal components analysis. No. 69. Sage, 1989.
116 |
--------------------------------------------------------------------------------
/advinference/storage/transformations.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Transformation
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Transformations
12 |
13 | ### Mean-variance relationship
14 |
15 | In microarrays and RNAseq data we observe strong variance dependence on mean.
16 | ```{r}
17 | if (!file.exists("bottomly_eset.RData")) download.file("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData","bottomly_eset.RData")
18 | load("bottomly_eset.RData")
19 | library("Biobase"); library("rafalib") # rafalib supplies mypar() and splot(), used in this and later chunks
20 | ind <- which(pData(bottomly.eset)$strain=="C57BL/6J") # keep the samples of a single strain
21 | Y <- exprs(bottomly.eset)[,ind]
22 | avgs<-rowMeans(Y)
23 | sds <-genefilter::rowSds(Y)
24 | mypar(1,1)
25 | splot(avgs,sds,log="xy",subset=which(avgs>0),xlab="Average",ylab="SD")
26 | ```
27 |
28 | This means that the larger values vary the most. If we need to compute a mean to, say, normalize, it will be highly sensitive to the variation of the max:
29 | ```{r}
30 | maxs <- apply(Y,2,max)
31 | sampleavgs <- colMeans(Y)
32 | plot(maxs,sampleavgs/min(sampleavgs),xlab="Max",ylab="Sample average increase",pch=21,bg=1,cex=1.5)
33 | ```
34 | The log transformation can remove the strong dependence.
35 |
36 | ```{r}
37 | lY <- log2(Y+0.5)
38 | lavgs<-rowMeans(lY)
39 | lsds <-genefilter::rowSds(lY)
40 | splot(lavgs,lsds,xlab="Average of log counts",ylab="SD of log counts")
41 | ```
42 |
43 | ```{r}
44 | lsampleavgs <- colMeans(lY)
45 | plot(maxs,sampleavgs/min(sampleavgs),xlab="Max",ylab="Sample average increase",bg=1,pch=21,cex=1.5)
46 | points(maxs,lsampleavgs/min(lsampleavgs),bg=2,pch=21,cex=1.5) # no xlab/ylab: points() cannot set axis labels and only warns about them
47 | legend("topleft",c("Original","Log"),pch=16,col=1:2,box.lwd=0)
48 | ```
49 |
50 |
--------------------------------------------------------------------------------
/bioc/EDA_plots_for_NGS.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Exploratory Data Analysis for NGS
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | This is a dataset produced by Bottomly et al., sequencing two strains of mouse with many biological replicates. This dataset and a number of other sequencing datasets have been compiled from raw data into read counts tables by Frazee, Langmead, and Leek as part of the ReCount project. These datasets are made publicly available at the following website:
12 |
13 | http://bowtie-bio.sourceforge.net/recount/
14 |
15 | Unlike many sequencing studies, Bottomly et al., realizing that such information is important for downstream analysis, provided the experiment number for all samples. Below we can see that the experimental batch explains more variation than the condition of interest: the strain of mouse.
16 |
17 | We can make similar figures for NGS to the ones shown in the previous sections. However, the log transform does not work because RNAseq data contains many 0s. One quick way to get around this is by adding a constant before taking the log. A typical one is 0.5 which gives us a log2 value of -1 for 0s.
18 |
19 | ```{r}
20 | if (!file.exists("bottomly_eset.RData")) download.file("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData","bottomly_eset.RData")
21 | load("bottomly_eset.RData")
22 | library("Biobase")
23 | exprs(bottomly.eset)[1,]
24 | pData(bottomly.eset)
25 | ```
26 |
27 | ```{r}
28 | Y <- log2(exprs(bottomly.eset) + 0.5) # the 0.5 offset keeps zero counts finite (log2 value of -1)
29 | # library(devtools)
30 | # install_github("rafalib","ririzarr")
31 | library("rafalib")
32 | mypar(1,1)
33 | for(i in seq_len(ncol(Y))){ # seq_len() stays empty when there are no columns, unlike 1:ncol(Y)
34 |   shist(Y[,i],unit=0.25,col=i,plotHist=FALSE,add=i!=1) # the first sample opens the plot, the rest are added to it
35 | }
36 | ```
37 |
38 | If we get rid of the zeros (i.e., those with log2 value of -1), we can more easily see the shape of the distribution for the expressed genes:
39 |
40 | ```{r}
41 | mypar(1,1)
42 | for(i in seq_len(ncol(Y))){ # seq_len() stays empty when there are no columns, unlike 1:ncol(Y)
43 |   idx <- Y[,i] > -1 # drop zero counts, which sit at exactly -1 after log2(count + 0.5)
44 |   shist(Y[idx,i],unit=0.25,col=i,plotHist=FALSE,add=i!=1)
45 | }
46 | ```
47 |
48 | Plotting two samples against each other shows the spreading of points at the low end of expression from the log transformation. This can also be seen with randomly generated Poisson data.
49 |
50 | ```{r}
51 | mypar(1,2)
52 | idx <- rowSums(Y[,1:2]) > 0
53 | plot(Y[idx,1], Y[idx,2], cex=.1)
54 | lambda <- rowMeans(2^Y[idx,1:2]) # per-gene mean of the back-transformed values, kept genes only (renamed from rm, which shadowed base::rm)
55 | simulated1 <- rpois(length(lambda), lambda) # one draw per kept gene; length(idx) counted every gene and recycled the means
56 | simulated2 <- rpois(length(lambda), lambda)
57 | plot(log2(simulated1 + .5), log2(simulated2 + .5), cex=.1)
58 | ```
59 |
60 | The MA plot is again easier to look at, in that we don't have to rotate our heads sideways by 45 degrees to see deviations from the diagonal.
61 |
62 | ```{r}
63 | mypar(1,1)
64 | maplot(Y[idx,1],Y[idx,2])
65 | ```
66 |
67 |
68 |
--------------------------------------------------------------------------------
/bioc/anno4liftover.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Chromosomes and their substructures 4: Translating addresses between genome builds"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 |
16 | ```{r setup,echo=FALSE,results="hide"}
17 | suppressPackageStartupMessages({
18 | library(BSgenome.Hsapiens.UCSC.hg19)
19 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
20 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
21 | library(Biostrings)
22 | library(GenomicRanges)
23 | library(IRanges)
24 | library(ph525x)
25 | library(Homo.sapiens)
26 | library(rtracklayer)
27 | })
28 | ```
29 |
30 | # Translating addresses between genome builds: liftOver
31 |
32 | The rtracklayer package includes an interface to the
33 | liftOver utilities developed for the UCSC genome browser.
34 | The idea is that a collection of local alignments
35 | can be defined and used to remap coordinates from
36 | one reference build to another.
37 |
38 | We can illustrate this with gene addresses created for hg38,
39 | the current reference build. We want to translate them
40 | for comparison to addresses asserted for hg19.
41 |
42 | We need a "chain file", uncompressed. You can
43 | get it from the following URL, and use gunzip on your
44 | system to uncompress in your home dir, if you would
45 | like to emulate the commands below.
46 |
47 | "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
48 |
49 | ```{r domyimport}
50 | library(rtracklayer)
51 | ch = import.chain("~/hg38ToHg19.over.chain")
52 | ch
53 | str(ch[[1]])
54 | ```
55 |
56 | Let's get the addresses for genes on chromosome 1
57 | in hg38.
58 |
59 | ```{r get38}
60 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
61 | tx38 = TxDb.Hsapiens.UCSC.hg38.knownGene
62 | seqlevels(tx38, force=TRUE) = "chr1"
63 | g1_38 = genes(tx38)
64 | ```
65 |
66 | Now execute the liftOver:
67 |
68 | ```{r doli}
69 | g1_19L = liftOver(g1_38, ch)
70 | ```
71 |
72 | The result is a list of GRanges, one for
73 | each translation event.
74 |
75 | ```{r lktx}
76 | g1_19L
77 | ```
78 |
79 | Verification of accuracy of translation is covered in exercises.
80 |
--------------------------------------------------------------------------------
/bioc/annoCheat.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Genomic annotation in Bioconductor: Cheat sheet"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 | # Summarizing the key genome annotation resources in Bioconductor
16 |
17 | ## Executive summary
18 |
19 | ### Organism-oriented annotation
20 |
21 | For biological annotation, generally sequence or gene based, there
22 | are three key types of package
23 |
24 | * Reference sequence packages: BSgenome.[Organism].[Curator].[BuildID]
25 | * Gene model database packages: TxDb.[Organism].[Curator].[BuildID].[Catalog]
26 | * Annotation map package: org.[Organism2let].[Institution].db
27 |
28 | Wherever brackets are used, you must substitute an appropriate token.
29 | You can survey all annotation packages at [the annotation page](http://bioconductor.org/packages/release/BiocViews.html#___AnnotationData).
30 |
31 | Packages Homo.sapiens, Mus.musculus and Rattus.norvegicus are specialized
32 | integrative annotation resources with an evolving interface. We have
33 | illustrated their use in lectures and labs.
34 |
35 | ### Systems biology oriented annotation
36 |
37 | Packages GO.db, KEGG.db, KEGGREST, and reactome.db are primarily
38 | intended as organism-independent resources organizing genes into
39 | groups. However, there are organism-specific mappings between
40 | gene-oriented annotation and these resources, that involve specific
41 | abbreviations and symbol conventions. These are described
42 | when these packages are used.
43 |
44 | ## Names for organisms and their abbreviations
45 |
46 | The standard Linnaean taxonomy is used very generally. So you
47 | need to know that
48 |
49 | * Human = *Homo sapiens*
50 | * Mouse = *Mus musculus*
51 | * Rat = *Rattus norvegicus*
52 | * Yeast = *Saccharomyces cerevisiae*
53 | * Zebrafish = *Danio rerio*
54 | * Cow = *Bos taurus*
55 |
56 | and so on. We use two sorts of abbreviations. For
57 | Biostrings-based packages, the contraction of first
58 | and second names is used
59 |
60 | * Human = Hsapiens
61 | * Mouse = Mmusculus
62 | * Rat = Rnorvegicus
63 | * Yeast = Scerevisiae ...
64 |
65 | For NCBI-based annotation maps, we contract further
66 |
67 | * Human = Hs
68 | * Mouse = Mm
69 | * Rat = Rn
70 | * Yeast = Sc ...
71 |
72 | ## Genomic sequence
73 |
74 | These packages have four-component names that specify the reference build used
75 |
76 | * Human = BSgenome.Hsapiens.UCSC.hg19
77 | * Mouse = BSgenome.Mmusculus.UCSC.mm10
78 | * Rat = BSgenome.Rnorvegicus.UCSC.rn5
79 | * Yeast = BSgenome.Scerevisiae.UCSC.sacCer3
80 |
81 | ## Gene models
82 |
83 | These packages have five-component names that specify the reference build used and
84 | the gene catalog
85 |
86 | * Human = TxDb.Hsapiens.UCSC.hg19.knownGene
87 | * Mouse = TxDb.Mmusculus.UCSC.mm10.knownGene
88 | * Rat = TxDb.Rnorvegicus.UCSC.rn5.knownGene
89 | * Yeast = TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
90 |
91 | ## Annotation maps
92 |
93 | These packages have four-component names, with two components fixed. The
94 | variable components indicate organism and curating institution.
95 |
96 | * Human = org.Hs.eg.db
97 | * Mouse = org.Mm.eg.db
98 | * Rat = org.Rn.eg.db
99 | * Yeast = org.Sc.sgd.db
100 |
101 | ## Additional options
102 |
103 | There are often alternative curating institutions available such as
104 | Ensembl.
105 |
--------------------------------------------------------------------------------
/bioc/eset.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The ExpressionSet container"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | ---
8 |
9 | ```{r options, echo=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
12 | ```
13 |
14 | # Overview
15 |
16 | We'll work with the basic representation of expression experiments
17 | in Bioconductor. An example is in package Biobase.
18 |
19 | ```{r do1}
20 | library(Biobase)
21 | data(sample.ExpressionSet)
22 | sample.ExpressionSet
23 | ```
24 |
25 | We'll abbreviate the name:
26 |
27 | ```{r do2}
28 | samp = sample.ExpressionSet
29 | ```
30 |
31 | # Queries and extractors
32 |
33 | ```{r do3}
34 | dim(samp)
35 | exprs(samp)[1:5,1:6] # extract expression values
36 | pData(samp) # extract sample level data
37 | experimentData(samp)
38 | abstract(samp) # special accessor
39 | ```
40 |
41 | Have a look at the `pmid2MIAME` function in the annotate package to see
42 | how to extract abstracts of papers from PubMed. These can be
43 | bound into ExpressionSets with experimentData().
44 |
45 | # Matrix-like subscripting
46 |
47 | We can use matrix-like syntax directly to restrict the
48 | ExpressionSet, getting back a new ExpressionSet
49 | ```{r doex}
50 | samp[1:4,3:20]
51 | samp[, samp$sex=="Male"]
52 | ```
53 |
54 |
55 |
--------------------------------------------------------------------------------
/bioc/eset_sumexp.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: ExpressionSet and SummarizedExperiment
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ```{r message=FALSE}
12 | library(Biobase)
13 | library(GEOquery)
14 | ```
15 |
16 | ```{r}
17 | geoq <- getGEO("GSE9514") # download a dataset from GEO
18 | names(geoq)
19 | e <- geoq[[1]] # extract ExpressionSet
20 | ```
21 |
22 | ### ExpressionSet
23 |
24 | ```{r}
25 | # exprs gives expression matrix
26 | dim(e) # number of features and samples in ExpressionSet
27 | exprs(e)[1:3,1:3]
28 | dim(exprs(e)) # rows are features, columns are samples
29 |
30 | # pData gives phenotype data (sample information)
31 | pData(e)[1:3,1:6]
32 | dim(pData(e)) # rows of pData correspond to columns of exprs
33 | names(pData(e))
34 | pData(e)$characteristics_ch1
35 |
36 | # fData gives feature data (probe information)
37 | fData(e)[1:3,1:3]
38 | dim(fData(e)) # rows of fData correspond to rows of exprs
39 | names(fData(e))
40 | head(fData(e)$"Gene Symbol")
41 | head(rownames(e))
42 |
43 | # additional annotation tied to ExpressionSet
44 | experimentData(e)
45 | annotation(e)
46 | ```
47 |
48 | ### Summarized Experiment
49 |
50 | ```{r message=FALSE}
51 | library(parathyroidSE)
52 | ```
53 |
54 |
55 | ```{r}
56 | data(parathyroidGenesSE)
57 | se <- parathyroidGenesSE
58 | se
59 | ```
60 |
61 |
62 | ```{r}
63 | # assay contains results of the assay
64 | dim(se)
65 | assay(se)[1:3,1:3]
66 | dim(assay(se)) # rows = features (ranges), columns = samples
67 |
68 | # colData contains sample information
69 | colData(se)[1:3,1:6]
70 | dim(colData(se))
71 | names(colData(se))
72 | colData(se)$treatment
73 |
74 | # rowRanges contains feature information
75 | rowRanges(se)[1]
76 | class(rowRanges(se))
77 | length(rowRanges(se))
78 | head(rownames(se))
79 | metadata(rowRanges(se))
80 |
81 | # additional metadata, including sample information
82 | metadata(se)$MIAME
83 | abstract(metadata(se)$MIAME)
84 | ```
85 |
86 | ## Footnotes
87 |
88 | For more information about the `GenomicRanges` package, check out the PLOS Comp Bio paper, which the authors of GenomicRanges published:
89 |
90 | http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118
91 |
92 | For more information on *SummarizedExperiment*:
93 |
94 | http://www.nature.com/nmeth/journal/v12/n2/abs/nmeth.3252.html
95 |
96 | Also the software vignettes have a lot of details about the functionality. Check out "An Introduction to Genomic Ranges Classes". All of the vignette PDFs are available here:
97 |
98 | http://www.bioconductor.org/packages/release/bioc/html/GenomicRanges.html
99 |
100 |
--------------------------------------------------------------------------------
/bioc/ggbioNote.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "A note on visualization options"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | There are many interesting approaches to visualizing genome-scale data.
12 | Two major packages in Bioconductor are Gviz and ggbio. Both represent
13 | significant efforts at bridging the gap between graphics facilities
14 | and various genomic data structures.
15 |
16 | ggbio's `autoplot` method can be very useful for broad overviews.
17 | For a GRanges instance, each range for which data exists can be
18 | depicted as a band on the chromosome. The karyogram layout
19 | gives a genome-wide view, but it can be important to control
20 | the handling of extra-chromosomal sequence levels.
21 |
22 | ```{r getl,echo=FALSE,results="hide"}
23 | library(ERBS)
24 | library(GenomeInfoDb)
25 | library(ggbio)
26 | ```
27 | ```{r lkd, fig=TRUE}
28 | library(ERBS)
29 | data(HepG2)
30 | library(GenomeInfoDb) # trim all but autosomal chroms
31 | seqlevels(HepG2, pruning.mode="coarse") <- paste0("chr", 1:22) # pruning.mode replaces the retired force=TRUE; "coarse" drops ranges on removed seqlevels
32 | data(GM12878)
33 | seqlevels(GM12878, pruning.mode="coarse") <- paste0("chr", 1:22)
34 | library(ggbio)
35 | autoplot(HepG2, layout="karyogram", main="ESRRA binding on HepG2")
36 | ```
37 |
38 | Notice that the title is not printed, currently a bug.
39 |
40 | ```{r lkm,fig=TRUE}
41 | autoplot(GM12878, layout="karyogram", main="ESRRA binding on GM12878")
42 | ```
43 |
--------------------------------------------------------------------------------
/bioc/importBed.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Importing genomic regions from files"
3 | author: "Mike Love and Rafa"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 | # Introduction
16 |
17 | A common way you will be accessing genomic regions data is through files. The standard used by most software applications is [BED](http://genome.ucsc.edu/FAQ/FAQformat.html#format1). The ENCODE project has created its own format: [NarrowPeak](http://genome.ucsc.edu/FAQ/FAQformat.html#format12). Here we demonstrate the `import` function from the `rtracklayer` package that facilitates the creation of `GRanges` objects from these files.
18 |
19 |
20 | # Finding the files
21 |
22 | Here we use as an example the original files used to create the objects in the `ERBS` library. Once you install this package the files can be found here:
23 |
24 | ```{r}
25 | dir <- file.path( system.file(package="ERBS"), "extdata")
26 | ```
27 |
28 | We included a `sampleInfo` file that includes the names and provenance of the data.
29 |
30 | ```{r}
31 | sampleInfo <- read.table(file.path(dir,"sampleInfo.txt"), stringsAsFactors=FALSE)
32 | sampleInfo
33 | ```
34 |
35 | As an example we will read in just the first file:
36 |
37 | ```{r}
38 | filename <- file.path(dir,sampleInfo[1,1])
39 | ```
40 |
41 | # Import
42 |
43 | To import the files we can now use the `import` function. Note that `import` does not support NarrowPeak files, but it does support BED files and is able to read this file in.
44 |
45 |
46 | ```{r}
47 | library(rtracklayer)
48 | HepG2 <- import(filename, format="bedGraph") # `filename` is the single path built above from sampleInfo[1,1]
49 | HepG2
50 | ```
51 | We do successfully create a `GRanges` object but note the metadata names are missing. We can add these by hand.
52 | ```{r}
53 | names(mcols(HepG2)) <- c("name","score","col","signalValue","pValue","qValue","peak")
54 | ```
55 |
56 | # Adding metadata
57 |
58 | A much more important piece of information that is missing here relates to provenance and genome annotation. Where do the files originally come from? What build of the human genome was used? What chromosomes were considered?
59 |
60 | We highly recommend that you add this information to your object even if it is not included in the file. Here is how we constructed the objects in `ERBS`.
61 |
62 | Add data provenance:
63 | ```{r}
64 | metadata(HepG2) <- list("ENCODE accession: ENCSR000EEW. ESRRA ChIP-seq peaks of HepG2 cell line https://www.encodeproject.org/experiments/ENCSR000EEW/")
65 | metadata(GM12878) <- list("ENCODE accession: ENCSR000DYQ. ESRRA ChIP-seq peaks of GM12878 cell line https://www.encodeproject.org/experiments/ENCSR000DYQ/")
66 | ```
67 |
68 | Next, we can add the genome build that was used:
69 |
70 | ```{r}
71 | # add simple text descriptor for genome
72 | genome(HepG2) <- "hg19"
73 | genome(GM12878) <- "hg19"
74 | ```
75 |
76 | Finally we denote the chromosome annotation that should be used by copying it from one of the `BSgenome` objects.
77 |
78 | We start by checking that they are in fact the same style
79 |
80 | ```{r}
81 | # import chromosomal length information as well (also UCSC)
82 | library(BSgenome.Hsapiens.UCSC.hg19)
83 | seqlevelsStyle(HepG2)
84 | seqlevelsStyle(Hsapiens)
85 | ```
86 |
87 | and that all the chromosome names in `HepG2` are in `Hsapiens`
88 |
89 | ```{r}
90 | seqlevels(HepG2)%in% seqlevels(Hsapiens)
91 | ```
92 |
93 | Once we see that they are, we simply borrow the information from the `Hsapiens` object:
94 | ```{r}
95 | seqinfo(HepG2) <- seqinfo(Hsapiens)
96 | ```
--------------------------------------------------------------------------------
/bioc/inference_with_bioc.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Inference with bioc
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 |
12 | # Introduction
13 |
14 | In this section we will cover inference in the context of genomics experiments. We apply some of the concepts we have covered in previous sections including t-tests, multiple comparisons and standard deviation estimates from hierarchical models.
15 |
16 | We start by loading the pooling experiment data
17 |
18 |
19 | ```{r,message=FALSE}
20 | library(Biobase)
21 | library(maPooling)
22 | data(maPooling)
23 | pd=pData(maPooling)
24 | individuals=which(rowSums(pd)==1)
25 | ```
26 |
27 | And extracting the individual mice as well as their strain
28 |
29 | ```{r}
30 | individuals=which(rowSums(pd)==1)
31 | individuals=individuals[-grep("tr",names(individuals))]
32 | y=exprs(maPooling)[,individuals]
33 | g=factor(as.numeric(grepl("b",names(individuals))))
34 | ```
35 |
36 |
37 | # T-test
38 |
39 | We can now apply a t-test to each gene using the `rowttests` function in the `genefilter` package
40 |
41 | ```{r}
42 | library(genefilter)
43 | tt=rowttests(y,g)
44 | ```
45 |
46 | Now which genes do we report as statistically significant? For somewhat arbitrary reasons, in science p-values of 0.01 and 0.05 are used as cutoffs. In this particular example we get
47 |
48 | ```{r}
49 | sum(tt$p.value<0.01)
50 | sum(tt$p.value<0.05)
51 | ```
52 |
53 |
54 | # Multiple testing
55 | We described multiple testing in detail in course 3. Here we provide a quick summary.
56 |
57 | Do we report all these genes? Let's explore what happens if we split the first group into two, forcing the null hypothesis to be true
58 |
59 | ```{r}
60 | set.seed(0)
61 | shuffledIndex <- factor(sample(c(0,1),sum(g==0),replace=TRUE ))
62 | nulltt <- rowttests(y[,g==0],shuffledIndex)
63 | sum(nulltt$p.value<0.01)
64 | sum(nulltt$p.value<0.05)
65 | ```
66 |
67 | If we use the 0.05 cutoff we will be reporting 840 false positives. We have described several ways to adjust for this, including the `qvalue` method available in the `qvalue` package. After this adjustment we include a smaller list of genes.
68 |
69 | ```{r}
70 | library(qvalue)
71 | qvals = qvalue(tt$p.value)$qvalue
72 | sum(qvals<0.05)
73 | sum(qvals<0.01)
74 | ```
75 |
76 | And now the null case generates fewer false positives:
77 |
78 | ```{r}
79 | library(qvalue)
80 | nullqvals = qvalue(nulltt$p.value)$qvalue
81 | sum(nullqvals<0.05)
82 | sum(nullqvals<0.01)
83 | ```
84 |
85 |
--------------------------------------------------------------------------------
/bioc/moreGR.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "GRanges operations related to gene model, TSS, and promoter region identification"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 |
16 |
17 | ```{r setup,echo=FALSE,results="hide"}
18 | suppressPackageStartupMessages({
19 | library(BSgenome.Hsapiens.UCSC.hg19)
20 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
21 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
22 | library(Biostrings)
23 | library(GenomicRanges)
24 | library(IRanges)
25 | library(ph525x)
26 | library(Homo.sapiens)
27 | library(Gviz)
28 | })
29 | ```
30 | # Overview
31 |
32 | In this document we work with a small set of ranges and
33 | illustrate basic inter-range operations reduce, disjoin, gaps.
34 | We then add strand and seqname information and show how
35 | resize and flank are useful for identifying TSS and promoter regions.
36 |
37 | ## A simple set of ranges
38 |
39 | ```{r newr}
40 | ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40),
41 | width = c(12, 6, 6, 15, 6, 2, 7))
42 | ```
43 |
44 | ```{r plotr,echo=FALSE}
45 | plotRanges <- function(x, xlim = x, main = deparse(substitute(x)),
46 | col = "black", sep = 0.5, ...)
47 | { # one unit-height bar per range of `x`; ranges sharing a row never overlap
48 | bar_h <- 1; row_step <- bar_h + sep # vertical distance between consecutive rows
49 | if (is(xlim, "Ranges")) # a ranges object given as xlim: span smallest start to largest end
50 | xlim <- c(min(start(xlim)), max(end(xlim)))
51 | rows <- disjointBins(IRanges(start(x), end(x) + 1)) # widening by 1 presumably keeps abutting ranges in different rows
52 | plot.new()
53 | plot.window(xlim, c(0, max(rows) * row_step))
54 | y_lo <- rows * row_step - bar_h # bottom edge of each bar
55 | rect(start(x) - 0.5, y_lo, end(x) + 0.5, y_lo + bar_h, col = col, ...)
56 | title(main)
57 | axis(1)
58 | }
59 |
60 | plotGRanges = function (x, xlim = x, col = "black", sep = 0.5, xlimits = c(0,
61 | 60), ...)
62 | { # plot the ranges of `x` as stacked bars; the x axis always spans `xlimits`
63 | main = deparse(substitute(x)) # title is the caller's expression for x
64 | ch = as.character(seqnames(x)[1]) # seqname of the first range, used as the x-axis label
65 | x = ranges(x) # from here on x is ranges(x); a defaulted xlim (lazy `xlim = x`) would see this value
66 | height <- 1
67 | if (is(xlim, "Ranges")) # NOTE(review): xlim is recomputed here but never used below; plot.window uses xlimits
68 | xlim <- c(min(start(xlim)), max(end(xlim)))
69 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) # row index per range, computed on ranges widened by 1
70 | plot.new()
71 | plot.window(xlim = xlimits, c(0, max(bins) * (height + sep)))
72 | ybottom <- bins * (sep + height) - height # bottom edge of each bar
73 | rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height,
74 | col = col, ...)
75 | title(main, xlab = ch)
76 | axis(1)
77 | }
78 | ```
79 |
80 | Let's visualize `ir` and several inter-range operations.
81 | ```{r lkir,fig=TRUE, out.height="1100px"}
82 | par(mfrow=c(4,1), mar=c(4,2,2,2))
83 | plotRanges(ir, xlim=c(0,60))
84 | plotRanges(reduce(ir), xlim=c(0,60))
85 | plotRanges(disjoin(ir), xlim=c(0,60))
86 | plotRanges(gaps(ir), xlim=c(0,60))
87 | ```
88 |
89 | reduce(x) produces a set of
90 | nonoverlapping ranges that cover all positions covered by x.
91 | This can be used to reduce complexity of a gene model
92 | with many transcripts, where we may just want the addresses
93 | of intervals known to be transcribed, regardless of transcript
94 | of residence.
95 |
96 | disjoin(x) produces a set of ranges that cover all positions
97 | covered by x, such that none of the ranges in the
98 | disjoin output overlaps any end points of intervals in x.
99 | This gives us the largest possible collection of contiguous
100 | intervals that are separated wherever the original set
101 | of intervals had an endpoint.
102 |
103 | gaps(x) produces a set of ranges covering the positions
104 | in [start(x), end(x)] that are not covered by any range in x.
105 | Given coding sequence addresses and exon intervals, this can
106 | be used to enumerate introns.
107 |
108 | # Extension to GRanges
109 |
110 | We add chromosome and strand information.
111 |
112 | ```{r dogr}
113 | library(GenomicRanges)
114 | gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3)))
115 | ```
116 |
117 | Let's assume the intervals represent genes.
118 | The following plots illustrate the identification of
119 | transcription start sites (green), upstream promoter
120 | regions (purple), downstream promoter regions (brown).
121 |
122 | ```{r dopr,fig=TRUE, out.height="1100px", out.width="1100px"}
123 | par(mfrow=c(4,1), mar=c(4,2,2,2))
124 | plotGRanges(gir, xlim=c(0,60))
125 | plotGRanges(resize(gir,1), xlim=c(0,60), col="green") # width-1 ranges, drawn in green to match the text above
126 | plotGRanges(flank(gir,3), xlim=c(0,60), col="purple") # flank of width 3 (upstream promoter in the text)
127 | plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown") # flank of width 2 at the other end
128 | ```
129 |
130 | Note that we do not need to take special steps to
131 | deal with the differences in strand.
132 |
--------------------------------------------------------------------------------
/bioc/operateGRanges.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "GRanges operations related to gene model, TSS, and promoter region identification"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 |
16 |
17 | ```{r setup,echo=FALSE,results="hide"}
18 | suppressPackageStartupMessages({
19 | library(BSgenome.Hsapiens.UCSC.hg19)
20 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
21 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
22 | library(Biostrings)
23 | library(GenomicRanges)
24 | library(IRanges)
25 | library(ph525x)
26 | library(Homo.sapiens)
27 | library(Gviz)
28 | })
29 | ```
30 | # Overview
31 |
32 | In this document we work with a small set of ranges and
33 | illustrate basic inter-range operations reduce, disjoin, gaps.
34 | We then add strand and seqname information and show how
35 | resize and flank are useful for identifying TSS and promoter regions.
36 |
37 | ## A simple set of ranges
38 |
39 | ```{r newr}
40 | ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40),
41 | width = c(12, 6, 6, 15, 6, 2, 7))
42 | ```
43 |
44 | ```{r plotr,echo=FALSE}
45 | plotRanges <- function(x, xlim = x, main = deparse(substitute(x)),
46 | col = "black", sep = 0.5, ...)
47 | { # one unit-height bar per range of `x`; ranges sharing a row never overlap
48 | bar_h <- 1; row_step <- bar_h + sep # vertical distance between consecutive rows
49 | if (is(xlim, "Ranges")) # a ranges object given as xlim: span smallest start to largest end
50 | xlim <- c(min(start(xlim)), max(end(xlim)))
51 | rows <- disjointBins(IRanges(start(x), end(x) + 1)) # widening by 1 presumably keeps abutting ranges in different rows
52 | plot.new()
53 | plot.window(xlim, c(0, max(rows) * row_step))
54 | y_lo <- rows * row_step - bar_h # bottom edge of each bar
55 | rect(start(x) - 0.5, y_lo, end(x) + 0.5, y_lo + bar_h, col = col, ...)
56 | title(main)
57 | axis(1)
58 | }
59 |
60 | plotGRanges = function (x, xlim = x, col = "black", sep = 0.5, xlimits = c(0,
61 | 60), ...)
62 | { # plot the ranges of `x` as stacked bars; the x axis always spans `xlimits`
63 | main = deparse(substitute(x)) # title is the caller's expression for x
64 | ch = as.character(seqnames(x)[1]) # seqname of the first range, used as the x-axis label
65 | x = ranges(x) # from here on x is ranges(x); a defaulted xlim (lazy `xlim = x`) would see this value
66 | height <- 1
67 | if (is(xlim, "Ranges")) # NOTE(review): xlim is recomputed here but never used below; plot.window uses xlimits
68 | xlim <- c(min(start(xlim)), max(end(xlim)))
69 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) # row index per range, computed on ranges widened by 1
70 | plot.new()
71 | plot.window(xlim = xlimits, c(0, max(bins) * (height + sep)))
72 | ybottom <- bins * (sep + height) - height # bottom edge of each bar
73 | rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height,
74 | col = col, ...)
75 | title(main, xlab = ch)
76 | axis(1)
77 | }
78 | ```
79 |
80 | Let's visualize `ir` and several inter-range operations.
81 | ```{r lkir,fig=TRUE, out.height="1100px"}
82 | par(mfrow=c(4,1), mar=c(4,2,2,2))
83 | plotRanges(ir, xlim=c(0,60))
84 | plotRanges(reduce(ir), xlim=c(0,60))
85 | plotRanges(disjoin(ir), xlim=c(0,60))
86 | plotRanges(gaps(ir), xlim=c(0,60))
87 | ```
88 |
89 | reduce(x) produces a set of
90 | nonoverlapping ranges that cover all positions covered by x.
91 | This can be used to reduce complexity of a gene model
92 | with many transcripts, where we may just want the addresses
93 | of intervals known to be transcribed, regardless of transcript
94 | of residence.
95 |
96 | disjoin(x) produces a set of ranges that cover all positions
97 | covered by x, such that none of the ranges in the
98 | disjoin output overlaps any end points of intervals in x.
99 | This gives us the largest possible collection of contiguous
100 | intervals that are separated wherever the original set
101 | of intervals had an endpoint.
102 |
103 | gaps(x) produces a set of ranges covering the positions
104 | in [start(x), end(x)] that are not covered by any range in x.
105 | Given coding sequence addresses and exon intervals, this can
106 | be used to enumerate introns.
107 |
108 | # Extension to GRanges
109 |
110 | We add chromosome and strand information.
111 |
112 | ```{r dogr}
113 | library(GenomicRanges)
114 | gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3)))
115 | ```
116 |
117 | Let's assume the intervals represent genes.
118 | The following plots illustrate the identification of
119 | transcription start sites (green), upstream promoter
120 | regions (purple), downstream promoter regions (brown).
121 |
122 | ```{r dopr,fig=TRUE, out.height="1100px", out.width="1100px"}
123 | par(mfrow=c(4,1), mar=c(4,2,2,2))
124 | plotGRanges(gir, xlim=c(0,60))
125 | plotGRanges(resize(gir,1), xlim=c(0,60),col="green")
126 | plotGRanges(flank(gir,3), xlim=c(0,60), col="purple")
127 | plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown")
128 | ```
129 |
130 | Note that we do not need to take special steps to
131 | deal with the differences in strand.
132 |
--------------------------------------------------------------------------------
/bioc/reading_microarray_data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Reading in microarray data
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Affymetrix CEL files
12 |
13 | We start by reading in the sample information table. This is usually created by the person who performed the experiment.
14 |
15 | The raw data files for this lab are in the `rawdata` repository, available here:
16 |
17 |
18 |
19 | Click Download ZIP in order to download all the files, then unzip this file, which should result in a `rawdata-master` folder. Make sure this folder is in your current working directory.
20 |
21 | First we save the initial working directory, so we can return to it.
22 |
23 | ```{r}
24 | wd <- getwd()
25 | ```
26 |
27 | Now we can start reading in the files:
28 |
29 | ```{r}
30 | datadir <- paste0(wd, "/rawdata-master")
31 | basedir <- paste0(datadir, "/celfiles")
32 | setwd(basedir)
33 | library(affy)
34 | tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE)
35 | rownames(tab) <- tab$filenames
36 | tab
37 | fns <- list.celfiles(basedir)
38 | fns
39 | fns %in% tab[,1] ##check
40 | ab <- ReadAffy(phenoData=tab)
41 | ```
42 |
43 | This creates an AffyBatch object which contains the information you need. (These commands may download some annotation packages to interpret the arrays.)
44 |
45 | ```{r}
46 | dim(pm(ab))
47 | dim(pData(ab))
48 | annotation(ab)
49 | ```
50 |
51 | You can then preprocess this object with RMA:
52 | ```{r}
53 | e <- rma(ab)
54 | ```
55 |
56 | Now we go back to the previous working directory.
57 |
58 | ```{r}
59 | setwd(wd)
60 | ```
61 |
62 | If you are not interested in probe level data you can use this function:
63 |
64 | ```{r}
65 | setwd(basedir)
66 | ejust <- justRMA(filenames=tab[,1],phenoData=tab)
67 | dim(ejust)
68 | ```
69 |
70 |
71 | ## Agilent data
72 |
73 | ```{r}
74 | library(limma)
75 | library(rafalib)
76 | basedir <- paste0(datadir, "/agilent")
77 | setwd(basedir)
78 | targets <- readTargets("TargetBeta7.txt")
79 | RG <- read.maimages(targets$FileName, source="genepix")
80 | MA <- MA.RG(RG,bc.method="none")
81 | mypar(1,1)
82 | imageplot(MA$M[,2], RG$printer, zlim=c(-3,3))
83 | dev.off()
84 | ```
85 |
86 |
87 | Now we go back to the previous working directory.
88 |
89 | ```{r}
90 | setwd(wd)
91 | ```
92 |
93 |
94 |
95 | ## oligo
96 | We can also use oligo to read affy arrays
97 |
98 | ```{r}
99 | detach("package:affy")
100 | library(oligo)
101 | basedir <- paste0(datadir,"/celfiles")
102 | setwd(basedir)
103 | tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE)
104 | fns <- list.celfiles(listGzipped=TRUE)
105 | fns %in% tab[,1] ##check
106 | pd <- as(tab, "AnnotatedDataFrame")
107 | efs <- read.celfiles(filenames=tab[,1],phenoData=pd,sampleNames=sampleNames(pd))
108 | ```
109 |
110 | ```{r}
111 | e <- rma(efs)
112 | ```
113 |
--------------------------------------------------------------------------------
/bioc/seq4motif.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Genomic sequence -- utility for motif checking"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 | ```{r setup,echo=FALSE,results="hide",message=FALSE}
16 | library(GenomicFeatures)
17 | library(GenomicRanges)
18 | library(IRanges)
19 | library(devtools)
20 | library(ERBS)
21 | library(Homo.sapiens)
22 | ```
23 |
24 | # Overview
25 |
26 | In this document we'll show how to look for occurrences
27 | of a binding motif in genomic sequence underlying binding peaks.
28 |
29 | Recall that we have the ER binding peaks for two cell
30 | lines in the ERBS package. We'll focus on HepG2
31 |
32 | ```{r pkgs}
33 | library(ERBS)
34 | data(HepG2)
35 | HepG2
36 | ```
37 |
38 | We'd like to look at the genomic sequence underneath the peaks
39 | and inspect it for the binding motif "TCAAGGTCA". This is
40 | easy to do with the Biostrings and BSGenome infrastructure.
41 |
42 | # Reference genomic sequence for humans
43 |
44 | We'll work with hg19. The BSgenome... package will
45 | create variable `Hsapiens` on attachment.
46 | This variable gives a metadata report.
47 |
48 | ```{r gethg}
49 | library(BSgenome.Hsapiens.UCSC.hg19)
50 | Hsapiens
51 | ```
52 |
53 | The reference sequence for a chromosome can be obtained
54 | with the $ operator.
55 |
56 | ```{r getch}
57 | Hsapiens$chr17
58 | ```
59 |
60 | # Targeted retrieval of reference sequence
61 |
62 | The getSeq function obtains sequence
63 | corresponding to addresses listed in GRanges.
64 | We'll obtain the sequence under the peaks as
65 | `hepseq`, and a set of control sequences of similar
66 | lengths obtained by shifting the binding peak intervals
67 | by 2500 bases and obtaining the reference sequence in the
68 | shifted intervals.
69 |
70 | ```{r getsq}
71 | hepseq = getSeq(Hsapiens, HepG2)
72 | rhepseq = getSeq(Hsapiens, shift(HepG2,2500))
73 | hepseq
74 | ```
75 |
76 | # Counting motif occurrences
77 |
78 | We count the occurrences of the ESRRA
79 | binding motif
80 | "TCAAGGTCA" in the bound intervals (and their reverse complement
81 | representation). This is compared to the frequency of occurrence in the
82 | control sequences. We'll use the `vcountPattern` function of the
83 | Biostrings package to carry this out.
84 |
85 | ```{r lk1}
86 | sum(vcountPattern("TCAAGGTCA", hepseq))+sum(vcountPattern("TCAAGGTCA",
87 | reverseComplement(hepseq)))
88 | sum(vcountPattern("TCAAGGTCA", rhepseq))+sum(vcountPattern("TCAAGGTCA",
89 | reverseComplement(rhepseq)))
90 | ```
91 |
92 | We see a 9-fold increase in occupancy in the bound regions compared
93 | to the shifted regions. This is not the way one assesses motif occurrences.
94 | First, the motif is generally represented as a model and not a string.
95 | The model is typically expressed as a position weight matrix (PWM).
96 | Second, the most common software tools for evaluating motif enrichment are
97 | MEME and FIMO; the matchPWM function of the Biostrings
98 | package can also perform similar analyses.
99 |
--------------------------------------------------------------------------------
/bioc/storage/EDA_plots_for_microarray.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Exploratory Data Analysis for microarray
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | # EDA for Microarray data
12 | Here we are analyzing microarray data from eight samples: two groups of four. A first step in any analysis of genomics data is to learn its general properties and search for problematic samples. By viewing the data from the first sample we immediately notice that over 90% of data is below 1,000 and the remaining 10% spans values up to 40,000. By taking the log we get a better picture of the distribution. We use base 2 because memorizing the powers of 2 is easy. It gives us a friendly range: 4-16.
13 |
14 | ```{r,fig.width=6, fig.height=3}
15 | # BiocManager::install("genomicsclass/SpikeInEDA")
16 | library(SpikeInEDA)
17 | data(SpikeInEDA)
18 | par(mfrow=c(1,2))
19 | hist(int[,1])
20 | hist(log2(int[,1]))
21 | ```
22 |
23 | Next we look at all eight histograms simultaneously. To facilitate this we introduce the _density estimator_ or _smooth histogram_. Basically we create a histogram, draw a smooth curve through the top of the bars, and keep that curve. This permits us to put several histograms on the same page:
24 |
25 | ```{r,fig.width=3, fig.height=3}
26 | par(mfrow=c(1,1))
27 | for(i in seq_len(ncol(int))) # seq_len() yields an empty loop, not c(1,0), if `int` has no columns
28 | if(i==1) plot(density(log2(int[,i])),col=(i==4)+1) else lines(density(log2(int[,i])),col=(i==4)+1) # first array opens the plot, the rest are overlaid; array 4 gets col 2, all others col 1
29 | ```
30 | Note that one histogram (we highlighted it by making it red) looks different: it has a different shape from the rest. So is this sample different from the rest in any significant way? If we compute the correlations between this sample and the rest, they are not very different from the others and are all very high.
31 | ```{r}
32 | signif(cor(int),2)
33 | ```
34 | The problem is not immediately obvious from a scatter plot.
35 | ```{r,fig.width=6, fig.height=3}
36 | ##we don't need to show all the points so we take samples
37 | library(rafalib)
38 | splot <- function(x, y, ...) { # scatter plot of a random subset of the (x, y) pairs; ... is forwarded to plot()
39 | ind <- sample(length(x), min(length(x), 10000)) # at most 10,000 points; all of them when fewer exist (previously an error)
40 | x <- x[ind]; y <- y[ind]
41 | plot(x, y, ...)
42 | }
43 | mypar(1,2)
44 | splot(log2(int[,1]),log2(int[,2]))
45 | splot(log2(int[,1]),log2(int[,4]))
46 | ```
47 | Note that samples 1 through 4 are replicates and should produce the same values up to measurement error. Scatterplots and correlation are not the best tools to detect problems. Note for example that 1,2,3,4 and 100,200,300,400, two lists with very different values, have perfect correlation. A better measure is the differences between the values and therefore a better plot is a rotation of the scatter plot containing the differences (log ratios) on the y-axis and the averages (in the log scale) on the x-axis. This plot is referred to as an MA-plot.
48 | ```{r,fig.width=6, fig.height=3}
49 | maplot <- function(x, y, ...) { avg <- (x + y) / 2; splot(avg, y - x, ...) } # MA-plot via splot: averages on the x-axis, differences y - x on the y-axis
50 | mypar(1,3)
51 | maplot(log2(int[,1]),log2(int[,2]),xlab="A",ylab="M",ylim=c(-2,2))
52 | maplot(log2(int[,1]),log2(int[,3]),xlab="A",ylab="M",ylim=c(-2,2))
53 | maplot(log2(int[,1]),log2(int[,4]),xlab="A",ylab="M",ylim=c(-2,2))
54 | ```
55 | Now the problem is obvious. It turns out this sample comes from an array for which a spatial problem can be detected at the original image level. We actually have the grid locations for these measurements and can recreate the image.
56 |
57 | ```{r, fig.width=6, fig.height=3}
58 | ##we are doing this for two arrays: 1 and 4
59 | library(matrixStats) ##need rowMedians
60 | library(RColorBrewer)
61 | for(i in c(1,4)){
62 | r=log2(int[,i])-rowMedians(log2(int))
63 | ## r holds the residuals of array i from the median array
64 | ## to avoid outliers taking over the colors of the image,
65 | ## define a MAX and clip the residuals to [-MAX, MAX]
66 | MAX<-1
67 | r[r>MAX]<-MAX
68 | r[r< -MAX] <- -MAX
69 | ##we know that every other column is skipped, so column c is stored at (c+1)/2
70 | mat=matrix(NA,max(locations[,1]),max(locations[,2]+1)/2)
71 | for(j in 1:nrow(locations)){
72 | mat[locations[j,1],(locations[j,2]+1)/2]<-r[j]
73 | }
74 | image(mat,col=brewer.pal(11,"RdBu"))
75 | }
76 | ```
77 |
78 | On the second image we can clearly see the spatial pattern (blue are positive residuals, red are negative)
79 |
--------------------------------------------------------------------------------
/bioc/storage/GEOquery.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Downloading data from GEO using GEOquery
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Example of how to download CEL files from GEO
12 |
13 | ## contributed by Stephanie Hicks
14 |
15 | If the `GEOquery` R/Bioconductor package is not installed, use `biocLite()` to install the package:
16 | ```{r, eval=FALSE}
17 | source("http://bioconductor.org/biocLite.R")
18 | biocLite("GEOquery")
19 | ```
20 |
21 | Load the `GEOquery` R/Bioconductor package:
22 | ```{r, message=FALSE}
23 | library(GEOquery)
24 | ```
25 |
26 |
27 | ### Access the GEO Series Data
28 | To access the GEO Sample (GSM), GEO Series (GSE) (lists of GSM files that together form a single experiment) or GEO Dataset (GDS), use the function `getGEO()` which returns a list of ExpressionSets:
29 | ```{r, message=FALSE, eval=FALSE}
30 | ### This will download a 20 Mb file
31 | gse <- getGEO("GSE21653", GSEMatrix=TRUE)
32 | show(gse)
33 | ```
34 |
35 |
36 | ### Accessing raw data from GEO
37 | If raw data such as .CEL files exist on GEO, you can easily access this data using the `getGEOSuppFiles()` function. The function takes in a GEO accession as the argument and will download all the raw data associated with that accession. By default the `getGEOSuppFiles()` function will create a directory within the current working directory to store the raw data. Here, the file paths of the downloaded files (often with a .tar extension) are stored in a data frame called `filePaths`.
38 |
39 | ```{r,eval=FALSE}
40 | filePaths = getGEOSuppFiles('GSE21653')
41 | filePaths
42 | ```
43 | From here you can use, for example, `ReadAffy()` to read in the CEL files.
44 |
45 |
46 | ### Access GSE Data Tables from GEO
47 | To access the phenotypic information about the samples, the best way is to use the `getGEO()` function to obtain the GSE object and then extract the phenoData object from that. Unfortunately this means downloading the entire GSE Matrix file.
48 |
49 | ```{r,eval=FALSE}
50 | dim(pData(gse[[1]]))
51 | head(pData(gse[[1]])[,1:3])
52 | ```
53 |
54 | Sometimes GSEs include separate data tables with the sample information. If these exist, you can use the `getGSEDataTables()` function. For example, here is the phenoData object from a different GSE accession, GSE3494, with a Data Table.
55 | ```{r}
56 | df1 <- getGSEDataTables("GSE3494")
57 | lapply(df1,head)
58 | ```
59 |
--------------------------------------------------------------------------------
/bioc/storage/anno1refbuilds.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Chromosomes and their substructures 1: Reference genomes"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 | ```{r setup,echo=FALSE,results="hide"}
16 | suppressPackageStartupMessages({
17 | library(BSgenome.Hsapiens.NCBI.GRCh38)
18 | library(Biostrings)
19 | library(GenomicRanges)
20 | library(IRanges)
21 | })
22 | ```
23 |
24 | # Genomic sequence, reference builds
25 |
26 | ## Human
27 |
28 | The genomic sequence for humans has recently
29 | been revised. We can use the most recent major
30 | revision as follows:
31 |
32 | ```{r hg38}
33 | library(BSgenome.Hsapiens.NCBI.GRCh38)
34 | Hsapiens
35 | h38 = Hsapiens # for later
36 | ```
37 |
38 | Notice the number of sequences reported, and their names. We can
39 | get the sequence for a chromosome by using list-like
40 | syntax with `Hsapiens`.
41 |
42 | ```{r lkc22}
43 | h38$"22"
44 | ```
45 |
46 | This shows that the starting and ending bases are indeterminate.
47 | We can obtain the overall nucleotide frequencies as
48 |
49 | ```{r lkf}
50 | alphabetFrequency(Hsapiens$"22")
51 | ```
52 |
53 | A great deal of reference data in use are annotated to
54 | build hg19 (also known as GRCh37).
55 |
56 | ```{r lk19}
57 | library(BSgenome.Hsapiens.UCSC.hg19)
58 | Hsapiens
59 | h19 = Hsapiens
60 | ```
61 |
62 | Note that there is a different sequence naming convention
63 | and a different number of sequences managed in this build.
64 |
65 |
66 | ## Other organisms
67 |
68 | If you have an internet connection, the `available.genomes` function
69 | will list packages that contain reference sequences.
70 |
71 | ```{r lkav, eval=FALSE}
72 | available.genomes()
73 | ```
74 |
75 | For organisms not covered at present by the project, tools
76 | for building compatible packages are available in the
77 | BSgenome package (see the BSgenomeForge vignette).
78 |
79 |
80 |
--------------------------------------------------------------------------------
/bioc/storage/anno2Biostrings.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Chromosomes and their substructures 2: Biostrings"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 | ```{r setup,echo=FALSE,results="hide"}
16 | suppressPackageStartupMessages({
17 | library(BSgenome.Hsapiens.NCBI.GRCh38)
18 | library(Biostrings)
19 | library(GenomicRanges)
20 | library(IRanges)
21 | })
22 | ```
23 |
24 |
25 | # Biostrings: basic infrastructure for computing on sequences
26 |
27 | ## Construction, sets, restricted alphabets
28 |
29 | Very large strings like chromosome sequences receive
30 | special handling in Bioconductor. We use a general container
31 | class called `BString` for "big" strings that are
32 | distinguished from R character vectors in that BStrings a) obey
33 | different rules for copying and b) do not contain multiple
34 | strings (see the man page for BString). Classes `DNAString`
35 | and `AAString` have restrictions on the characters that can be
36 | managed in instances.
37 |
38 | ```{r lkbs}
39 | library(Biostrings)
40 | bdemo = BString("BCDEF")
41 | ddemo = try(DNAString("BCDEF"))
42 | cat(ddemo)
43 | ademo = try(AAString("BCDEF"))
44 | ```
45 |
46 | Efficient management of multiple strings employs classes with
47 | "Set" as suffix.
48 | ```{r lkds}
49 | ddem2 = DNAStringSet(c("ACTG", "GTCAG"))
50 | ddem2
51 | ```
52 |
53 | The restrictions on contents of genomic strings are defined
54 | in constant vectors in `Biostrings`. For example
55 | ```{r lkcon}
56 | AA_ALPHABET
57 | IUPAC_CODE_MAP
58 | ```
59 |
60 | ## Operations
61 |
62 | There are over 200 functions defined in the Biostrings package,
63 | all devoted to computation on sequence data. Here's an
64 | example illustrating basic notions.
65 |
66 | ```{r doop}
67 | D = DNAString("ACTGACGTACGTAGGCTAGCGATCGATATACGATATACG")
68 | translate(D)
69 | codons(D)
70 | ```
71 |
72 | Notice that the output of codons is printed as a `Views` instance.
73 | This is a very efficient approach to creating references to
74 | subsequences of a sequence, without copying any data.
75 |
--------------------------------------------------------------------------------
/bioc/storage/anno4liftover.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Chromosomes and their substructures 4: Translating addresses between genome builds"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 |
16 | ```{r setup,echo=FALSE,results="hide"}
17 | suppressPackageStartupMessages({
18 | library(BSgenome.Hsapiens.UCSC.hg19)
19 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
20 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
21 | library(Biostrings)
22 | library(GenomicRanges)
23 | library(IRanges)
24 | library(ph525x)
25 | library(Homo.sapiens)
26 | library(rtracklayer)
27 | })
28 | ```
29 |
30 | # Translating addresses between genome builds: liftOver
31 |
32 | The rtracklayer package includes an interface to the
33 | liftOver utilities developed for the UCSC genome browser.
34 | The idea is that a collection of local alignments
35 | can be defined and used to remap coordinates from
36 | one reference build to another.
37 |
38 | We can illustrate this with gene addresses created for hg38,
39 | the current reference build. We want to translate them
40 | for comparison to addresses asserted for hg19.
41 |
42 | We need a "chain file", uncompressed. You can
43 | get it from the following URL, and use gunzip on your
44 | system to uncompress in your home dir, if you would
45 | like to emulate the commands below.
46 |
47 | "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
48 |
49 | ```{r domyimport}
50 | library(rtracklayer)
51 | ch = import.chain("~/hg38ToHg19.over.chain")
52 | ch
53 | str(ch[[1]])
54 | ```
55 |
56 | Let's get the addresses for genes on chromosome 1
57 | in hg38.
58 |
59 | ```{r get38}
60 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
61 | tx38 = TxDb.Hsapiens.UCSC.hg38.knownGene
62 | seqlevels(tx38, force=TRUE) = "chr1"
63 | g1_38 = genes(tx38)
64 | ```
65 |
66 | Now execute the liftOver:
67 |
68 | ```{r doli}
69 | g1_19L = liftOver(g1_38, ch)
70 | ```
71 |
72 | The result is a list of GRanges, one for
73 | each translation event.
74 |
75 | ```{r lktx}
76 | g1_19L
77 | ```
78 |
79 | Verification of accuracy of translation is covered in exercises.
80 |
--------------------------------------------------------------------------------
/bioc/storage/annoPhen.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Annotating phenotypes and molecular function"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | ---
8 |
9 | ```{r options, echo=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
12 | ```
13 |
14 | # The phenotype concept
15 |
16 |
17 | - "Phenotype" is an extremely broad term
18 | - In this course, it connotes low-dimensional representation of observable characteristics of an organism
19 | - Representation can be numerical or categorical
20 | * units of measurement should be recorded
21 | * "codes" for categorical items should be clear
22 |
23 | ## Example: ExperimentalData package on COPD
24 |
25 | We cross-tabulate gender and disease status for individuals in a study of chronic obstructive pulmonary disease
26 |
27 | ```{r dodim}
28 | library(COPDSexualDimorphism.data)
29 | data(lgrc.expr.meta)
30 | with(expr.meta, table(gender, diagmaj))
31 | ```
32 | ## Continuous by categorical
33 |
34 | Here's a boxplot of pack-years distributions, stratified by
35 | gender and disease status. The stratum labels become clumsy.
36 |
37 | ```{r lkbx, fig.width=8,fig.height=4.5,dpi=300,out.width="1920px",out.height="1080px"}
38 | gd = with(expr.meta, factor(paste(gender,diagmaj))) # one gender/diagnosis stratum label per row
39 | expr.meta$gd = gd
40 | library(ggplot2)
41 | ggplot(expr.meta, aes(x=gd, y=pkyrs)) + geom_boxplot() # pack-years by stratum
42 | #plot(pkyrs~gd, data=expr.meta)
43 | ```
44 | ## Phenotype carefully and record faithfully
45 |
46 | - Validated questionnaires and protocols
47 | - Standardized terminology, units
48 | - Precise phenotypic characterization fosters more accurate mechanistic modeling
49 | - Caveat: molecular "basis" suggests causal directionality, but phenotype and environment can influence molecular state
50 |
51 | # Computing tools for inference on molecular mechanisms
52 |
53 | - "Molecular basis" is likewise a broad notion
54 | - Systematic terminologies exist to help clarify what is asserted in a given hypothesis or finding
55 | - At the boundaries of scientific knowledge, disagreement is common and terminologies diverge
56 | - Two examples:
57 | * What is a gene?
58 | * What is a gene's function?
59 |
60 | # Gene: A concrete computational definition
61 |
62 | - ORMDL3 is a gene implicated in genome-wide association
63 | studies as a factor in risk of asthma
64 | - Here's a view of its "structure" according to human reference build hg19 (use ph525x::modPlot)
65 | ```{r domo,fig.height=4,fig.width=7}
66 | library(ph525x)
67 | modPlot("ORMDL3", collapse=FALSE, useGeneSym=FALSE)
68 | ```
69 | * This will change with new reference build GRCh38
70 |
71 | # Characterizing ORMDL3 functionality
72 |
73 | ```{r dohum}
74 | library(Homo.sapiens)
75 | orfunc = select(Homo.sapiens, keys="ORMDL3", keytype="SYMBOL", # argument spelled out: keys, not key
76 | columns=c("GO", "TERM"))
77 | orfunc[,c("ONTOLOGY", "TERM")] # show only the ontology and term columns
78 | ```
79 | - Gene Ontology standardizes terminology for biological processes, cellular components, and molecular functions
80 |
81 | # Summary
82 |
83 | * Phenotype characterization is challenging and frequently non-standard
84 | * Tokens available for data analysis in R are fairly simple and are used in ad hoc ways to characterize sample phenotype and condition
85 | * Reasoning about molecular processes underlying phenotype and disease states is intrinsically complex
86 | * Standardized vocabularies and models exist and are available in Bioconductor, but limitations must be admitted
87 |
88 |
--------------------------------------------------------------------------------
/bioc/storage/chromComp.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Computing with chromosomes and variants"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | ---
8 |
9 | ```{r options, echo=FALSE}
10 | library(knitr)
11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
12 | ```
13 |
14 | # Overview
15 |
16 | We will consider how to do various very high-level tasks with chromosomes and variants in Bioconductor.
17 |
18 | - listing packages representing reference builds for humans and model organisms
19 | - acquiring human reference genome sequence
20 | - finding views of genes as sequences
21 | - examining the dbSNP catalog of small variants in populations of human genomes
22 | - examining the NHGRI GWAS catalog of associations between variants and phenotypes
23 |
24 | # BSgenome and available genomes
25 |
26 | ```{r sillib,echo=FALSE,results="hide"}
27 | suppressPackageStartupMessages({
28 | library(IRanges)
29 | library(BSgenome)
30 | library(BSgenome.Hsapiens.UCSC.hg19)
31 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
32 | library(SNPlocs.Hsapiens.dbSNP.20120608)
33 | library(gwascat)
34 | library(ggbio)
35 | })
36 | ```
37 |
38 | ```{r lkbs}
39 | library(BSgenome)
40 | head(available.genomes()) # requires internet access
41 | grep("Hsapiens", available.genomes(), value=TRUE)
42 | ```
43 |
44 | # The human reference sequence, build hg19; gene sequences
45 |
46 | ```{r lkhs}
47 | library(BSgenome.Hsapiens.UCSC.hg19)
48 | Hsapiens
49 | c17 = Hsapiens$chr17
50 | c17
51 | ```
52 |
53 | The class of `c17` is `r class(c17)`. This is a full in-memory representation of all the bases of the chromosome. We can work with substructures of interest without duplicating the contents of memory devoted to the sequence. We'll obtain a view of coding sequences of genes on chromosome 17. To do this we will employ a special transcript database structure.
54 |
55 | ```{r gettx}
56 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
57 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
58 | txdb
59 | ```
60 |
61 | We are only interested in information on chr17 at the moment. We establish chr17 as the active sequence in this transcript database
62 | ```{r settx}
63 | tmp = isActiveSeq(txdb)
64 | tmp[] = FALSE # turn all off
65 | tmp["chr17"] = TRUE # turn chr17 on, selected by name rather than by position
66 | isActiveSeq(txdb) = tmp
67 | g17 = genes(txdb)
68 | g17
69 | ```
70 |
71 | Now we make a structure that has addresses and sequences of genes.
72 |
73 | ```{r getv}
74 | gs17 = getSeq(Hsapiens, g17)
75 | gs17
76 | ```
77 |
78 | In the next version of Bioconductor this can be accomplished somewhat
79 | more efficiently using "Views()".
80 |
81 | # dbSNP
82 |
83 | We have an image of the dbSNP variant catalog for hg19. The information retained is limited to the dbSNP identifier, chromosome location, and variant content.
84 |
85 | ```{r dodb}
86 | library(SNPlocs.Hsapiens.dbSNP.20120608)
87 | sl17 = getSNPlocs("ch17", as.GRanges=TRUE)
88 | sl17
89 | ```
90 |
91 | The allele codes are translated by the IUPAC map.
92 | ```{r lkal}
93 | IUPAC_CODE_MAP
94 | ```
95 |
96 | # GWAS catalog
97 |
98 | National Human Genome Research Institute maintains a listing of genetic association studies that have found significant associations between DNA variants and major phenotypes and diseases. Inclusion in the catalog requires that the findings be replicated in an independent population.
99 |
100 | ```{r lkgw}
101 | library(gwascat)
102 | data(gwrngs19) # for hg19
103 | gwrngs19
104 | ```
105 |
106 | A simple display of associations and phenotypes is available
107 | with the `traitsManh` function.
108 |
109 | ```{r lkg2,fig=TRUE}
110 | example(traitsManh)
111 | ```
112 |
--------------------------------------------------------------------------------
/bioc/storage/chromIntro.Rmd:
--------------------------------------------------------------------------------
1 |
2 |
3 | ---
4 | layout: page
5 | title: "Introductory problems with chromosomes and variants"
6 | Author: "Vince Carey"
7 | ---
8 |
9 | # The composition of a gene
10 |
11 | We can obtain chromosomal sequence for all genes
12 | on chr17 as follows.
13 |
14 | ```{r getviews}
15 | library(BSgenome.Hsapiens.UCSC.hg19)
16 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
17 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
18 | aseq = isActiveSeq(txdb)
19 | aseq[] = FALSE
20 | aseq["chr17"] = TRUE
21 | isActiveSeq(txdb) = aseq
22 | gs17 = genes(txdb)
23 | gsv17 = Views(Hsapiens, gs17)
24 | gsv17
25 | ```
26 |
27 | What is the distribution of nucleotide counts for gene
28 | ORMDL3? We need to obtain the ENTREZID:
29 | ```{r gettag}
30 | library(Homo.sapiens)
31 | eid = select(Homo.sapiens, keys="ORMDL3", keytype="SYMBOL",
32 | columns="ENTREZID")$ENTREZID
33 | ```
34 | Now tabulate nucleotides:
35 | ```{r dotab}
36 | alphabetFrequency( gsv17[ which(mcols(gsv17)$gene_id == eid) ] )
37 | ```
38 |
39 | # Determination of the alternate allele
40 |
41 | rs145615430 is a SNP on chr17. What is the alternate allele?
42 |
43 | ```{r dolo}
44 | library(SNPlocs.Hsapiens.dbSNP.20120608)
45 | s17 = getSNPlocs("ch17")
46 | head(s17)
47 | ```
48 |
49 | We see that it is at base 56 on chr17.
50 |
51 | ```{r getch}
52 | c17 = Hsapiens$chr17
53 | substr(c17, 56, 56)
54 | ```
55 | The IUPAC code is Y, indicating a CT diallele, so the alternate
56 | is T.
57 |
58 | The associated dbSNP record indicates no frequency data available.
59 |
60 | # SNPs in ORMDL3
61 |
62 | Are there population-level polymorphisms in the coding region
63 | of ORMDL3? We can use GRanges to investigate.
64 | ```{r dopo}
65 | orgr = granges( gsv17[ which(mcols(gsv17)$gene_id == eid) ] )
66 | s17r = getSNPlocs("ch17", as.GRanges=TRUE)
67 | seqlevelsStyle(s17r) = "UCSC" # use UCSC-style sequence names for the SNP ranges
68 | genome(s17r) = genome(orgr)
69 | seqlevels(s17r) = seqlevels(orgr) = "chr17"
70 | fo = findOverlaps(s17r, orgr, ignore.strand=TRUE) # query = SNPs, subject = gene range
71 | s17r[queryHits(fo)] # the SNPs are the query, so they are indexed by queryHits
72 | ```
73 |
74 | # GWAS hits for ORMDL3
75 |
76 | We can see the traits of GWAS in which ORMDL3 was implicated.
77 |
78 | ```{r lkgw}
79 | library(gwascat)
80 | data(gwrngs19)
81 | gwrngs19[ grep("ORMDL3", mcols(gwrngs19)$Reported.Gene.s) ] # records whose reported genes mention ORMDL3
82 | ```
83 |
--------------------------------------------------------------------------------
/bioc/storage/mapping_features.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Mapping features to genes
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Using Bioconductor annotation packages
12 |
13 | This unit will focus on mapping features to genes, i.e., getting annotation information from one format to another. We start by loading in the `maPooling` dataset from previous lectures.
14 |
15 | ```{r}
16 | # library(devtools)
17 | # install_github("dagdata","genomicsclass")
18 | library(dagdata)
19 | library(Biobase)
20 | data(maPooling)
21 | e <- maPooling
22 | head(rownames(e))
23 | annotation(e)
24 | ```
25 |
26 | The annotation for this ExpressionSet is *rae230a*. Many platforms will have database annotation packages already existing on Bioconductor. We can access these, first by installing, and then loading the library. We will use the `AnnotationDbi` package to query the information in the library.
27 |
28 | While in this unit we will use a microarray annotation package as an example, the same commands can be used for an organism package, such as the homo sapiens annotation package `org.Hs.eg.db`, which lets one query from one kind of gene annotation to another.
29 |
30 | ```{r}
31 | # biocLite(paste0(annotation(e),".db"))
32 | library(rae230a.db)
33 | # biocLite("AnnotationDbi")
34 | library(AnnotationDbi)
35 | ```
36 |
37 | Annotation packages have *columns*, some of which may be *keys*. You can query the database using a *key*, and ask for one or more *columns* in return. We will use the rownames of the ExpressionSet as keys.
38 |
39 | ```{r}
40 | columns(rae230a.db)
41 | keytypes(rae230a.db)
42 | head(keys(rae230a.db, keytype="PROBEID"))
43 | head(rownames(e))
44 | ```
45 |
46 | The following `select` call will return the Entrez ID, ENSEMBL ID, and gene symbol for each Probe ID, which are the rownames of the ExpressionSet.
47 |
48 | ```{r}
49 | res <- select(rae230a.db, keys=rownames(e),
50 | columns=c("ENTREZID","ENSEMBL","SYMBOL"),
51 | keytype="PROBEID")
52 | head(res)
53 | idx <- match(rownames(e), res$PROBEID)
54 | ```
55 |
56 | We need to align the `res` object so that we pull out, in order, one row for each row of the ExpressionSet.
57 |
58 | ```{r}
59 | head(rownames(e))
60 | head(res$PROBEID,7)
61 | head(idx)
62 | ```
63 |
64 | Here we add the new information to the `fData` of `e`. If there were already information in `fData`, we would have used `cbind` to add the new columns. Note here that, since we have a one-to-many mapping, the `match` function gave us the first match that it found. You could also collapse all possible matches of the Probe ID to the Genes using `split` and `paste` with the `collapse` argument. However, here we keep it simple and just take the first match in the `res` object.
65 |
66 | ```{r}
67 | fData(e) <- res[idx,]
68 | head(fData(e),10)
69 | all.equal(fData(e)$PROBEID, rownames(e))
70 | ```
71 |
72 | ## Using Biomart
73 |
74 | An alternate way to map from one annotation to another is using the `biomaRt` package. For more information on which Biomarts are available and how to access them, see the `biomaRt` vignette.
75 |
76 | ```{r}
77 | # biocLite("biomaRt")
78 | library(biomaRt)
79 | # vignette("biomaRt")
80 | m <- useMart( "ensembl", dataset = "rnorvegicus_gene_ensembl")
81 | map <- getBM(mart = m,
82 | attributes = c("ensembl_gene_id", "entrezgene"),
83 | filters = "ensembl_gene_id",
84 | values = fData(e)$ENSEMBL)
85 | head(map)
86 | ```
87 |
88 | Finally, we need to align the new information with the old information using the `match` function as before, again picking the first match from a one-to-many mapping. We see that for the most part the new and the old Entrez IDs are the same, though some differences occur when we pick one from the one-to-many mappings that exist.
89 |
90 |
91 | ```{r}
92 | idx <- match(fData(e)$ENSEMBL, map$ensembl_gene_id)
93 | fData(e)$NEW_ENTREZID <- map$entrezgene[idx]
94 | head(fData(e))
95 | mean(fData(e)$ENTREZID == fData(e)$NEW_ENTREZID, na.rm=TRUE)
96 | ```
97 |
98 |
--------------------------------------------------------------------------------
/bioc/storage/probeSearch.Rmd:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | layout: page
4 | title: "Searching the reference genome for array probe sequences"
5 | Author: "Vince Carey"
6 | ---
7 |
8 | # Introduction
9 |
10 | Classic Affymetrix expression arrays are known as "3'-biased".
11 | This is because the probe sequences used were selected primarily
12 | from
13 | sequences constituting the 3' untranslated region of mammalian
14 | genes. In this document we'll see how Bioconductor's
15 | annotation facilities can be used to check asserted locations
16 | of array probes.
17 |
18 | # The probe packages; sequence for a gene
19 |
20 | With Affymetrix expression arrays
21 | the primary unit intended for analysis is the probe
22 | set used for mRNA abundance quantification.
23 | Probe sequences are provided in Bioconductor's `*probe`
24 | package series.
25 |
26 | ```{r quietatt, echo=FALSE, results="hide"}
27 | options(width=90)
28 | suppressPackageStartupMessages({
29 | library(hgu133plus2probe) # probe package
30 | library(hgu133plus2.db) # ChipDb package, annotation mapping
31 | library(dplyr)
32 | library(Biostrings)
33 | library(BSgenome.Hsapiens.UCSC.hg19)
34 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
35 | library(lumiHumanAll.db)
36 | library(lumi)
37 | library(SNPlocs.Hsapiens.dbSNP.20120608)
38 | library(GenomeInfoDb)
39 | })
40 | ```
41 | ```{r lkp}
42 | library(hgu133plus2probe) # probe package
43 | library(hgu133plus2.db) # ChipDb package, annotation mapping
44 | ```
45 |
46 | We'll use the `select` method to find identifiers for
47 | a gene of interest in various disease processes, BCL2L2.
48 | ```{r getid}
49 | AnnotationDbi::select(hgu133plus2.db, # qualified call, so dplyr's select() is never picked up
50 | keys="BCL2L2", keytype="SYMBOL", columns=c("PROBEID", "ENTREZID", "CHRLOC",
51 | "CHRLOCEND"))
52 | ```
53 |
54 | Now we will obtain the probe sequences for one of these
55 | probe sets
56 | ```{r gets}
57 | library(dplyr)
58 | bs = hgu133plus2probe %>%
59 | filter(Probe.Set.Name == "209311_at")
60 | bs
61 | ```
62 |
63 | # Matching the sequences to the reference genome
64 |
65 | First we convert the character data on probe sequence
66 | to Biostrings DNAStrings.
67 | ```{r conv}
68 | ss = bs[,"sequence"]
69 | library(Biostrings)
70 | sss = DNAStringSet(ss)
71 | ```
72 | Obtain the reference sequence for chr14
73 | ```{r getbsg}
74 | library(BSgenome.Hsapiens.UCSC.hg19)
75 | c14 = Hsapiens$chr14
76 | c14
77 | ```
78 | Biostrings can match fairly large numbers of
79 | sequences (called query sequences) to a
80 | subject sequence using the Aho-Corasick approach
81 | (see ?PDict for a reference.)
82 | ```{r dodi}
83 | pd = PDict(sss)
84 | pd
85 | ```
86 |
87 | We now carry out the search.
88 | ```{r doma}
89 | mats = matchPDict(pd, c14)
90 | mats
91 | ```
92 |
93 | # Are the probes in 3'UTR?
94 |
95 | We will use TranscriptDb to conclude this study.
96 | ```{r getlk}
97 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
98 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene # shorten
99 | ```
100 | For simplicity, we'll restrict attention to chr14.
101 | ```{r doac}
102 | ii = isActiveSeq(txdb)
103 | ii[] = FALSE
104 | ii["chr14"] = TRUE
105 | isActiveSeq(txdb) = ii
106 | utrs3 = threeUTRsByTranscript(txdb, use.names=TRUE)
107 | utrs3
108 | ```
109 |
110 | We can structure the record of matches of probe sequences
111 | to reference genome as a GRanges:
112 | ```{r restr}
113 | mats = GRanges("chr14", unlist(mats))
114 | fo = findOverlaps(mats, utrs3)
115 | table(subjectHits(fo))
116 | ufo = unique(subjectHits(fo))
117 | utrs3[ufo]
118 | ```
119 |
120 | # Analogous work with Illumina probe sequences
121 |
122 | You can perform a similar check with Illumina probes.
123 | ```{r lkl}
124 | library(lumiHumanAll.db)
125 | library(lumi)
126 | sel = AnnotationDbi::select(lumiHumanAll.db, keys="BCL2L2",
127 | keytype="SYMBOL", columns="PROBEID")
128 | sel
129 | id2seq(sel) # NOTE(review): sel is the whole select() result; presumably id2seq(sel$PROBEID) is intended - confirm
130 | ```
131 |
132 | Search and verify.
133 |
134 | # SNPs in probes?
135 |
136 | There is a slight complication because dbSNP
137 | uses an unusual chromosome naming convention.
138 | ```{r lksn}
139 | library(SNPlocs.Hsapiens.dbSNP.20120608)
140 | library(GenomeInfoDb)
141 | s14 = getSNPlocs("ch14", as.GRanges=TRUE)
142 | seqlevelsStyle(s14) = "UCSC"
143 | findOverlaps(s14, mats)
144 | ```
145 |
146 | We find that there are population level polymorphisms
147 | within the sequences for two of the Affy probes.
148 | Under what conditions
149 | would this cause a problem for accurate expression quantification?
150 | What sorts of statistical methods could ameliorate this?
151 |
--------------------------------------------------------------------------------
/bioc/storage/using_limma_old_no_comments.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Using limma
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 |
12 | # Using limma for microarray analysis
13 |
14 | Here we provide the code shown in the video
15 | ```{r}
16 | # biocLite("SpikeInSubset")  # one-time install; not run when knitting
17 | library(SpikeInSubset)
18 | data(rma95)
19 | library(genefilter)
20 | fac <- factor(rep(1:2,each=3))
21 | tt <- rowttests(exprs(rma95),fac)
22 | mask <- with(tt, abs(dm) < .2 & p.value < .01)
23 | spike <- rownames(rma95) %in% colnames(pData(rma95))
24 | cols <- ifelse(mask,"red",ifelse(spike,"dodgerblue","black"))
25 | ## volcano plot for the row-wise t-tests; red = small difference but small p-value
26 | with(tt, plot(-dm, -log10(p.value), cex=.8, pch=16,
27 | xlim=c(-1,1), ylim=c(0,5),
28 | xlab="difference in means",
29 | col=cols))
30 | abline(h=2,v=c(-.2,.2), lty=2)
31 | ## per-row pooled standard deviation of the two groups of three arrays
32 | tt$s <- apply(exprs(rma95), 1, function(row) sqrt(.5 * (var(row[1:3]) + var(row[4:6]))))
33 | with(tt, plot(s, -log10(p.value), cex=.8, pch=16,
34 | log="x",xlab="estimate of standard deviation",
35 | col=cols))
36 | ## same volcano plot, now with p-values from limma's ebayes() fit
37 | library(limma)
38 | fit <- lmFit(rma95, model.matrix(~ fac))
39 | ebfit <- ebayes(fit)
40 | limmares <- data.frame(dm=coef(fit)[,"fac2"], p.value=ebfit$p.value[,"fac2"])
41 | with(limmares, plot(dm, -log10(p.value),cex=.8, pch=16,
42 | col=cols,xlab="difference in means",
43 | xlim=c(-1,1), ylim=c(0,5)))
44 | abline(h=2,v=c(-.2,.2), lty=2)
45 |
46 | ## variance estimates before (tt$s^2) and after (ebfit$s2.post); first row found in each variance bin
47 | n <- 40
48 | qs <- seq(from=0,to=.2,length.out=n)
49 | idx <- vapply(seq_len(n),function(i) which(as.integer(cut(tt$s^2,qs)) == i)[1], integer(1))
50 | idx <- idx[!is.na(idx)] # drop bins that contain no row
51 | par(mar=c(5,5,2,2))
52 | plot(1,1,xlim=c(0,.21),ylim=c(0,1),type="n",
53 | xlab="variance estimates",ylab="",yaxt="n")
54 | axis(2,at=c(.1,.9),c("before","after"),las=2)
55 | segments((tt$s^2)[idx],rep(.1,length(idx)),
56 | ebfit$s2.post[idx],rep(.9,length(idx)))
57 | ```
58 |
--------------------------------------------------------------------------------
/bioc/tophat.md:
--------------------------------------------------------------------------------
1 | # Short video of mapping RNA-Seq reads
2 |
3 | Note that the commands used in this lab require you have a lot of free disk space (the FASTQ files alone are 28 GB) and many cores available for running the alignment program. We do not expect students to replicate the commands in this video. We do not expect students to install the alignment software on their machines. Much of the software for processing NGS data is designed for Linux systems. Note that the case studies (in particular the variant discovery and genotyping case study) will go into more depth on using Linux for processing NGS data.
4 |
5 | The FASTQ file we are looking at in the beginning of this screencast was downloaded from:
6 |
7 | http://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1177756
8 |
9 | This is a human RNA-Seq sample from a [study](http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP032775) of naturally acquired immunity to malaria.
10 |
11 | We discuss the following software in the screencast:
12 |
13 | * fastq-dump from the [SRA toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software)
14 | * [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
15 | * [Tophat2](http://ccb.jhu.edu/software/tophat/index.shtml)
16 | * [Samtools](http://samtools.sourceforge.net/)
17 |
18 | To extract the FASTQ files from the SRA file, we used the following line. The `--split-files` argument is used to extract two files for the two paired-ends of the fragments which were sequenced.
19 |
20 | ```
21 | fastq-dump --split-files SRR1177756.sra
22 | ```
23 |
24 | The call for running Tophat2 was:
25 |
26 | ```
27 | export BOWTIE2_INDEXES=/path/to/your/Bowtie2Index
28 |
29 | tophat2 -o SRR1177756_tophat_out -p 10 genome SRR1177756_1.fastq SRR1177756_2.fastq
30 | ```
31 |
32 | To view the reads we used Samtools:
33 |
34 | ```
35 | samtools view accepted_hits.bam | head -1000 | less
36 | ```
37 |
38 | For demonstration purposes (you wouldn't necessarily repeat these lines in a typical workflow), we merged the mapped and unmapped reads into a single sorted file. For this we used the following calls:
39 |
40 | ```
41 | samtools sort -n accepted_hits.bam accepted_hits_name_sorted
42 | samtools sort -n unmapped.bam unmapped_name_sorted
43 | samtools merge -n all_reads.bam accepted_hits_name_sorted.bam unmapped_name_sorted.bam
44 | ```
45 |
46 |
--------------------------------------------------------------------------------
/biocadv_6x/bioc2_ggbio.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Sketching the binding landscape over chromosomes with ggbio's karyogram layout"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | ```
10 |
11 | There are many interesting approaches to visualizing genome-scale data.
12 | Two major packages in Bioconductor are Gviz and ggbio. Both represent
13 | significant efforts at bridging the gap between graphics facilities
14 | and various genomic data structures.
15 |
16 | ggbio's `autoplot` method can be very useful for broad overviews.
17 | For a GRanges instance, each range for which data exists can be
18 | depicted as a band on the chromosome. The karyogram layout
19 | gives a genome-wide view, but it can be important to control
20 | the handling of extra-chromosomal sequence levels.
21 |
22 | ```{r getl,echo=FALSE,results="hide"}
23 | suppressWarnings({
24 | suppressPackageStartupMessages({
25 | library(ERBS)
26 | library(GenomeInfoDb)
27 | library(ggbio)
28 | })
29 | })
30 | ```
31 |
32 | Here is the layout for the liver cell line:
33 | ```{r lkd, fig=TRUE}
34 | library(ERBS)
35 | data(HepG2)
36 | library(GenomeInfoDb) # provides keepStandardChromosomes(), which drops non-standard sequence levels (it keeps more than just the autosomes)
37 | HepG2 = keepStandardChromosomes(HepG2)
38 | data(GM12878)
39 | GM12878 = keepStandardChromosomes(GM12878)
40 | library(ggbio)
41 | autoplot(HepG2, layout="karyogram", main="ESRRA binding on HepG2")
42 | ```
43 |
44 | And for the B-cell line:
45 |
46 | ```{r lkm,fig=TRUE}
47 | autoplot(GM12878, layout="karyogram", main="ESRRA binding on GM12878")
48 | ```
49 |
--------------------------------------------------------------------------------
/biocadv_6x/bioc2_gvfeat.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Gviz for plotting data with genomic features"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | ```
10 |
11 | It is often of interest to display observed data in the
12 | context of genomic reference information. We'll examine how to
13 | do this with the ESRRA binding data and Gviz.
14 |
15 | First we load up relevant data and annotation packages along with
16 | Gviz.
17 |
18 | ```{r getl,echo=FALSE,results="hide"}
19 | suppressWarnings({
20 | suppressPackageStartupMessages({
21 | library(ERBS)
22 | library(Gviz)
23 | library(Homo.sapiens)
24 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
25 | })
26 | })
27 | ```
28 | ```{r getp}
29 | library(ERBS)
30 | library(Gviz)
31 | library(Homo.sapiens)
32 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
33 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
34 | ```
35 |
36 | ## Genes in the vicinity of ESRRA
37 |
38 | How can we identify a slice of the human genome containing
39 | ESRRA and some neighboring genes? There are various approaches;
40 | we'll start by obtaining the ENTREZ identifier.
41 |
42 | ```{r getid}
43 | library(Homo.sapiens)
44 | eid = select(Homo.sapiens, keys="ESRRA", keytype="SYMBOL", columns="ENTREZID")
45 | eid
46 | ```
47 |
48 | Now we obtain the addresses for the ESRRA gene body,
49 | collect addresses of neighboring genes, and bind in the
50 | symbols for these genes.
51 |
52 | ```{r done}
53 | allg = genes(txdb) # gene ranges for every gene in the TxDb
54 | esrraAddr = genes(txdb, filter=list(gene_id=eid$ENTREZID)) # ESRRA gene range, using the ENTREZID looked up in `eid` rather than a hard-coded id
55 | esrraNeigh = subsetByOverlaps(allg, esrraAddr+500000) # genes overlapping the ESRRA range widened by 500000 on each side
56 | esrraNeigh$symbol = mapIds(Homo.sapiens, keys=esrraNeigh$gene_id, keytype="ENTREZID", # attach a gene symbol to each neighboring gene
57 | column="SYMBOL")
58 | ```
59 |
60 | A quick check on the task with Gviz:
61 | ```{r lknei,fig=TRUE}
62 | plotTracks(GeneRegionTrack(esrraNeigh, showId=TRUE))
63 | ```
64 |
65 | ## The ESRRA binding peaks in this region
66 |
67 | We obtain the ESRRA binding data for the GM12878 EBV-transformed
68 | B-cell and subset to events near our group of genes.
69 | ```{r gete}
70 | data(GM12878)
71 | sc = subsetByOverlaps(GM12878, range(esrraNeigh))
72 | sc
73 | ```
74 |
75 | ## Computing an ideogram to give context on the chromosome
76 |
77 | This computation is slow.
78 | ```{r doid,cache=TRUE}
79 | idxTrack = IdeogramTrack(genome="hg19", chr="chr11")
80 | ```
81 |
82 | ## Putting it all together
83 |
84 | We start at the top with the ideogram to identify chromosome and
85 | region on chromosome to which we are zooming with observational
86 | and structural information.
87 |
88 | ```{r dofull,fig=TRUE}
89 | plotTracks(list(idxTrack, GenomeAxisTrack(), # tracks in list order, starting at the top with the ideogram, then a genome axis
90 | DataTrack(sc[,7], name="ESRRA peak values"), # sc[,7] keeps only the 7th metadata column of sc -- presumably the peak score; TODO confirm
91 | GeneRegionTrack(esrraNeigh, showId=TRUE,
92 | name="genes near ESRRA"), GenomeAxisTrack())) # a second genome axis closes the list
93 | ```
94 |
--------------------------------------------------------------------------------
/biocadv_6x/bioc2_hybstor.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Sharded GRanges: a hybrid in/out of memory strategy for large sets of ranges"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | suppressPackageStartupMessages({
10 | library(Biobase)
11 | library(geuvStore2)
12 | library(gQTLBase)
13 | library(gQTLstats)
14 | library(foreach)
15 | library(doParallel)
16 | library(ph525x)
17 | })
18 | ```
19 |
20 |
21 | ## Introduction
22 |
23 | We've looked at a number of approaches to
24 | working with data external to R:
25 |
26 | * HDF5, which manages groups of multidimensional arrays on disk
27 | * sqlite, a zero-configuration relational database
28 | * tabix, a simple approach to indexing records on genomic coordinates
29 |
30 | Here I want to describe an approach that seems useful for millions
31 | of ranges annotated in the course of searching for variants that
32 | affect gene expression at the population level. The approach
33 | is based on a concept of storing data in "shards", homogeneous small
34 | fragments that can be quickly loaded and unloaded, discoverable
35 | by index and traversable in parallel.
36 |
37 | ## Motivation: An integrative view of associations in GEUVADIS
38 |
39 | The [GEUVADIS study](http://www.nature.com/nature/journal/v501/n7468/full/nature12531.html) is an intensive multiomic study of gene expression in multiple
40 | populations. We want to make use of the data from this study to
41 | investigate variants affecting genes of interest, with one tool being
42 | an interactive graphical utility illustrated in the video:
43 |
44 | ```{r lkgg,fig=TRUE}
45 | library(ph525x)
46 | ggshot()
47 | ```
48 |
49 | We want to be able to select genes by symbol and explore names
50 | and epigenetic contexts of variants whose content is associated with
51 | expression variation. It is useful to have the variants annotated
52 | using GRanges, but a very large GRanges object (there are hundreds
53 | of millions of SNP-gene associations recorded) can be unwieldy.
54 | Solutions using RDBMS or HDF5 may be viable, but more infrastructure
55 | for rapidly searching such stores using genomic coordinates,
56 | and for converting query results to GRanges, will be needed.
57 |
58 | BatchJobs was used to generate the association tests, and it
59 | produces
60 | an organized system of "sharded" GRanges recording the
61 | associations along with metadata about the associated features.
62 | This system can be stored in a package, exemplified by geuvStore2.
63 |
64 | ## A quick look at geuvStore2
65 |
66 | The association test results are organized using a BatchJobs
67 | registry that is wrapped in an S4 class called ciseStore.
68 | ```{r lkgv}
69 | library(geuvStore2)
70 | m = makeGeuvStore2()
71 | class(m)
72 | m
73 | ```
74 |
75 | The show method for m probes into the store and retrieves one record
76 | from one GRanges instance.
77 |
78 | ## Scalable traversal
79 |
80 | The traversal of all GRanges available in this selection is
81 | governed by foreach loops.
82 | ```{r lksca, cache=TRUE}
83 | library(gQTLBase)
84 | ut1 = system.time(l1 <- storeApply(m, length)) # time one traversal of the store, taking length() of each component, before a parallel backend is registered in this chunk
85 | ut1
86 | library(doParallel)
87 | registerDoParallel(cores=2) # register a 2-core backend for the foreach loops
88 | ut2 = system.time(l2 <- storeApply(m, length)) # same traversal, timed again with the 2-core backend
89 | ut2
90 | print(sum(unlist(l2))) # total of the per-component lengths
91 | all.equal(unlist(l1), unlist(l2)) # check that both runs returned the same lengths
92 | ```
93 | We see that doubling the number of processors reduces the
94 | time required to get the length of each component of the archive.
95 | With large numbers of cores, we can quickly assemble information
96 | about many variants.
97 |
98 | ## Scalable histogram construction
99 |
100 | When the histogram bins are fixed, divide and conquer can be
101 | used to assemble a histogram in parallel over many chunks.
102 |
103 | ```{r lkhhh,cache=TRUE}
104 | registerDoParallel(cores=1) # baseline: one core
105 | system.time(ll <- storeToHist(m, getter=function(x)log(mcols(x)$chisq+1), breaks=c(0,seq(.1,5,.1),10))) # histogram of log(chisq+1) over fixed breaks, timed
106 | registerDoParallel(cores=2) # repeat the identical call with two cores for comparison
107 | system.time(ll <- storeToHist(m, getter=function(x)log(mcols(x)$chisq+1), breaks=c(0,seq(.1,5,.1),10)))
108 | ```
109 |
110 | ## Indexing for targeted retrievals
111 |
112 | The ciseStore class includes two maps: one from range to shard number,
113 | another from gene identifier to shard number. This allows rapid
114 | retrievals.
115 |
116 | ```{r lkex}
117 | myr = GRanges(2, IRanges(1975.7e5, width=50000)) # query range: seqname 2, start 197,570,000, width 50,000
118 | extractByRanges(m, myr) # retrieve the stored records for the query range
119 | ```
120 |
121 | ## Conclusions
122 |
123 | geuvStore2 is a complex architecture that aims to provide a
124 | partly baked representation of quantities from genome-scale
125 | surveys that can be scalably surveyed and integrated. This
126 | is accomplished by keeping ranges for association scores
127 | and metadata in small sharded GRanges with some simple indexes,
128 | retrieval utilities, and with support for parallelized traversal
129 | and summary. It would be very nice to achieve these aims with
130 | a more homogeneous underlying architecture such as HDF5, and
131 | this may be possible as file-backed SummarizedExperiments come
132 | on line.
133 |
134 |
--------------------------------------------------------------------------------
/biocadv_6x/bioc2_ov.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Bioconductor for genome-scale data -- quick intro
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | ```
10 |
11 | ```{r setup,echo=FALSE,results="hide"}
12 | suppressWarnings({
13 | suppressPackageStartupMessages({
14 | library(Biobase)
15 | library(GSE5859)
16 | library(annotate)
17 | library(BiocParallel)
18 | library(VariantAnnotation)
19 | library(BSgenome.Hsapiens.UCSC.hg19)
20 | })
21 | })
22 | ```
23 |
24 | # PH525.6x: Basic premise and overview
25 |
26 | You know how to manipulate and analyze data using R, and
27 | you understand a considerable amount about statistical modeling.
28 | If you've taken PH525.5x, you've gotten significant background
29 | on current agendas in computational biology, and have learned
30 | how to deal with genomic data from the management, annotation,
31 | and analysis perspectives.
32 |
33 | In this course, we will use Bioconductor as the foundation
34 | of demonstrations and exercises in
35 | * methods for genome-scale data visualization including interactive graphics with the shiny and ggvis packages;
36 | * programming strategies for scalable bioinformatics with multicore and cluster computing infrastructure;
37 | * integrative management and analysis of multiassay experiments, with illustrations from The Cancer Genome Atlas (TCGA);
38 | * approaches to improving reproducibility of genome-scale analyses.
39 |
40 | One week will be devoted to each of these topics.
41 |
--------------------------------------------------------------------------------
/biocadv_6x/bioc2_rainfall.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "A view of genetic heterogeneity between and within cancer types"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | suppressPackageStartupMessages({
10 | library(ph525x)
11 | library(RTCGAToolbox)
12 | })
13 | ```
14 |
15 |
16 | ## Introduction
17 |
18 | We will use data in the ph525x package on mutations in
19 | breast cancer and rectal adenocarcinoma to illustrate
20 | some issues in dealing with mutations data from TCGA.
21 | A basic objective is construction of a "rainfall plot".
22 | An example is Figure 6 from [Alexandrov et al. 2013](http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3776390&tool=pmcentrez&rendertype=abstract):
23 |
24 | ```{r lkkat,fig=TRUE,echo=FALSE}
25 | kataegis()
26 | ```
27 |
28 | These plots include data from deeply sequenced individual tumors,
29 | and we'd like to understand how to construct them using
30 | tools from Bioconductor.
31 |
32 | ## The mutation data frames from RTCGAToolbox
33 |
34 | The `readMuts` data are from the 20150402 TCGA production.
35 | ```{r lkread}
36 | library(ph525x)
37 | data(readMuts)
38 | dim(readMuts)
39 | data(brcaMuts)
40 | dim(brcaMuts)
41 | ```
42 |
43 | ## Mutation types and their contents
44 |
45 | ```{r lkrmut}
46 | table(readMuts$Variant_Type)
47 | with(readMuts, head(Reference_Allele[Variant_Type=="DEL"]))
48 | ```
49 |
50 | ## Tabulating substitution types
51 |
52 | The following function enumerates substitutions according to
53 | the [COSMIC convention](http://cancer.sanger.ac.uk/cosmic/signatures):
54 | "The profile of each signature is displayed using the six substitution subtypes: C>A, C>G, C>T, T>A, T>C, and T>G (all substitutions are referred to by the pyrimidine of the mutated Watson–Crick base pair)."
55 |
56 | ```{r dosubt}
57 | subt = function(ref, a1, a2) { # label each substitution as "X>Y" from the reference allele and the two tumor alleles (works on whole vectors)
58 | alt = ifelse(a1 != ref, a1, a2) # alternate allele: a1 where it differs from ref, otherwise a2
59 | tmp = ref # keep the original ref so the exchange below can read it
60 | needsw = which(alt %in% c("C", "T")) # positions whose alternate allele is C or T
61 | ref[needsw] = alt[needsw] # at those positions, exchange ref and alt ...
62 | alt[needsw] = tmp[needsw] # ... NOTE(review): this swaps the two alleles rather than complementing them; confirm it matches the intended COSMIC pyrimidine convention
63 | paste(ref, alt, sep = ">") # one "ref>alt" label per input position
64 | }
65 | with(readMuts[readMuts$Variant_Type=="SNP",],
66 | table(subt(Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2)))
67 | ```
68 |
69 | A>G and G>A substitutions are not included in kataegis plots.
70 |
71 | The colors used for substitutions are defined by:
72 |
73 | ```{r lkkac}
74 | ph525x:::kataColors
75 | ```
76 |
77 | ## Total genomic distance
78 |
79 | The mutation locations reported are not particularly convenient for genome-wide
80 | plotting as the distances are all relative to chromosome start.
81 | The following hidden function computes total distance relative
82 | to start of chr1, assuming that the data are held in GRanges.
83 | ```{r lktg}
84 | ph525x:::totalgd
85 | ```
86 |
87 | ## A demo plot for four tumors
88 |
89 | The rainfall function will organize the input data by sample, and
90 | samples can, in the present version, be selected according to
91 | their position in an ordering based on number of mutations reported.
92 | The default plots the sample with the greatest number of mutations.
93 | The oind parameter allows selection of samples further down in the
94 | ordering. We embellish the plot with a simple kernel estimate
95 | of the density of mutations along the chromosomes. The
96 | function invisibly returns a list of items related to the plot.
97 |
98 | ```{r do4f,fig=TRUE,fig.height=8}
99 | rainouts = list() # collects the value returned by each rainfall() call
100 | par(mfrow=c(4,1),mar=c(4,5,1,1)) # four stacked panels, one per sample
101 | for (i in 1:4) rainouts[[i]] = rainfall(readMuts, oind=i) # oind=i selects the i-th sample in the ordering by number of mutations
102 | ```
103 |
104 | ```{r lkrao}
105 | str(rainouts[[1]])
106 | ```
107 |
108 |
109 |
--------------------------------------------------------------------------------
/biocadv_6x/finalViz.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Some final comments on genome-scale visualization"
4 | ---
5 |
6 | ```{r options, echo=FALSE, message=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | suppressPackageStartupMessages({
10 | suppressWarnings({
11 | library(BiocStyle)
12 | library(Biobase)
13 | library(randomForest)
14 | library(MLInterfaces)
15 | library(tissuesGeneExpression)
16 | library(limma)
17 | library(ph525x)
18 | library(RCircos)
19 | })
20 | })
21 | ```
22 |
23 |
24 | ## RCircos
25 |
26 | RCircos is not distributed in Bioconductor, but can
27 | be useful for developing compact displays of interactions
28 | among genomic elements. I am unaware of any interfaces between
29 | Bioconductor data classes and RCircos, and this topic deserves
30 | attention.
31 |
32 | In the ph525x package we have added a selection of trans-eQTL
33 | findings from Westra et al. Nature 2013 (doi: 10.1038/ng.2756).
34 | We show a few SNP-gene associations from this study:
35 | ```{r lksn,fig=TRUE,message=FALSE,fig.height=9,fig.width=9}
36 | library(ph525x)
37 | data(westraTransSel)
38 | westraTransSel[1:3]
39 | sglToCircos(westraTransSel[1:5])
40 | ```
41 |
42 | ## ComplexHeatmap
43 |
44 | `r Biocpkg("ComplexHeatmap")` has a very nice vignette addressing many
45 | issues in combining heatmaps and repurposing the heatmap
46 | concept. The oncoprint example in the vignette is particularly
47 | compelling. To use this interactively with TCGA, contact
48 | [the ISB](http://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/FAQ.html) and obtain a cloud platform account.
49 | Then obtain the `r Biocpkg("cgcR")`
50 | package, load it, and run `isbApp()`. You will have to authenticate
51 | with Google to get access to the BigQuery representation of TCGA.
52 |
53 | ## WebGL and interaction with data
54 |
55 | In the short concluding video we use the MLInterfaces plspinHcube
56 | function to illustrate several aspects of interactivity: GUI for
57 | tuning, mouse-controlled rotation, and mouseover for point interrogation.
58 |
59 | ## EpiViz
60 |
61 | The `r Biocpkg("epivizr")` package interacts with the
62 | [epiviz](https://epiviz.github.io/) system and is capable of substantial feats of data integration and
63 | higher-level data interactivity.
64 |
--------------------------------------------------------------------------------
/biocadv_6x/multiOOM.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Benchmarking multiple out-of-memory strategies"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | suppressPackageStartupMessages({
10 | suppressWarnings({
11 | library(ph525x)
12 | library(microbenchmark)
13 | library(bigmemory)
14 | library(rhdf5)
15 | library(RSQLite)
16 | })
17 | })
18 | ```
19 |
20 |
21 | ## Introduction
22 |
23 | In many large-data situations, it is impractical to load and retain
24 | data in R's working memory space. We have had a look at
25 | HDF5, SQLite and tabix-indexed text as possible solutions
26 | to problems arising with memory constraints. We'll call
27 | these "out-of-memory" (OOM) approaches.
28 |
29 | How can we obtain data on which approach will be most effective
30 | for a given task? Comparative benchmarking is a very useful skill and
31 | we give a very rudimentary account of this here.
32 |
33 | ## The harness
34 |
35 | It is common to speak of a program that drives other programs
36 | as a "harness" (see [wikipedia](https://en.wikipedia.org/wiki/Test_harness)
37 | for related discussion). We have such a program in ph525x:
38 |
39 | ```{r lkph}
40 | benchOOM
41 | ```
42 |
43 | This program is going to help us assess performance of various
44 | OOM approaches. We consider a very limited problem, that of
45 | managing data that could reside in an R matrix.
46 | The main parameters are
47 |
48 | * `NR` and `NC`: row and column dimensions
49 | * `times`: number of benchmark replications for averaging
50 | * `inseed`: a seed for random number generation to ensure reproducibility
51 | * `methods`: a list of methods
52 |
53 | The `methods` parameter is most complex. Each element of the list
54 | is assumed to be a function with the matrix to
55 | be managed via OOM as the first argument, some additional
56 | parameters, and a parameter `intimes` that gives the number
57 | of benchmark replicates.
58 |
59 | Our objective is to produce a table that looks like
60 |
61 | ```
62 | > b1
63 | NR NC times meth wr ingFull ing1K
64 | 1 5000 100 5 hdf5 10.71714 9.4100810 14.2984402
65 | 2 5000 100 5 ff 25.34365 63.0977338 4.4320688
66 | 3 5000 100 5 sqlite 174.89003 105.1254638 28.4717496
67 | 4 5000 100 5 data.table 49.35190 7.9871552 13.9007588
68 | 5 5000 100 5 bigmemory 23.39697 0.9660878 0.9950034
69 | ```
70 |
71 | where each method listed in `meth` is asked to perform the same
72 | task a fixed number of times for averaging. The construction of
73 | the table occurs by binding together metadata about the task and
74 | method to the result of `getStats`. We'll leave the details
75 | of `getStats` to independent investigation.
76 |
77 |
78 | ## An example method (OOM benchmarker)
79 |
80 | Let's look at the method for HDF5:
81 | ```{r lkme}
82 | ph525x:::.h5RoundTrip
83 | ```
84 |
85 | The program has three main phases
86 |
87 | * HDF5-related setup, cleaning out any previous archives and establishing
88 | the basic target file
89 | * Benchmarking of data export via `h5write`
90 | * Benchmarking of ingestion via `h5read` with various restrictions
91 |
92 | The results of `microbenchmark` are assembled in a list.
93 |
--------------------------------------------------------------------------------
/biocintro_5x/bioc1_align.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Notes on video of mapping RNA-seq reads"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
9 | ```
10 |
11 | # Short video of mapping RNA-Seq reads
12 |
13 | Note that the commands used in this lab require that you have a lot of free disk space (the FASTQ files alone are 28 GB) and many cores available for running the alignment program. We do not expect students to replicate the commands in this video. We do not expect students to install the alignment software on their machines. Much of the software for processing NGS data is designed for Linux systems. Note that the case studies (in particular the variant discovery and genotyping case study) will go into more depth on using Linux for processing NGS data.
14 |
15 | The FASTQ file we are looking at in the beginning of this screencast was downloaded from:
16 |
17 | http://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1177756
18 |
19 | This is a human RNA-Seq sample from a [study](http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP032775) of naturally acquired immunity to malaria.
20 |
21 | We discuss the following software in the screencast:
22 |
23 | * fastq-dump from the [SRA toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software)
24 | * [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
25 | * [Tophat2](http://ccb.jhu.edu/software/tophat/index.shtml)
26 | * [Samtools](http://samtools.sourceforge.net/)
27 |
28 | To extract the FASTQ files from the SRA file, we used the following line. The `--split-files` argument is used to extract two files for the two paired-ends of the fragments which were sequenced.
29 |
30 | ```
31 | fastq-dump --split-files SRR1177756.sra
32 | ```
33 |
34 | The call for running Tophat2 was:
35 |
36 | ```
37 | export BOWTIE2_INDEXES=/path/to/your/Bowtie2Index
38 |
39 | tophat2 -o SRR1177756_tophat_out -p 10 genome SRR1177756_1.fastq SRR1177756_2.fastq
40 | ```
41 |
42 | To view the reads we used Samtools:
43 |
44 | ```
45 | samtools view accepted_hits.bam | head -1000 | less
46 | ```
47 |
48 | For demonstration purposes (you wouldn't necessarily repeat these lines in a typical workflow), we merged the mapped and unmapped reads into a single sorted file. For this we used the following calls:
49 |
50 | ```
51 | samtools sort -n accepted_hits.bam accepted_hits_name_sorted
52 | samtools sort -n unmapped.bam unmapped_name_sorted
53 | samtools merge -n all_reads.bam accepted_hits_name_sorted.bam unmapped_name_sorted.bam
54 | ```
55 |
56 |
--------------------------------------------------------------------------------
/biocintro_5x/bioc1_annoCheat.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Genomic annotation in Bioconductor: Cheat sheet"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
13 | ```
14 |
15 | # Summarizing the key genome annotation resources in Bioconductor
16 |
17 | ## Executive summary
18 |
19 | ### Organism-oriented annotation
20 |
21 | For biological annotation, generally sequence or gene based, there
22 | are three key types of package
23 |
24 | * Reference sequence packages: BSgenome.[Organism].[Curator].[BuildID]
25 | * Gene model database packages: TxDb.[Organism].[Curator].[BuildID].[Catalog],
26 | and EnsDb.[Organism].[version] for Ensembl-derived annotation
27 | * Annotation map package: org.[Organism2let].[Institution].db
28 |
29 | Wherever brackets are used, you must substitute an appropriate token.
30 | You can survey all annotation packages at [the annotation page](http://bioconductor.org/packages/release/BiocViews.html#___AnnotationData).
31 |
32 | Packages Homo.sapiens, Mus.musculus and Rattus.norvegicus are specialized
33 | integrative annotation resources with an evolving interface.
34 |
35 | ### Systems biology oriented annotation
36 |
37 | Packages GO.db, KEGG.db, KEGGREST, and reactome.db are primarily
38 | intended as organism-independent resources organizing genes into
39 | groups. However, there are organism-specific mappings between
40 | gene-oriented annotation and these resources, that involve specific
41 | abbreviations and symbol conventions. These are described
42 | when these packages are used.
43 |
44 | ## Names for organisms and their abbreviations
45 |
46 | The standard Linnaean taxonomy is used very generally. So you
47 | need to know that
48 |
49 | * Human = *Homo sapiens*
50 | * Mouse = *Mus musculus*
51 | * Rat = *Rattus norvegicus*
52 | * Yeast = *Saccharomyces cerevisiae*
53 | * Zebrafish = *Danio rerio*
54 | * Cow = *Bos taurus*
55 |
56 | and so on. We use two sorts of abbreviations. For
57 | Biostrings-based packages, the contraction of first
58 | and second names is used
59 |
60 | * Human = Hsapiens
61 | * Mouse = Mmusculus
62 | * Rat = Rnorvegicus
63 | * Yeast = Scerevisiae ...
64 |
65 | For NCBI-based annotation maps, we contract further
66 |
67 | * Human = Hs
68 | * Mouse = Mm
69 | * Rat = Rn
70 | * Yeast = Sc ...
71 |
72 | ## Genomic sequence
73 |
74 | These packages have four-component names that specify the reference build used
75 |
76 | * Human = BSgenome.Hsapiens.UCSC.hg19
77 | * Mouse = BSgenome.Mmusculus.UCSC.mm10
78 | * Rat = BSgenome.Rnorvegicus.UCSC.rn5
79 | * Yeast = BSgenome.Scerevisiae.UCSC.sacCer3
80 |
81 | ## Gene models
82 |
83 | These packages have five-component names that specify the reference build used and
84 | the gene catalog
85 |
86 | * Human = TxDb.Hsapiens.UCSC.hg19.knownGene
87 | * Mouse = TxDb.Mmusculus.UCSC.mm10.knownGene
88 | * Rat = TxDb.Rnorvegicus.UCSC.rn5.knownGene
89 | * Yeast = TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
90 |
91 | Additional packages that are relevant are
92 |
93 | * Human = TxDb.Hsapiens.UCSC.hg38.knownGene
94 | * Human = EnsDb.Hsapiens.v75 -- related to hg19/GRCh37
95 |
96 | ## Annotation maps
97 |
98 | These packages have four-component names, with two components fixed. The
99 | variable components indicate organism and curating institution.
100 |
101 | * Human = org.Hs.eg.db
102 | * Mouse = org.Mm.eg.db
103 | * Rat = org.Rn.eg.db
104 | * Yeast = org.Sc.sgd.db
105 |
106 | ## Additional options
107 |
108 | There are often alternative curating institutions available such as
109 | Ensembl.
110 |
--------------------------------------------------------------------------------
/biocintro_5x/bioc1_grangeOps.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "GRanges operations related to gene model, TSS, and promoter region identification"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output:
6 | pdf_document: default
7 | html_document: default
8 | layout: page
9 | toc: yes
10 | ---
11 |
12 | ```{r options, echo=FALSE}
13 | library(knitr)
14 | opts_chunk$set(fig.path=paste0("figure/", tools::file_path_sans_ext(knitr::current_input()), "-")) # prefix figure files with this document's base name; current_input() is knitr's exported accessor for the input file
15 | ```
16 |
17 |
18 |
19 | ```{r setup,echo=FALSE,results="hide"}
20 | suppressPackageStartupMessages({
21 | library(BSgenome.Hsapiens.UCSC.hg19)
22 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
23 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
24 | library(Biostrings)
25 | library(GenomicRanges)
26 | library(IRanges)
27 | library(ph525x)
28 | library(Homo.sapiens)
29 | library(Gviz)
30 | })
31 | ```
32 | # Overview
33 |
34 | In this document we work with a small set of ranges and
35 | illustrate the basic inter-range operations reduce, disjoin, and gaps.
36 | We then add strand and seqname information and show how
37 | resize and flank are useful for identifying TSS and promoter regions.
38 |
39 | ## A simple set of ranges
40 |
41 | ```{r newr}
42 | ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40),
43 | width = c(12, 6, 6, 15, 6, 2, 7))
44 | ```
45 |
46 | ```{r plotr,echo=FALSE}
47 | plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), # draw each range in x as a rectangle; main defaults to the expression passed as x
48 | col = "black", sep = 0.5, ...) # col: rectangle fill; sep: vertical gap between rows; ... is forwarded to rect()
49 | {
50 | height <- 1 # every rectangle is one unit tall
51 | if (is(xlim, "Ranges")) # xlim given as a ranges object: collapse it to c(smallest start, largest end)
52 | xlim <- c(min(start(xlim)), max(end(xlim)))
53 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) # bin (row) number per range; ends are extended by 1 first -- presumably so abutting ranges land in different rows
54 | plot.new()
55 | plot.window(xlim, c(0, max(bins)*(height + sep))) # y extent leaves room for max(bins) rows
56 | ybottom <- bins * (sep + height) - height # bottom edge of each range's row
57 | rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...) # each box extends 0.5 beyond start and end
58 | title(main)
59 | axis(1) # x axis only
60 | }
61 |
62 | plotGRanges = function (x, xlim = x, col = "black", sep = 0.5, xlimits = c(0, # GRanges counterpart of plotRanges; xlimits sets the x extent of the plot
63 | 60), ...) # ... is forwarded to rect()
64 | {
65 | main = deparse(substitute(x)) # capture the caller's expression for the title before x is overwritten below
66 | ch = as.character(seqnames(x)[1]) # seqname of the first range only; used as the x-axis label
67 | x = ranges(x) # from here on x holds just the ranges
68 | height <- 1 # every rectangle is one unit tall
69 | if (is(xlim, "Ranges")) # NOTE(review): xlim is normalised here but never used afterwards -- plot.window() below takes xlimits
70 | xlim <- c(min(start(xlim)), max(end(xlim)))
71 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) # bin (row) number per range; ends are extended by 1 first -- presumably so abutting ranges land in different rows
72 | plot.new()
73 | plot.window(xlim = xlimits, c(0, max(bins) * (height + sep))) # x extent comes from xlimits, not xlim
74 | ybottom <- bins * (sep + height) - height # bottom edge of each range's row
75 | rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, # each box extends 0.5 beyond start and end
76 | col = col, ...)
77 | title(main, xlab = ch)
78 | axis(1) # x axis only
79 | }
80 | ```
81 |
82 | Let's visualize `ir` and several inter-range operations.
83 | ```{r lkir,fig=TRUE, out.height="800px"}
84 | par(mfrow=c(4,1), mar=c(4,2,2,2))
85 | plotRanges(ir, xlim=c(0,60))
86 | plotRanges(reduce(ir), xlim=c(0,60))
87 | plotRanges(disjoin(ir), xlim=c(0,60))
88 | plotRanges(gaps(ir), xlim=c(0,60))
89 | ```
90 |
91 | reduce(x) produces a set of
92 | nonoverlapping ranges that cover all positions covered by x.
93 | This can be used to reduce complexity of a gene model
94 | with many transcripts, where we may just want the addresses
95 | of intervals known to be transcribed, regardless of transcript
96 | of residence.
97 |
98 | disjoin(x) produces a set of ranges that cover all positions
99 | covered by x, such that none of the ranges in the
100 | disjoin output overlaps any end points of intervals in x.
101 | This gives us the largest possible collection of contiguous
102 | intervals that are separated wherever the original set
103 | of intervals had an endpoint.
104 |
105 | gaps(x) produces a set of ranges covering the positions
106 | in [min(start(x)), max(end(x))] that are not covered by any range in x.
107 | Given coding sequence addresses and exon intervals, this can
108 | be used to enumerate introns.
109 |
110 | # Extension to GRanges
111 |
112 | We add chromosome and strand information.
113 |
114 | ```{r dogr}
115 | library(GenomicRanges)
116 | gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3)))
117 | ```
118 |
119 | Let's assume the intervals represent genes.
120 | The following plots illustrate the identification of
121 | transcription start sites (green), upstream promoter
122 | regions (purple), downstream promoter regions (brown).
123 |
124 | ```{r dopr,fig=TRUE, out.height="800px", out.width="500px"}
125 | par(mfrow=c(4,1), mar=c(4,2,2,2))
126 | plotGRanges(gir, xlim=c(0,60))
127 | plotGRanges(resize(gir,1), xlim=c(0,60),col="green")
128 | plotGRanges(flank(gir,3), xlim=c(0,60), col="purple")
129 | plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown")
130 | ```
131 |
132 | Note that we do not need to take special steps to
133 | deal with the differences in strand.
134 |
--------------------------------------------------------------------------------
/biocintro_5x/bioc1_liftOver.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Translating addresses between genome builds"
3 | author: "Vince"
4 | date: "March 19, 2015"
5 | output: html_document
6 | layout: page
7 | toc: yes
8 | ---
9 |
10 | ```{r options, echo=FALSE}
11 | library(knitr)
12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
13 | ```
14 |
15 |
16 | ```{r setup,echo=FALSE,results="hide"}
17 | suppressWarnings({
18 | suppressMessages({
19 | suppressPackageStartupMessages({
20 | library(BSgenome.Hsapiens.UCSC.hg19)
21 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
22 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
23 | library(Biostrings)
24 | library(GenomicRanges)
25 | library(IRanges)
26 | library(ph525x)
27 | library(Homo.sapiens)
28 | library(rtracklayer)
29 | })
30 | })
31 | })
32 | ```
33 |
34 | # Translating addresses between genome builds: liftOver
35 |
36 | The rtracklayer package includes an interface to the
37 | liftOver utilities developed for the UCSC genome browser.
38 | The idea is that a collection of local alignments
39 | can be defined and used to remap coordinates from
40 | one reference build to another.
41 |
42 | We can illustrate this with gene addresses created for hg38,
43 | the current reference build. We want to translate them
44 | for comparison to addresses asserted for hg19.
45 |
46 | ## Acquiring a chain file
47 |
48 | Address translation between reference builds can be specified
49 | using a [chain format file](https://genome.ucsc.edu/goldenpath/help/chain.html). Two ways of getting the chain file are:
50 |
51 | ### Direct manual acquisition
52 |
53 | You can
54 | get it from the following URL, and use gunzip on your
55 | system to uncompress in your home dir, if you would
56 | like to emulate the commands below.
57 |
58 | "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
59 |
60 | ### Acquisition through AnnotationHub
61 |
62 | This is fully programmatic but may involve acquiring and caching
63 | a metadata database with the AnnotationHub package.
64 |
65 | ```{r doviaah}
66 | library(AnnotationHub)
67 | ah = AnnotationHub()
68 | q1 = query(ah, c("chain")) # list all resources with 'chain' in metadata
69 | q1
70 | q2 = query(ah, c("chain", "hg38ToHg19")) # the one we want
71 | ch = ah[[names(q2)]]
72 | ```
73 |
74 | ```{r domyimport}
75 | library(rtracklayer)
76 | # following only if you do not use AnnotationHub
77 | # ch = import.chain("~/hg38ToHg19.over.chain")
78 | ch
79 | str(ch[[1]])
80 | ```
81 |
82 | Let's get the addresses for genes on chromosome 1
83 | in hg38.
84 |
85 | ```{r get38}
86 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
87 | tx38 = TxDb.Hsapiens.UCSC.hg38.knownGene
88 | seqlevels(tx38) = "chr1"
89 | g1_38 = genes(tx38)
90 | ```
91 |
92 | Now execute the liftOver:
93 |
94 | ```{r doli}
95 | g1_19L = liftOver(g1_38, ch)
96 | ```
97 |
98 | The result is a list of GRanges, one for
99 | each translation event.
100 |
101 | ```{r lktx}
102 | g1_19L
103 | ```
104 |
105 | Verification of accuracy of translation is covered in exercises.
106 |
--------------------------------------------------------------------------------
/biocintro_5x/bioc1_t_mult.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Inference: t-tests, multiple comparisons"
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 |
12 | # Introduction
13 |
14 | In the previous section, we focused on a pair of genes to
15 | illustrate two aspects of variation. One of the genes appeared to
16 | have high between-mouse variation that was hidden in the act
17 | of pooling samples within strain. When strains were compared on
18 | the basis of the pooled data, there was an appearance of a significant
19 | strain
20 | effect for this gene ($p < 10^{-6}$), but when individual-level data were used to
21 | perform the comparison, the strain effect was found to be very
22 | weak at best ($p = 0.089$). The lesson is to recognize that the
23 | most scientifically compelling questions concern biological variation,
24 | which can only be directly measured with good experimental design. Accurate
25 | interpretation of origin and size of biological variation requires
26 | appropriate statistical analysis.
27 |
28 | In this section we will cover inference in the context of genome-scale experiments. There are several serious conceptual problems:
29 |
30 | - there are many tests, often at least one test for each one of tens of thousands of features
31 | - each feature (typically a gene) exhibits its own technical and biological variability
32 | - there may be unmeasured or unreported sources of biological variation (such as time of day)
33 | - many features are inherently interrelated, so the tests are not independent
34 |
35 | We will apply some of the concepts we have covered in previous
36 | sections including t-tests and multiple comparisons; later we will
37 | compute standard deviation estimates from hierarchical models.
38 |
39 | We start by loading the pooling experiment data
40 |
41 |
42 | ```{r,message=FALSE}
43 | library(Biobase)
44 | library(maPooling)
45 | data(maPooling)
46 | pd=pData(maPooling)
47 | individuals=which(rowSums(pd)==1)
48 | ```
49 |
50 | And extracting the individual mice as well as their strain
51 |
52 | ```{r}
53 | individuals=which(rowSums(pd)==1)
54 | individuals=individuals[-grep("tr",names(individuals))]
55 | y=exprs(maPooling)[,individuals]
56 | g=factor(as.numeric(grepl("b",names(individuals))))
57 | ```
58 |
59 |
60 |
61 | # T-tests
62 |
63 | We can now apply a t-test to each gene using the `rowttests` function in the `genefilter` package:
64 |
65 | ```{r}
66 | library(genefilter)
67 | tt=rowttests(y,g)
68 | ```
69 |
70 |
71 | Now which genes do we report as statistically significant? For somewhat arbitrary reasons, p-values of 0.01 and 0.05 are commonly used as cutoffs in science. In this particular example we get:
72 |
73 | ```{r}
74 | NsigAt01 = sum(tt$p.value<0.01)
75 | NsigAt01
76 | NsigAt05 = sum(tt$p.value<0.05)
77 | NsigAt05
78 | ```
79 |
80 |
81 |
82 | # Multiple testing
83 | We described multiple testing in detail [in course 3](http://genomicsclass.github.io/book/pages/multiple_testing.html). Here we provide a quick summary.
84 |
85 | Do we report all the nominally significant
86 | genes identified above? Let's explore what happens if we split the first group into two, forcing the null hypothesis to be true
87 |
88 | ```{r}
89 | set.seed(0)
90 | shuffledIndex <- factor(sample(c(0,1),sum(g==0),replace=TRUE ))
91 | nulltt <- rowttests(y[,g==0],shuffledIndex)
92 | NfalselySigAt01 = sum(nulltt$p.value<0.01)
93 | NfalselySigAt01
94 | NfalselySigAt05 = sum(nulltt$p.value<0.05)
95 | NfalselySigAt05
96 | ```
97 |
98 |
99 |
100 | If we use the 0.05 cutoff we will be reporting `r NfalselySigAt05` false positives. We have described several ways to adjust for this including the `qvalue` method available in the `r Biocpkg("qvalue")` package. After this adjustment we acquire
101 | a smaller list of genes.
102 |
103 | ```{r}
104 | library(qvalue)
105 | qvals = qvalue(tt$p.value)$qvalue
106 | sum(qvals<0.05)
107 | sum(qvals<0.01)
108 | ```
109 |
110 |
111 | And now the null case generates no false positives:
112 |
113 | ```{r}
114 | library(qvalue)
115 | nullqvals = qvalue(nulltt$p.value)$qvalue
116 | sum(nullqvals<0.05)
117 | sum(nullqvals<0.01)
118 | ```
119 |
120 | This addresses in a fairly general way the problem of inflating
121 | significance claims when performing many hypothesis tests at
122 | a fixed nominal level of significance.
123 |
--------------------------------------------------------------------------------
/biocintro_5x/optalign.Rmd:
--------------------------------------------------------------------------------
1 | # Short video of mapping RNA-Seq reads
2 |
3 | Note that the commands used in this lab require that you have a lot of free disk space (the FASTQ files alone are 28 GB) and many cores available for running the alignment program. We do not expect students to replicate the commands in this video, nor do we expect students to install the alignment software on their machines. Much of the software for processing NGS data is designed for Linux systems. Note that the case studies (in particular the variant discovery and genotyping case study) will go into more depth on using Linux for processing NGS data.
4 |
5 | The FASTQ file we are looking at in the beginning of this screencast was downloaded from:
6 |
7 | http://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1177756
8 |
9 | This is a human RNA-Seq sample from a [study](http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP032775) of naturally acquired immunity to malaria.
10 |
11 | We discuss the following software in the screencast:
12 |
13 | * fastq-dump from the [SRA toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software)
14 | * [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml)
15 | * [Tophat2](http://ccb.jhu.edu/software/tophat/index.shtml)
16 | * [Samtools](http://samtools.sourceforge.net/)
17 |
18 | To extract the FASTQ files from the SRA file, we used the following line. The `--split-files` argument is used to extract two files for the two paired-ends of the fragments which were sequenced.
19 |
20 | ```
21 | fastq-dump --split-files SRR1177756.sra
22 | ```
23 |
24 | The call for running Tophat2 was:
25 |
26 | ```
27 | export BOWTIE2_INDEXES=/path/to/your/Bowtie2Index
28 |
29 | tophat2 -o SRR1177756_tophat_out -p 10 genome SRR1177756_1.fastq SRR1177756_2.fastq
30 | ```
31 |
32 | To view the reads we used Samtools:
33 |
34 | ```
35 | samtools view accepted_hits.bam | head -1000 | less
36 | ```
37 |
38 | For demonstration purposes (you wouldn't necessarily repeat these lines in a typical workflow), we merged the mapped and unmapped reads into a single sorted file. For this we used the following calls:
39 |
40 | ```
41 | samtools sort -n accepted_hits.bam accepted_hits_name_sorted
42 | samtools sort -n unmapped.bam unmapped_name_sorted
43 | samtools merge -n all_reads.bam accepted_hits_name_sorted.bam unmapped_name_sorted.bam
44 | ```
45 |
46 |
--------------------------------------------------------------------------------
/chipseq/ChIPseq_quiz.R:
--------------------------------------------------------------------------------
1 | # Explore ChIP-seq peaks from the tamoxifen example data shipped with DiffBind.
2 | library(DiffBind)
3 |
4 | # Address the package's example directory by path instead of calling setwd(),
5 | # so the working directory of the session is left untouched.
6 | extra_dir <- system.file("extra", package="DiffBind")
7 | read.csv(file.path(extra_dir, "tamoxifen.csv"))
8 | list.files(file.path(extra_dir, "peaks"))
9 | # `dir` is prepended to the sample sheet path and to the files it references
10 | ta <- dba(sampleSheet="tamoxifen.csv", dir=extra_dir)
11 | head(ta$peaks[[1]])
12 |
13 | # build a GRanges from the first sample's peak table (chromosome, start, end)
14 | pks <- GRanges(ta$peaks[[1]]$V1,
15 |                IRanges(ta$peaks[[1]]$V2,
16 |                        ta$peaks[[1]]$V3))
17 |
18 | table(seqnames(pks))
19 |
20 | # find the distances between the starts of consecutive peaks
21 | plot(start(pks))
22 | dists <- diff(start(pks))
23 | max(dists)
24 | which.max(dists)
25 | # mark the two peaks on either side of the largest jump
26 | abline(h=start(pks)[which.max(dists)])
27 | abline(h=start(pks)[which.max(dists) + 1])
28 |
29 |
30 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
31 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
32 | g <- genes(txdb)
33 |
34 | # find the gene(s) overlapping peak 500
35 | idx <- which(g %over% pks[500])
36 | g[idx]
37 | pks[500]
38 |
39 | # find the nearest gene to peak 475
40 | idx <- nearest(pks[475], g)
41 | pks[475]
42 | g[idx]
43 | distance(pks[475], g[idx])
44 |
45 | # same as
46 | distanceToNearest(pks[475], g)
--------------------------------------------------------------------------------
/chipseq/MACS.txt:
--------------------------------------------------------------------------------
1 | from GEO, download a ChIP-seq experiment plus an Input experiment in the same tissue
2 |
3 | "Differential oestrogen receptor binding is associated with clinical outcome in breast cancer"
4 |
5 | http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3272464/
6 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE32222
7 |
8 | Chromatin IP against ER MCF-7.3
9 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM798425
10 | ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX%2FSRX371%2FSRX371469/SRR1021789/SRR1021789.sra
11 |
12 | Input DNA MCF-7_Input
13 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM798440
14 | ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX%2FSRX371%2FSRX371484/SRR1021804/SRR1021804.sra
15 |
16 |
17 | Extracting FASTQ from SRA using sratoolkit.2.3.3-4:
18 |
19 | fastq-dump SRR1021789.sra
20 | fastq-dump SRR1021804.sra
21 |
22 | Aligning FASTQ using bowtie2-2.1.0:
23 |
24 | bowtie2 -p 10 -x /path/to/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/genome SRR1021789.fastq -S SRR1021789.sam
25 | bowtie2 -p 10 -x /path/to/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/genome SRR1021804.fastq -S SRR1021804.sam
26 |
27 | Running MACS using MACS-2.0.10.20130306:
28 |
29 | macs2 callpeak -t SRR1021789.sam -c SRR1021804.sam -f SAM -g hs -n estrogen
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/example.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "This is the title"
3 | layout: page
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | # First section
12 |
13 | An introduction
14 |
15 |
16 |
17 |
18 |
19 |
20 | # Random normals
21 |
22 | Here are 10 random normals:
23 |
24 | ```{r}
25 | rnorm(10)
26 | ```
27 |
28 |
--------------------------------------------------------------------------------
/footnotes.R:
--------------------------------------------------------------------------------
1 | # Collect the "## Footnotes" section of every course Rmd file into footnotes.md.
2 | #
3 | # For each entry in the current directory whose name starts with "course", every
4 | # .Rmd file containing a "## Footnotes" heading contributes a "## <title>" heading
5 | # followed by its non-empty footnote lines, each followed by an empty line.
6 | out <- c("---", "layout: page",
7 |          "title: Footnotes for Data Analysis for Genomics",
8 |          "---", "")
9 | # list.files() takes a regular expression, not a shell glob
10 | dirs <- list.files(".", pattern = "^course")
11 | for (dir in dirs) {
12 |   files <- list.files(dir, pattern = "\\.Rmd$")
13 |   out <- c(out, paste0("# Course ", sub("course(.*)", "\\1", dir)), "")
14 |   for (file in files) {
15 |     lines <- readLines(file.path(dir, file))
16 |     footidx <- grep("## Footnotes", lines)
17 |     if (length(footidx) == 0) next
18 |     # the title is taken from the first "title:" line; fall back to the file name
19 |     title <- sub("title: (.*)", "\\1", grep("title:", lines, value = TRUE)[1])
20 |     if (is.na(title)) title <- file
21 |     cat("writing:", dir, "/", title, "\n")
22 |     # keep everything after the first heading; negative indexing yields an
23 |     # empty vector (not a backwards sequence) when the heading is the last line
24 |     footnotes <- lines[-seq_len(footidx[1])]
25 |     footnotes <- footnotes[footnotes != ""]
26 |     # interleave: footnotes go in the odd slots, the even slots stay ""
27 |     footnotes.spaced <- character(2 * length(footnotes))
28 |     footnotes.spaced[2 * seq_along(footnotes) - 1] <- footnotes
29 |     out <- c(out, paste0("## ", title), footnotes.spaced, "", "----", "")
30 |   }
31 | }
32 | writeLines(out, con = "footnotes.md")
--------------------------------------------------------------------------------
/highdim/images/handmade/Heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/Heatmap.png
--------------------------------------------------------------------------------
/highdim/images/handmade/SVD1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/SVD1.png
--------------------------------------------------------------------------------
/highdim/images/handmade/SVD2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/SVD2.png
--------------------------------------------------------------------------------
/highdim/images/handmade/animals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/animals.png
--------------------------------------------------------------------------------
/highdim/rotations.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Rotations
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 |
12 | ## Rotations
13 |
14 | One of the most useful applications of projections relates to coordinate rotations. In data analysis, simple rotations can result in easier to visualize and interpret data. We will describe the mathematics behind rotations and give some data analysis examples.
15 |
16 | In our previous section, we used the following example:
17 |
18 | $$
19 | Y = \begin{pmatrix} 2 \\
20 | 3
21 | \end{pmatrix}
22 | =
23 | 2
24 | \begin{pmatrix} 1\\
25 | 0
26 | \end{pmatrix} +
27 | 3
28 | \begin{pmatrix} 0\\
29 | 1
30 | \end{pmatrix}
31 | $$
32 |
33 | and noted that $2$ and $3$ are the _coordinates_.
34 |
35 |
36 | ```{r,fig.cap="Plot of (2,3) as coordinates along Dimension 1 (1,0) and Dimension 2 (0,1)."}
37 | library(rafalib)
38 | mypar()
39 | plot(c(-2,4),c(-2,4),xlab="Dimension 1",ylab="Dimension 2",
40 | type="n",xaxt="n",yaxt="n",bty="n")
41 | text(rep(0,6),c(c(-2,-1),c(1:4)),as.character(c(c(-2,-1),c(1:4))),pos=2)
42 | text(c(c(-2,-1),c(1:4)),rep(0,6),as.character(c(c(-2,-1),c(1:4))),pos=1)
43 | abline(v=0,h=0)
44 | arrows(0,0,2,3,lwd=3)
45 | segments(2,0,2,3,lty=2)
46 | segments(0,3,2,3,lty=2)
47 | text(2,3," Y",pos=4,cex=3)
48 | ```
49 |
50 | However, mathematically we can represent the point $(2,3)$ with other linear combinations:
51 |
52 | $$
53 | \begin{align*}
54 | Y &= \begin{pmatrix} 2 \\ 3\end{pmatrix} \\
55 | &= 2.5 \begin{pmatrix} 1\\ 1\end{pmatrix} + -1 \begin{pmatrix} \phantom{-}0.5\\ -0.5\end{pmatrix}
56 | \end{align*}$$
57 |
58 | The new coordinates are:
59 |
60 | $$Z = \begin{pmatrix} 2.5 \\ -1 \end{pmatrix}$$
61 |
62 | Graphically, we can see that the coordinates are the projections to the spaces defined by the new basis:
63 |
64 | ```{r,fig.cap="Plot of (2,3) as a vector in a rotated space, relative to the original dimensions."}
65 | library(rafalib)
66 | mypar()
67 | plot(c(-2,4),c(-2,4),xlab="Dimension 1",ylab="Dimension 2",
68 | type="n",xaxt="n",yaxt="n",bty="n")
69 | text(rep(0,6),c(c(-2,-1),c(1:4)),as.character(c(c(-2,-1),c(1:4))),pos=2)
70 | text(c(c(-2,-1),c(1:4)),rep(0,6),as.character(c(c(-2,-1),c(1:4))),pos=1)
71 | abline(v=0,h=0)
72 | abline(0,1,col="red")
73 | abline(0,-1,col="red")
74 | arrows(0,0,2,3,lwd=3)
75 | y=c(2,3)
76 | x1=c(1,1)##new basis
77 | x2=c(0.5,-0.5)##new basis
78 | c1 = crossprod(x1,y)/crossprod(x1)
79 | c2 = crossprod(x2,y)/crossprod(x2)
80 | segments(x1[1]*c1,x1[2]*c1,y[1],y[2],lty=2)
81 | segments(x2[1]*c2,x2[2]*c2,y[1],y[2],lty=2)
82 | text(2,3," Y",pos=4,cex=3)
83 | ```
84 |
85 | We can go back and forth between these two representations of $(2,3)$ using matrix multiplication.
86 |
87 | $$
88 | Y = AZ\\
89 | $$
90 |
91 | $$
92 | A^{-1} Y = Z\\
93 | $$
94 |
95 | $$
96 | A= \begin{pmatrix} 1& \phantom{-}0.5\\ 1 & -0.5\end{pmatrix} \implies
97 | A^{-1}= \begin{pmatrix} 0.5& 0.5 \\ 1 &-1\end{pmatrix}
98 | $$
99 |
100 | $Z$ and $Y$ carry the same information, but in a different _coordinate system_.
101 |
102 | #### Example: Twin heights
103 |
104 | Here are 100 two dimensional points $Y$
105 |
106 | ```{r twin-heights,fig.cap="Twin 2 heights versus twin 1 heights.",echo=FALSE,message=FALSE}
107 | library(MASS)
108 | n = 100
109 | mypar()
110 | set.seed(1)
111 | y=t(mvrnorm(n,c(0,0),matrix(c(1,0.95,0.95,1),2,2)))
112 | plot(y[1,],y[2,],xlab="Twin 1 (standardized height)",ylab="Twin 2 (standardized height)",xlim=c(-3,3),ylim=c(-3,3))
113 | ```
114 |
115 | Here are the rotations: $Z = A^{-1} Y$
116 |
117 | ```{r twin-heights-rotated,fig.cap="Rotation of twin 2 heights versus twin 1 heights.",echo=FALSE,message=FALSE}
118 | A = matrix(c(0.5,1,0.5,-1),2,2) # NOTE: filled column-wise, this holds the entries of A^{-1} from the text, not A
119 | z = A%*%y # i.e. Z = A^{-1} Y: row 1 is the average of the two rows of y, row 2 their difference
120 | mypar()
121 | plot(z[1,],z[2,],xlab="Average",ylab="Difference",xlim=c(-3,3),ylim=c(-3,3))
122 | ```
123 |
124 | What we have done here is rotate the data so that the first coordinate of $Z$ is the average height, while the second is the difference between twin heights.
125 |
126 | We have used the singular value decomposition to find principal components. It is sometimes useful to think of the SVD as a rotation, for example $\mathbf{U}^\top \mathbf{Y}$, that gives us a new coordinate system $\mathbf{DV}^\top$ in which the dimensions are ordered by how much variance they explain.
127 |
128 |
--------------------------------------------------------------------------------
/inference/R_refresher.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: R refresher
4 | ---
5 |
6 |
7 | ## Data Summaries: Summary, str
8 |
9 | First we load an example data frame:
10 |
11 | ```{r}
12 | rats <- data.frame(id = paste0("rat",1:10),
13 | sex = factor(rep(c("female","male"),each=5)),
14 | weight = c(2,4,1,11,18,12,7,12,19,20),
15 | length = c(100,105,115,130,95,150,165,180,190,175))
16 | rats
17 | ```
18 |
19 | The `summary` and `str` functions are two helpful functions for getting a sense of data. `summary` works on vectors or matrix-like objects (including data.frames). `str` works on an arbitrary R object and will compactly display the structure.
20 |
21 | ```{r}
22 | summary(rats)
23 | summary(rats$weight)
24 | str(rats)
25 | ```
26 |
27 | ## Aligning Two Objects: Match, Merge
28 |
29 | We load another example data frame, with the original ID and another secret ID. Suppose we want to sort the original data frame by the secret ID.
30 |
31 | ```{r}
32 | ratsTable <- data.frame(id = paste0("rat",c(6,9,7,3,5,1,10,4,8,2)),
33 | secretID = 1:10)
34 | ratsTable
35 | # wrong!
36 | cbind(rats, ratsTable)
37 | ```
38 |
39 | `match` is a very useful function in R. It can give us this order, but it's also easy to get its arguments mixed up. Remember that `match` gives you, for each element in the first vector, the index of the first match in the second vector. So typically the data.frame or vector you are reordering would appear as the second argument to `match`. It's always a good idea to check that you got it right, which you can do by using `cbind` to line up both data frames.
40 |
41 | ```{r}
42 | match(ratsTable$id, rats$id)
43 | rats[match(ratsTable$id, rats$id),]
44 | cbind(rats[match(ratsTable$id, rats$id),], ratsTable)
45 | ```
46 |
47 | Or you can use the `merge` function which will handle everything for you. You can tell it the names of the columns to merge on, or it will look for columns with the same name.
48 |
49 | ```{r}
50 | ratsMerged <- merge(rats, ratsTable, by.x="id", by.y="id")
51 | ratsMerged[order(ratsMerged$secretID),]
52 | ```
53 |
54 | ## Analysis Over Groups: split, tapply, and the dplyr library
55 |
56 | Suppose we need to calculate the average rat weight for each sex. We could start by splitting the weight vector into a list of weight vectors divided by sex. `split` is a useful function for breaking up a vector into groups defined by a second vector, typically a factor. We can then use the `lapply` function to calculate the average of each element of the list, which are vectors of weights.
57 |
58 | ```{r}
59 | sp <- split(rats$weight, rats$sex)
60 | sp
61 | lapply(sp, mean)
62 | ```
63 |
64 | A shortcut for this is to use `tapply` and give the function, which should run on each element of the list, as a third argument:
65 |
66 | ```{r}
67 | tapply(rats$weight, rats$sex, mean)
68 | ```
69 |
70 | R is constantly being developed in the form of add-on packages, which can sometimes greatly simplify basic analysis tasks. A new library "dplyr" can accomplish the same task as above, and can be extended to many other, more complicated operations. The "d" in the name is for data.frame, and the "ply" is because the library attempts to simplify tasks typically used by the set of functions: `sapply`, `lapply`, `tapply`, etc. Here is the same task as before done with the dplyr functions `group_by` and `summarise`:
71 |
72 | ```{r}
73 | library(dplyr)
74 | sexes <- group_by(rats, sex)
75 | summarise(sexes, ave=mean(weight))
76 | ```
77 |
78 | With dplyr, you can chain operations using the `%>%` pipe operator (very early versions of dplyr used `%.%` for this, which has since been removed):
79 |
80 | ```{r}
81 | rats %>% group_by(sex) %>% summarise(ave=mean(weight)) # %>% is the pipe exported by dplyr; the older %.% operator no longer exists
82 | ```
83 |
--------------------------------------------------------------------------------
/inference/permutation_tests.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Permutation tests
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ```{r,include=FALSE}
12 | set.seed(1)
13 | ```
14 |
15 | ## Permutation Tests
16 |
17 | Suppose we have a situation in which none of the standard mathematical statistical approximations apply. We have computed a summary statistic, such as the difference in mean, but do not have a useful approximation, such as that provided by the CLT. In practice, we do not have access to all values in the population so we can't perform a simulation as done above. Permutation tests can be useful in these scenarios.
18 |
19 | We are back to the scenario where we only have 12 measurements for each group.
20 |
21 | ```{r,message=FALSE}
22 | dat=read.csv("femaleMiceWeights.csv")
23 |
24 | library(dplyr)
25 |
26 | control <- filter(dat,Diet=="chow") %>% select(Bodyweight) %>% unlist
27 | treatment <- filter(dat,Diet=="hf") %>% select(Bodyweight) %>% unlist
28 | obsdiff <- mean(treatment)-mean(control)
29 | ```
30 |
31 | In previous sections, we showed parametric approaches that helped determine if the observed difference was significant. Permutation tests take advantage of the fact that if we randomly shuffle the cases and control labels, then the null is true. So we shuffle the cases and control labels and assume that the ensuing distribution approximates the null distribution. Here is how we generate a null distribution by shuffling the data 1,000 times:
32 |
33 | ```{r diff_hist, fig.cap="Histogram of difference between averages from permutations. Vertical line shows the observed difference."}
34 | N <- 12
35 | avgdiff <- replicate(1000, {
36 | all <- sample(c(control,treatment))
37 | newcontrols <- all[1:N]
38 | newtreatments <- all[(N+1):(2*N)]
39 | return(mean(newtreatments) - mean(newcontrols))
40 | })
41 | hist(avgdiff)
42 | abline(v=obsdiff, col="red", lwd=2)
43 | ```
44 |
45 | How many of the null means are bigger than the observed value? That
46 | proportion would be the p-value for the null. We add a 1 to the
47 | numerator and denominator to account for misestimation of the p-value
48 | (for more details see
49 | [Phipson and Smyth, Permutation P-values should never be zero](http://www.ncbi.nlm.nih.gov/pubmed/21044043)).
50 |
51 | ```{r}
52 | #the proportion of permutations with larger difference
53 | (sum(abs(avgdiff) > abs(obsdiff)) + 1) / (length(avgdiff) + 1)
54 | ```
55 |
56 | Now let's repeat this experiment for a smaller dataset. We create a smaller dataset by sampling:
57 |
58 | ```{r}
59 | N <- 5
60 | control <- sample(control,N)
61 | treatment <- sample(treatment,N)
62 | obsdiff <- mean(treatment)- mean(control)
63 | ```
64 | and repeat the exercise:
65 |
66 |
67 | ```{r diff_hist_N50, fig.cap="Histogram of difference between averages from permutations for smaller sample size. Vertical line shows the observed difference."}
68 | avgdiff <- replicate(1000, {
69 | all <- sample(c(control,treatment))
70 | newcontrols <- all[1:N]
71 | newtreatments <- all[(N+1):(2*N)]
72 | return(mean(newtreatments) - mean(newcontrols))
73 | })
74 | hist(avgdiff)
75 | abline(v=obsdiff, col="red", lwd=2)
76 | ```
77 |
78 | Now the observed difference is not significant using this approach. Keep in mind that there is no theoretical guarantee that the null distribution estimated from permutations approximates the actual null distribution. For example, if there is a real difference between the populations, some of the permutations will be unbalanced and will contain some samples that explain this difference. This implies that the null distribution created with permutations will have larger tails than the actual null distribution. This is why permutations result in conservative p-values. For this reason, when we have few samples, we can't do permutations.
79 |
80 | Note also that permutation tests still have assumptions: samples are
81 | assumed to be independent and "exchangeable". If there is hidden
82 | structure in your data, then permutation tests can result in estimated
83 | null distributions that underestimate the size of tails because the
84 | permutations may destroy the existing structure in the original data.
85 |
86 |
--------------------------------------------------------------------------------
/inference/populations_and_samples.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Population, Samples, and Estimates
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Populations, Samples and Estimates
12 |
13 | Now that we have introduced the idea of a random variable, a null distribution, and a p-value, we are ready to describe the mathematical theory that permits us to compute p-values in practice. We will also learn about confidence intervals and power calculations.
14 |
15 | #### Population parameters
16 |
17 | A first step in statistical inference is to understand what population
18 | you are interested in. In the mouse weight example, we have two
19 | populations: female mice on control diets and female mice on high fat
20 | diets, with weight being the outcome of interest. We consider this
21 | population to be fixed, and the randomness comes from the
22 | sampling. One reason we have been using this dataset as an example is
23 | because we happen to have the weights of all the mice of this
24 | type. We download [this](https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/mice_pheno.csv) file to our working directory and read it into R:
25 |
26 | ```{r,message=FALSE,echo=FALSE}
27 | library(downloader)
28 | dir <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/"
29 | filename <- "mice_pheno.csv"
30 | url <- paste0(dir, filename)
31 | if (!file.exists(filename)) download(url,destfile=filename)
32 | ```
33 |
34 | ```{r}
35 | dat <- read.csv("mice_pheno.csv")
36 | ```
37 |
38 | We can then access the population values and determine, for example, how many we have. Here we compute the size of the control population:
39 |
40 | ```{r,message=FALSE}
41 | library(dplyr)
42 | controlPopulation <- filter(dat,Sex == "F" & Diet == "chow") %>%
43 | select(Bodyweight) %>% unlist
44 | length(controlPopulation)
45 | ```
46 |
47 | We usually denote these values as $x_1,\dots,x_m$. In this case, $m$ is the number computed above. We can do the same for the high fat diet population:
48 |
49 | ```{r}
50 | hfPopulation <- filter(dat,Sex == "F" & Diet == "hf") %>%
51 | select(Bodyweight) %>% unlist
52 | length(hfPopulation)
53 | ```
54 |
55 | and denote with $y_1,\dots,y_n$.
56 |
57 | We can then define summaries of interest for these populations, such as the mean and variance.
58 |
59 | The mean:
60 |
61 | $$\mu_X = \frac{1}{m}\sum_{i=1}^m x_i \mbox{ and } \mu_Y = \frac{1}{n} \sum_{i=1}^n y_i$$
62 |
63 | The variance:
64 |
65 | $$\sigma_X^2 = \frac{1}{m}\sum_{i=1}^m (x_i-\mu_X)^2 \mbox{ and } \sigma_Y^2 = \frac{1}{n} \sum_{i=1}^n (y_i-\mu_Y)^2$$
66 |
67 | with the standard deviation being the square root of the variance. We refer to such quantities that can be obtained from the population as _population parameters_. The question we started out asking can now be written mathematically: is $\mu_Y - \mu_X = 0$ ?
68 |
69 | Although in our illustration we have all the values and can check if this is true, in practice we do not. For example, in practice it would be prohibitively expensive to buy all the mice in a population. Here we learn how taking a _sample_ permits us to answer our questions. This is the essence of statistical inference.
70 |
71 | #### Sample estimates
72 |
73 | In the previous chapter, we obtained samples of 12 mice from each
74 | population. We represent data from samples with capital letters to
75 | indicate that they are random. This is common practice in statistics,
76 | although it is not always followed. So the samples are $X_1,\dots,X_M$
77 | and $Y_1,\dots,Y_N$ and, in this case, $N=M=12$. In contrast and as we
78 | saw above, when we list out the values of the population, which are
79 | set and not random, we use lower-case letters.
80 |
81 | Since we want to know if $\mu_Y - \mu_X$ is 0, we consider the sample version: $\bar{Y}-\bar{X}$ with:
82 |
83 | $$
84 | \bar{X}=\frac{1}{M} \sum_{i=1}^M X_i
85 | \mbox{ and }\bar{Y}=\frac{1}{N} \sum_{i=1}^N Y_i.
86 | $$
87 |
88 | Note that this difference of averages is also a random
89 | variable. Previously, we learned about the behavior of random variables
90 | with an exercise that involved repeatedly sampling from the original
91 | distribution. Of course, this is not an exercise that we can execute
92 | in practice. In this particular case it would involve buying 24 mice
93 | over and over again. Here we describe the mathematical theory that
94 | relates $\bar{X}$ to $\mu_X$ and $\bar{Y}$ to $\mu_Y$,
95 | that will in turn help us understand the relationship between
96 | $\bar{Y}-\bar{X}$ and $\mu_Y - \mu_X$. Specifically, we will describe
97 | how the Central Limit Theorem permits us to use an approximation to
98 | answer this question, as well as motivate the widely used t-distribution.
99 |
100 |
--------------------------------------------------------------------------------
/intro/dplyr_intro.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Brief Introduction to `dplyr`
4 | ---
5 |
6 | ## Brief Introduction to `dplyr`
7 |
8 | The learning curve for R syntax can be steep. One of the more difficult aspects that requires some getting used to is subsetting data tables. The `dplyr` package brings these tasks closer to English and we are therefore going to introduce two simple functions: one is used to subset rows and the other to select columns.
9 |
10 | Take a look at the dataset we read in:
11 | ```{r}
12 | filename <- "femaleMiceWeights.csv"
13 | dat <- read.csv(filename)
14 | head(dat) #In R Studio use View(dat)
15 | ```
16 |
17 | There are two types of diets, which are denoted in the first column. If we want just the weights, we only need the second column. So if we want the weights for mice on the `chow` diet, we subset and filter like this:
18 |
19 | ```{r,message=FALSE}
20 | library(dplyr)
21 | chow <- filter(dat, Diet=="chow") #keep only the ones with chow diet
22 | head(chow)
23 | ```
24 |
25 | And now we can select only the column with the values:
26 |
27 | ```{r}
28 | chowVals <- select(chow,Bodyweight)
29 | head(chowVals)
30 | ```
31 |
32 | A nice feature of the `dplyr` package is that you can perform consecutive tasks by using what is called a "pipe". In `dplyr` we use `%>%` to denote a pipe. This symbol tells the program to first do one thing and then do something else to the result of the first. Hence, we can perform several data manipulations in one line. For example:
33 |
34 | ```{r}
35 | chowVals <- filter(dat, Diet=="chow") %>% select(Bodyweight)
36 | ```
37 |
38 | In the second task, we no longer have to specify the object we are editing since it is whatever comes from the previous call.
39 |
40 | Also, note that if `dplyr` receives a `data.frame` it will return a `data.frame`.
41 | ```{r}
42 | class(dat)
43 | class(chowVals)
44 | ```
45 |
46 | For pedagogical reasons, we will often want the final result to be a simple `numeric` vector. To obtain such a vector with `dplyr`, we can apply the `unlist` function which turns `lists`, such as `data.frames`, into `numeric` vectors:
47 |
48 | ```{r}
49 | chowVals <- filter(dat, Diet=="chow") %>% select(Bodyweight) %>% unlist
50 | class( chowVals )
51 | ```
52 |
53 |
54 | To do this in R without `dplyr` the code is the following:
55 |
56 | ```{r}
57 | chowVals <- dat[ dat$Diet=="chow", colnames(dat)=="Bodyweight"]
58 | ```
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/intro/github.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Installing software from github.com
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 |
12 | ```{r}
13 | #install.packages("devtools")  ##run once if devtools is not installed
14 | library(devtools)
15 | #install_github("ririzarr/rafalib")  ##run once; install_github takes a "user/repo" spec
16 | library(rafalib)
17 | mypar()  ##call the function; a bare `mypar` would only print its source
18 | shist(rnorm(100))
19 | ```
20 |
21 |
--------------------------------------------------------------------------------
/intro/system_files.Rmd:
--------------------------------------------------------------------------------
1 | ## R system files
2 | Note that this file is also included in the 'dagdata' package. If you have the package installed, then this file is already on your system and you can use the 'system.file' function to find it:
3 |
4 | ```{r}
5 | dir <- system.file(package="dagdata")
6 | list.files(dir)
7 | list.files(file.path(dir,"extdata"))
8 | filename <- file.path(dir,"extdata/mice_pheno.csv")
9 | dat <- read.csv(filename)
10 | ```
11 |
12 |
13 | ## Using download
14 | ```{r, eval=FALSE}
15 | url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/mice_pheno.csv"
16 | filename <- tempfile()
17 | if (!file.exists(filename)) download.file(url,destfile=filename,method="curl")
18 | dat <- read.csv(filename)
19 | ```
--------------------------------------------------------------------------------
/linear/linear_models_in_practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Linear models in practice
3 | layout: page
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | #### The mouse diet example
12 |
13 | We will demonstrate how to analyze the high fat diet data using linear models instead of directly applying a t-test. We will demonstrate how ultimately these two approaches are equivalent.
14 |
15 | We start by reading in the data and creating a quick stripchart:
16 |
17 | ```{r,echo=FALSE}
18 | url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/femaleMiceWeights.csv"
19 | filename <- "femaleMiceWeights.csv"
20 | library(downloader)
21 | if (!file.exists(filename)) download(url, filename)
22 | ```
23 |
24 | ```{r,echo=FALSE}
25 | set.seed(1) #same jitter in stripchart
26 | ```
27 |
28 | ```{r bodyweight_by_diet_stripchart, fig.cap="Mice bodyweights stratified by diet."}
29 | dat <- read.csv("femaleMiceWeights.csv") ##previously downloaded
30 | stripchart(dat$Bodyweight ~ dat$Diet, vertical=TRUE, method="jitter",
31 | main="Bodyweight over Diet")
32 | ```
33 |
34 | We can see that the high fat diet group appears to have higher weights on average, although there is overlap between the two samples.
35 |
36 | For demonstration purposes, we will build the design matrix $\mathbf{X}$ using the formula `~ Diet`. The group with the 1's in the second column is determined by the level of `Diet` which comes second; that is, the non-reference level.
37 |
38 | ```{r}
39 | levels(factor(dat$Diet))  ##Diet is read as character in R >= 4.0, so convert before asking for levels
40 | X <- model.matrix(~ Diet, data=dat)  ##first column is the intercept, second is the indicator for the non-reference diet
41 | head(X)
42 | ```
43 |
44 | ## The Mathematics Behind lm()
45 |
46 | Before we use our shortcut for running linear models, `lm`, we want to review what will happen internally. Inside of `lm`, we will form the design matrix $\mathbf{X}$ and calculate the $\boldsymbol{\beta}$, which minimizes the sum of squares using the previously described formula. The formula for this solution is:
47 |
48 | $$ \hat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{Y} $$
49 |
50 | We can calculate this in R using our matrix multiplication operator `%*%`, the inverse function `solve`, and the transpose function `t`.
51 |
52 |
53 | ```{r}
54 | Y <- dat$Bodyweight
55 | X <- model.matrix(~ Diet, data=dat)
56 | solve(t(X) %*% X) %*% t(X) %*% Y
57 | ```
58 |
59 | These coefficients are the average of the control group and the difference of the averages:
60 |
61 |
62 | ```{r}
63 | s <- split(dat$Bodyweight, dat$Diet)
64 | mean(s[["chow"]])
65 | mean(s[["hf"]]) - mean(s[["chow"]])
66 | ```
67 |
68 | Finally, we use our shortcut, `lm`, to run the linear model:
69 |
70 | ```{r}
71 | fit <- lm(Bodyweight ~ Diet, data=dat)
72 | summary(fit)
73 | (coefs <- coef(fit))
74 | ```
75 |
76 | #### Examining the coefficients
77 |
78 | The following plot provides a visualization of the meaning of the coefficients with colored arrows (code not shown):
79 |
80 | ```{r parameter_estimate_illustration, fig.cap="Estimated linear model coefficients for bodyweight data illustrated with arrows.",echo=FALSE}
81 | stripchart(dat$Bodyweight ~ dat$Diet, vertical=TRUE, method="jitter",
82 | main="Bodyweight over Diet", ylim=c(0,40), xlim=c(0,3))
83 | a <- -0.25
84 | lgth <- .1
85 | library(RColorBrewer)
86 | cols <- brewer.pal(3,"Dark2")
87 | abline(h=0)
88 | arrows(1+a,0,1+a,coefs[1],lwd=3,col=cols[1],length=lgth)
89 | abline(h=coefs[1],col=cols[1])
90 | arrows(2+a,coefs[1],2+a,coefs[1]+coefs[2],lwd=3,col=cols[2],length=lgth)
91 | abline(h=coefs[1]+coefs[2],col=cols[2])
92 | legend("right",names(coefs),fill=cols,cex=.75,bg="white")
93 | ```
94 |
95 | To make a connection with material presented earlier, this simple linear model is actually giving us the same result (the t-statistic and p-value) for the difference as a specific kind of t-test. This is the t-test between two groups with the assumption that the population standard deviation is the same for both groups. This was encoded into our linear model when we assumed that the errors $\boldsymbol{\varepsilon}$ were all equally distributed.
96 |
97 | Although in this case the linear model is equivalent to a t-test, we will soon explore more complicated designs, where the linear model is a useful extension. Below we demonstrate that one does in fact get the exact same results:
98 |
99 | Our `lm` estimates were:
100 |
101 | ```{r}
102 | summary(fit)$coefficients
103 | ```
104 |
105 | And the t-statistic is the same:
106 |
107 | ```{r}
108 | ttest <- t.test(s[["hf"]], s[["chow"]], var.equal=TRUE)
109 | summary(fit)$coefficients[2,3]
110 | ttest$statistic
111 | ```
112 |
--------------------------------------------------------------------------------
/linear/linear_models_intro.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Introduction to Linear Models
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | # Linear Models
12 |
13 | Many of the models we use in data analysis can be presented using matrix algebra. We refer to these types of models as _linear models_. "Linear" here does not refer to lines, but rather to linear combinations. The representations we describe are convenient because we can write models more succinctly and we have the matrix algebra mathematical machinery to facilitate computation. In this chapter, we will describe in some detail how we use matrix algebra to represent and fit these models.
14 |
15 | In this book, we focus on linear models that represent dichotomous groups: treatment versus control, for example. The effect of diet on mice weights is an example of this type of linear model. Here we describe slightly more complicated models, but continue to focus on dichotomous variables.
16 |
17 | As we learn about linear models, we need to remember that we are still working with random variables. This means that the estimates we obtain using linear models are also random variables. Although the mathematics is more complex, the concepts we learned in previous chapters apply here. We begin with some exercises to review the concept of random variables in the context of linear models.
18 |
19 |
20 |
--------------------------------------------------------------------------------
/list_libs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Print, for each top-level directory, the distinct library() lines found in its .Rmd files.
3 | for dir in */;
4 | do [ -d "$dir" ] || continue;  # with no directories the glob stays literal, so skip it
5 | echo "$dir";
6 | echo "";
7 | grep -h 'library(' "$dir"*.Rmd | sort -u;  # $dir already ends in "/"
8 | echo "";
9 | done
--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | course1rmd := $(wildcard course1/*.Rmd)
2 | course2rmd := $(wildcard course2/*.Rmd)
3 | course3rmd := $(wildcard course3/*.Rmd)
4 | course4rmd := $(wildcard course4/*.Rmd)
5 |
6 | course1md := $(course1rmd:.Rmd=.md)
7 | course2md := $(course2rmd:.Rmd=.md)
8 | course3md := $(course3rmd:.Rmd=.md)
9 | course4md := $(course4rmd:.Rmd=.md)
10 |
11 | all: course1 course2 course3 course4 foot
12 |
13 | course1: $(course1md)
14 | course2: $(course2md)
15 | course3: $(course3md)
16 | course4: $(course4md)
17 |
18 | # Knit each .Rmd to .md from inside its own directory; knitr:: is spelled out because Rscript does not attach knitr.
19 | %.md: %.Rmd
20 | cd $(dir $^); Rscript -e 'knitr::knit("$(notdir $^)")'
21 |
22 | foot:
23 | Rscript footnotes.R
24 |
--------------------------------------------------------------------------------
/methyl/inference_for_DNAmeth.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Inference for DNA methylation data
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ```{r}
12 | library(minfi) ##Bioc
13 | library(IlluminaHumanMethylation450kmanifest) ##Bioc
14 | library(doParallel) ##CRAN
15 | library(pkgmaker)
16 | library(rafalib)
17 | ```
18 |
19 | ```{r}
20 | path="/Users/ririzarr/myDocuments/teaching/HarvardX/tcgaMethylationSubset" # use your own path to downloaded data
21 | targets=read.delim(file.path (path,"targets.txt"),as.is=TRUE)
22 | table(targets$Tissue,targets$Status)
23 | ```
24 |
25 | For illustration we will read in the normal colon and lung samples.
26 |
27 | ```{r}
28 | index = which( targets$Status=="normal" & targets$Tissue%in%c("colon","lung") )
29 | targets = targets[index,]
30 | ```
31 |
32 | ```{r}
33 | dat = read.metharray.exp(base=path,targets = targets, verbose=TRUE)
34 | dat = preprocessIllumina(dat)
35 | dat = mapToGenome(dat)
36 | dat = ratioConvert(dat,type="Illumina")
37 | ```
38 |
39 | ```{r}
40 | library(doParallel)
41 | detectCores()
42 | registerDoParallel(cores = 4)
43 | ```
44 |
45 | ```{r}
46 | tissue =pData(dat)$Tissue
47 | X= model.matrix(~tissue)
48 | index = which(seqnames(dat)=="chr22")
49 | dat = dat[index,] ## for illustrative purposes
50 | res=bumphunter(dat,X,cutoff=0.1,B=1000)
51 | head(res$tab)
52 | ```
53 |
54 |
55 | ```{r,message=FALSE}
56 | library(rafalib)
57 | library(AnnotationHub)
58 | cgi = AnnotationHub()[["AH5086"]]
59 | ```
60 |
61 | ```{r}
62 | tab = res$tab[res$tab$fwer <= 0.05,]
63 | tab = makeGRangesFromDataFrame(tab,keep.extra.columns = TRUE)
64 |
65 | map=distanceToNearest(tab,cgi)
66 | d = mcols(map)$distance
67 | prop.table( table( cut(as.numeric(d),c(0,1,2000,5000,Inf),include.lowest=TRUE,right=FALSE) ))
68 |
69 | null = granges(dat)
70 | nulltab = makeGRangesFromDataFrame(null,keep.extra.columns = TRUE)
71 |
72 | nullmap=distanceToNearest(nulltab,cgi)
73 | nulld = mcols(nullmap)$distance
74 | prop.table( table( cut(nulld,c(0,1,2000,5000,Inf),include.lowest=TRUE,right=FALSE) ))
75 | ```
76 |
77 | ```{r}
78 | beta = getBeta(dat)
79 | cols = as.factor(pData(dat)$Tissue)
80 |
81 | tab = tab[order(-mcols(tab)$area)]
82 | tab = tab+3000 ##add 3000 to each side
83 | mypar(1,1)
84 | i=17
85 | dataIndex = which(granges(dat)%over%tab[i])
86 | cgiIndex = which(cgi%over%tab[i])
87 | thecgi = cgi[cgiIndex]
88 |
89 | pos = start(dat)[dataIndex]
90 | xlim=range(c(pos,start(thecgi),end(thecgi)) )
91 |
92 | y = beta[dataIndex,]
93 |
94 | matplot(pos,y,col=as.numeric(cols) , xlim=xlim, ylim=c(0,1),ylab="Methylation")
95 | apply(cbind(start(thecgi),end(thecgi)),1,function(x) segments(x[1],0,x[2],0,lwd=4,col=3))
96 |
97 | plot(pos,res$fitted[dataIndex],xlim=xlim,ylim=c(-0.4,0.4))
98 | abline(h=0)
99 | apply(cbind(start(thecgi),end(thecgi)),1,function(x) segments(x[1],0,x[2],0,lwd=4,col=3))
100 |
101 | ```
102 |
103 | ```{r}
104 | table(getIslandStatus(dat))
105 | ```
106 |
107 |
108 |
109 |
110 |
111 |
112 |
--------------------------------------------------------------------------------
/methyl/minfi.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Reading 450K idat files with the minfi package
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | In this unit we will demonstrate how to read idat files from the Illumina 450K DNA methylation array. We make use of the Bioconductor minfi package [cite 24478339].
12 |
13 | ```{r}
14 | # BiocManager::install(c("minfi","IlluminaHumanMethylation450kmanifest","IlluminaHumanMethylation450kanno.ilmn12.hg19"))
15 | library(minfi)
16 | ```
17 |
18 | The first step is to determine the basename of the idat files. Note that for each sample we have two files: one for the red channel and one for the green channel. These files are found here:
19 |
20 | ```{r}
21 | path <- "idats"
22 | list.files(path)
23 | ```
24 |
25 | Let's start by reading in the csv file, which contains clinical information. This has one row for each sample and one of the columns includes the "basenames" for the files.
26 |
27 | ```{r}
28 | targets<-read.csv("idats/targets.csv",as.is=TRUE)
29 | names(targets)
30 | targets$Basename
31 | ```
32 |
33 | To make this script work in any working directory we can edit that column to contain the absolute paths. Then we are ready to read in the raw data with `read.metharray`:
34 |
35 | ```{r}
36 | targets$Basename <- file.path(path,targets$Basename)
37 | rgset <- read.metharray(targets$Basename,verbose=TRUE)
38 | pData(rgset)<-as(targets, "DataFrame")
39 | ```
40 |
41 | We now have the raw data, red and green intensities which we have access to:
42 | ```{r}
43 | dim(getRed(rgset))
44 | dim(getGreen(rgset))
45 | ```
46 |
47 | If you are not interested in developing preprocessing algorithms then you can use the built-in preprocessing algorithm and go straight to an object that gives you access to methylation estimates:
48 |
49 | ```{r}
50 | mset <- preprocessIllumina(rgset)
51 | ```
52 |
53 | This performs the default preprocessing algorithm developed by Illumina. However, for this to be useful, we want to have the locations of each CpG, and to do that we need to map the CpGs to the genome. minfi keeps this information modular so that when the genome annotation gets updated, one can easily change the mapping.
54 | ```{r}
55 | mset <- mapToGenome(mset)
56 | ```
57 |
58 | Now we are ready to obtain the methylation values and CpG locations.
59 |
60 | ```{r}
61 | dim(getBeta(mset,type="Illumina")) ##the argument type="Illumina" gives us default procedure
62 | head(granges(mset))
63 | ```
64 |
65 | We can also use functions such as `getSex` and `getQC` on the mset object:
66 | ```{r}
67 | colData(mset)<-getSex(mset)
68 | plotSex(mset)
69 | plot(as.matrix(getQC(mset)))
70 | ```
71 |
--------------------------------------------------------------------------------
/ml/conditional_expectation.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Conditional probabilities and expectations
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Conditional Probabilities and Expectations
12 |
13 | Prediction problems can be divided into categorical and continuous outcomes. However, many of the algorithms can be applied to both due to the connection between _conditional probabilities_ and _conditional expectations_.
14 |
15 | For categorical data, for example binary outcomes, if we know the probability of $Y$ being any of the possible outcomes $k$ given a set of predictors $X=(X_1,\dots,X_p)^\top$,
16 |
17 | $$
18 | f_k(x) = \mbox{Pr}(Y=k \mid X=x)
19 | $$
20 |
21 | we can optimize our predictions. Specifically, for any $x$ we predict the $k$ that has the largest probability $f_k(x)$.
22 |
23 | To simplify the exposition below, we will consider the case of binary data. You can think of the probability $\mbox{Pr}(Y=1 \mid X=x)$ as the proportion of 1s in the stratum of the population for which $X=x$. Given that the expectation is the average of all $Y$ values, in this case the expectation is equivalent to the probability: $f(x) \equiv \mbox{E}(Y \mid X=x)=\mbox{Pr}(Y=1 \mid X=x)$. We therefore use only the expectation in the descriptions below as it is more general.
24 |
25 | In general, the expected value has an attractive mathematical property, which is that it minimizes the expected squared distance between the predictor $\hat{Y}$ and $Y$:
26 |
27 | $$
28 | \mbox{E}\{ (\hat{Y} - Y)^2 \mid X=x \}
29 | $$
30 |
31 |
32 | #### Regression in the context of prediction
33 |
34 |
35 |
36 | We use the son and father height example to illustrate how regression can be interpreted as a machine learning technique. In our example, we are trying to predict the son's height $Y$ based on the father's $X$. Here we have only one predictor. Now if we were asked to predict the height of a randomly selected son, we would go with the average height:
37 |
38 |
39 | ```{r height_hist,message=FALSE,fig.cap="Histogram of son heights."}
40 | library(rafalib)
41 | mypar(1,1)
42 | data(father.son,package="UsingR")
43 | x=round(father.son$fheight) ##round to nearest inch
44 | y=round(father.son$sheight)
45 | hist(y,breaks=seq(min(y),max(y)))
46 | abline(v=mean(y),col="red",lwd=2)
47 | ```
48 |
49 | In this case, we can also approximate the distribution of $Y$ as normal, which implies the mean maximizes the probability density.
50 |
51 | Let's imagine that we are given more information. We are told that the father of this randomly selected son has a height of 71 inches (1.25 SDs taller than the average). What is our prediction now?
52 |
53 |
54 | ```{r conditional_distribution, fig.cap="Son versus father height (left) with the red lines denoting the stratum defined by conditioning on fathers being 71 inches tall. Conditional distribution: son height distribution of stratum defined by 71 inch fathers.",fig.width=10.5,fig.height=5.25}
55 | mypar(1,2)
56 | plot(x,y,xlab="Father's height in inches",ylab="Son's height in inches",
57 | main=paste("correlation =",signif(cor(x,y),2)))
58 | abline(v=c(-0.35,0.35)+71,col="red")
59 | hist(y[x==71],xlab="Heights",nc=8,main="",xlim=range(y))
60 | ```
61 |
62 |
63 | The best guess is still the expectation, but our stratum has changed from all the data to only the $Y$ with $X=71$. So we can stratify and take the average, which is the conditional expectation. Our prediction for any $x$ is therefore:
64 |
65 | $$
66 | f(x) = E(Y \mid X=x)
67 | $$
68 |
69 | It turns out that because this data is approximated by a bivariate normal distribution, using calculus, we can show that:
70 |
71 | $$
72 | f(x) = \mu_Y + \rho \frac{\sigma_Y}{\sigma_X} (x-\mu_X)
73 | $$
74 |
75 | and if we estimate these five parameters from the sample, we get the regression line:
76 |
77 | ```{r regression, fig.cap="Son versus father height showing predicted heights based on regression line (left). Conditional distribution with vertical line representing regression prediction.",fig.width=10.5,fig.height=5.25}
78 | mypar(1,2)
79 | plot(x,y,xlab="Father's height in inches",ylab="Son's height in inches",
80 | main=paste("correlation =",signif(cor(x,y),2)))
81 | abline(v=c(-0.35,0.35)+71,col="red")
82 |
83 | fit <- lm(y~x)
84 | abline(fit,col=1)
85 |
86 | hist(y[x==71],xlab="Heights",nc=8,main="",xlim=range(y))
87 | abline(v = fit$coef[1] + fit$coef[2]*71, col=1)
88 | ```
89 |
90 | In this particular case, the regression line provides an optimal prediction function for $Y$. But this is not generally true because, in the typical machine learning problems, the optimal $f(x)$ is rarely a simple line.
91 |
92 |
--------------------------------------------------------------------------------
/modeling/bayes-gif.R:
--------------------------------------------------------------------------------
1 | ##Animate testing a population one individual at a time, sorting each
2 | ##individual into a "Tested Positive" or "Tested Negative" panel.
3 | ##If you have ImageMagick installed on your computer,
4 | ##you can create an animated gif with code
5 | ##below. Note that the computation will make several
6 | ##gifs so it might take some time to compute.
7 | ##Make sure to pick a `filename` that does not already exist in the working directory.
8 |
9 | set.seed(3)
10 | prev <- 1/20 ##probability that an individual is red
11 | acc <- 0.90 ##probability that a single test gives the correct result
12 | ##For the animation we use 20 x 80
13 | N <- 20; M <- 80
14 | x <- rbinom(N*M, 1, prob=prev) ##`prob` spelled out instead of relying on partial matching
15 | cols <- c("grey","red")
16 | people <- expand.grid(1:M,N:1)
17 | people2 <- expand.grid(1:(M/2),N:1)
18 |
19 | cols1 <- cols[x+1]
20 | ##colors of those who tested positive (cols2) and negative (cols3), filled in as we go
21 | cols2 <- rep(NA,length(cols1)); count2 <- 1
22 | cols3 <- rep(NA,length(cols1)); count3 <- 1
23 |
24 | ##a draw of 1..cutoff means the test is correct; computed once and not named `min`
25 | cutoff <- round(100*acc)
26 |
27 | library(rafalib)
28 | library(animation)
29 | filename <- 'bayes.gif'
30 | saveGIF({
31 |   i <- 1
32 |   ##stop as soon as either result panel (N*M/2 slots) is full
33 |   while (count3 <= N*M/2 && count2 <= N*M/2) {
34 |     test <- sample(100,1)
35 |     mypar()
36 |     layout(matrix(c(1,2,1,3),2,2))
37 |     plot(people,col=cols1,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",
38 |          main=paste0("Population: ",round(mean(x)*100),"% are red"))
39 |     ##mark the current test below the plot: "X" if it erred, "O" if it was correct
40 |     if (test > cutoff) {
41 |       axis(side=1,M/2,"X",col="red",tick=FALSE,cex.axis=3,line=1.5)
42 |     } else {
43 |       axis(side=1,M/2,"O",col="black",tick=FALSE,cex.axis=2,line=1.5)
44 |     }
45 |     points(people[i,],pch=1,cex=1.5) ##circle the individual being tested
46 |     if (all(is.na(cols2))) {
47 |       plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Positive")
48 |     } else {
49 |       plot(people2,col=cols2,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",
50 |            main=paste0("Tested Positive: ",round(mean(cols2=="red",na.rm=TRUE)*100),"% are red"))
51 |     }
52 |     if (all(is.na(cols3))) {
53 |       plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Negative")
54 |     } else {
55 |       plot(people2,col=cols3,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",
56 |            main=paste0("Tested Negative: ",round(mean(cols3=="red",na.rm=TRUE)*100,1),"% are red"))
57 |     }
58 |     ##a correct test reports the true status; an incorrect one reports the opposite
59 |     positive <- if (x[i]==1) test <= cutoff else test > cutoff
60 |     if (positive) {
61 |       cols2[count2] <- cols1[i]; count2 <- count2+1
62 |     } else {
63 |       cols3[count3] <- cols1[i]; count3 <- count3+1
64 |     }
65 |     i <- i+1
66 |   }
67 | }, filename, interval = .1, ani.width = 800, ani.height = 500)
68 |
--------------------------------------------------------------------------------
/renaming_map.md:
--------------------------------------------------------------------------------
1 | # renaming map
2 |
3 | from | to
4 | --- | ---
5 | 1 | intro
6 | 1 | inference
7 | 1 | eda
8 | 1 | robust
9 | 2 | matrixalg
10 | 2 | linear
11 | 3 | advinference
12 | 3 | modeling
13 | 3 | highdim
14 | 3 | ml
15 | 3 | batch
16 | 4 | bioc
17 | 5 | rnaseq
18 | 6 | variants
19 | 7 | chipseq
20 | 8 | methyl
21 |
--------------------------------------------------------------------------------
/rnaseq/airway_sample_table.csv:
--------------------------------------------------------------------------------
1 | "","SampleName","cell","dex","albut","Run","avgLength","Experiment","Sample","BioSample"
2 | "SRR1039508","GSM1275862","N61311","untrt","untrt","SRR1039508",126,"SRX384345","SRS508568","SAMN02422669"
3 | "SRR1039509","GSM1275863","N61311","trt","untrt","SRR1039509",126,"SRX384346","SRS508567","SAMN02422675"
4 | "SRR1039512","GSM1275866","N052611","untrt","untrt","SRR1039512",126,"SRX384349","SRS508571","SAMN02422678"
5 | "SRR1039513","GSM1275867","N052611","trt","untrt","SRR1039513",87,"SRX384350","SRS508572","SAMN02422670"
6 | "SRR1039516","GSM1275870","N080611","untrt","untrt","SRR1039516",120,"SRX384353","SRS508575","SAMN02422682"
7 | "SRR1039517","GSM1275871","N080611","trt","untrt","SRR1039517",126,"SRX384354","SRS508576","SAMN02422673"
8 | "SRR1039520","GSM1275874","N061011","untrt","untrt","SRR1039520",101,"SRX384357","SRS508579","SAMN02422683"
9 | "SRR1039521","GSM1275875","N061011","trt","untrt","SRR1039521",98,"SRX384358","SRS508580","SAMN02422677"
10 |
--------------------------------------------------------------------------------
/rnaseq/fastq.md:
--------------------------------------------------------------------------------
1 | # Fastq files
2 |
3 | ## Links for this experiment
4 |
5 | Study information at the Sequence Read Archive:
6 |
7 | http://www.ncbi.nlm.nih.gov/Traces/sra/?study=SRP033351
8 |
9 | Himes et al paper at PubMed Central:
10 |
11 | http://www.ncbi.nlm.nih.gov/pubmed/24926665
12 |
13 | Example sample table stored in our course repo on github:
14 |
15 | https://github.com/genomicsclass/labs/blob/master/course5/airway_sample_table.csv
16 |
17 | Details on creating such a sample table from SRA and GEO:
18 |
19 | http://www.bioconductor.org/packages/release/data/experiment/vignettes/airway/inst/doc/airway.html
20 |
21 | The European Nucleotide Archive (EMBL-EBI):
22 |
23 | http://www.ebi.ac.uk/ena
24 |
25 | The Sequence Read Archive (NCBI):
26 |
27 | http://www.ncbi.nlm.nih.gov/sra/
28 |
29 | ## Fastq file commands
30 |
31 | Downloading from the ENA:
32 |
33 | ```
34 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR103/008/SRR1039508/SRR1039508_1.fastq.gz
35 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR103/008/SRR1039508/SRR1039508_2.fastq.gz
36 | ```
37 |
38 | Alias for ls:
39 |
40 | ```
41 | alias ll='ls -lGh'
42 | ```
43 |
44 | Unzipping:
45 |
46 | ```
47 | gunzip *.fastq.gz
48 | ```
49 |
50 | Looking at the FASTQ files:
51 |
52 | ```
53 | less SRR1039508_1.fastq
54 | wc -l SRR1039508_1.fastq
55 | ```
56 |
57 | Quality control with [fastqc](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
58 |
59 | ```
60 | fastqc --noextract SRR1039508_1.fastq SRR1039508_2.fastq
61 | ```
62 |
63 |
--------------------------------------------------------------------------------
/rnaseq/genome_align_STAR.md:
--------------------------------------------------------------------------------
1 | # STAR commands
2 |
3 | The STAR homepage:
4 |
5 | https://github.com/alexdobin/STAR
6 |
7 | The STAR paper:
8 |
9 | http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/
10 |
11 | The STAR manual:
12 |
13 | https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf
14 |
15 | Downloading genome FASTA and GTF files from ENSEMBL:
16 |
17 | http://ensembl.org
18 |
19 | http://ensembl.org/info/data/ftp/index.html
20 |
21 | Generating the genome:
22 |
23 | Note that `sjdbOverhang` is used for constructing the splice junction database. It should be set to (read length - 1), and according to the manual a general value of 100 will work as well.
24 |
25 | For this limited demonstration, I am only going to align to the genes on chromosome 1, so I subset the GTF file:
26 |
27 | ```
28 | grep -P '^1\t' Homo_sapiens.GRCh38.79.gtf > Homo_sapiens.GRCh38.79.chrom1.gtf
29 | ```
30 |
31 | We then moved the files into subdirectories, and created one for the STAR genome index:
32 |
33 | ```
34 | mkdir gtf
35 | mkdir genome
36 | mv *.gtf gtf
37 | mv *.fa genome
38 | mkdir GRCh38.79.chrom1
39 | ```
40 |
41 | The STAR command to generate the genome index:
42 |
43 | ```
44 | STAR --runMode genomeGenerate \
45 | --genomeDir GRCh38.79.chrom1 \
46 | --genomeFastaFiles genome/Homo_sapiens.GRCh38.dna.chromosome.1.fa \
47 | --sjdbGTFfile gtf/Homo_sapiens.GRCh38.79.chrom1.gtf \
48 | --sjdbOverhang 62
49 | ```
50 |
51 | Mapping the reads:
52 |
53 | ```
54 | STAR --runThreadN 12 \
55 | --genomeDir GRCh38.79.chrom1 \
56 | --readFilesIn fastq/SRR1039508_1.fastq fastq/SRR1039508_2.fastq
57 | ```
58 |
--------------------------------------------------------------------------------
/rnaseq/r_bioc_links.md:
--------------------------------------------------------------------------------
1 | # R and Bioconductor links
2 |
3 | * [Central R Archive Network (CRAN)](http://cran.rstudio.com/)
4 | * [RStudio](http://www.rstudio.com/)
5 | * [Bioconductor](http://bioconductor.org/install)
6 |
7 | Once you have installed R and the `BiocManager` package, running the following lines in your console will install Bioconductor:
8 |
9 | ```
10 | BiocManager::install()
11 | ```
12 |
13 | Make sure to hit `[a]` to update all packages. This is important so that your answers will match the answers accepted by the grading bot.
14 |
15 | To install specific packages from Bioconductor use, for example:
16 |
17 | ```
18 | BiocManager::install(c("pasilla", "DEXSeq"))
19 | ```
20 |
21 | We will provide a list of all packages we will use [here](rnaseq_pkgs.R).
22 |
23 | If you want to see what version of Bioconductor you are using and whether your packages are up to date:
24 |
25 | ```
26 | BiocManager::version()
27 | BiocManager::valid()
28 | ```
29 |
--------------------------------------------------------------------------------
/rnaseq/rnaseq_exon_usage.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: RNA-seq differential exon usage
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | The [DEXSeq](http://bioconductor.org/packages/release/bioc/html/DEXSeq.html) package offers differential testing of exon usage within each gene. Here we will explore the R code used in a *DEXSeq* analysis. We omit the python calls for preparing the annotation and count tables, but these can be found in the vignette at the above link. The python calls are generally along the lines of:
12 |
13 | ```
14 | python dexseq_prepare_annotation.py gtffile.gtf dexseq.gff
15 | python dexseq_count.py dexseq.gff sample1.sam sample1.txt
16 | ```
17 |
18 | Once we have repeated the `dexseq_count` script for each sample, we can read the data into R using the code chunks below. As we are working with pre-prepared data, we first point to these files which live within the *pasilla* package.
19 |
20 | The *pasilla* package contains counts from an experiment by [Brooks et al](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3032923/).
21 |
22 | We will run DEXSeq on a subset of the genes, for demonstration purposes.
23 |
24 | ```{r}
25 | library("pasilla")
26 | inDir = system.file("extdata", package="pasilla", mustWork=TRUE)
27 | countFiles = list.files(inDir, pattern="fb.txt$", full.names=TRUE)
28 | flattenedFile = list.files(inDir, pattern="gff$", full.names=TRUE)
29 | genesForSubset = read.table(file.path(inDir, "geneIDsinsubset.txt"),
30 | stringsAsFactors=FALSE)[[1]]
31 | ```
32 |
33 | As in *DESeq2* we use a `sampleTable` to define the samples:
34 |
35 | ```{r}
36 | sampleTable = data.frame(
37 | row.names = c( "treated1", "treated2", "treated3",
38 | "untreated1", "untreated2", "untreated3", "untreated4" ),
39 | condition = c("knockdown", "knockdown", "knockdown",
40 | "control", "control", "control", "control" ),
41 | libType = c( "single-end", "paired-end", "paired-end",
42 | "single-end", "single-end", "paired-end", "paired-end" ) )
43 | sampleTable
44 | ```
45 |
46 | We now read the data into a `DEXSeqDataSet` object:
47 |
48 | ```{r message=FALSE}
49 | library("DEXSeq")
50 | dxd = DEXSeqDataSetFromHTSeq(
51 | countFiles,
52 | sampleData=sampleTable,
53 | design= ~ sample + exon + condition:exon,
54 | flattenedfile=flattenedFile )
55 | ```
56 |
57 | Subset the genes, for demonstration purposes:
58 |
59 | ```{r}
60 | dxd = dxd[geneIDs( dxd ) %in% genesForSubset,]
61 | ```
62 |
63 | Now we run the estimation and testing functions:
64 |
65 | ```{r}
66 | dxd = estimateSizeFactors( dxd )
67 | dxd = estimateDispersions( dxd )
68 | dxd = testForDEU( dxd )
69 | dxd = estimateExonFoldChanges( dxd, fitExpToVar="condition")
70 | ```
71 |
72 | The following code extracts a results table, makes an MA-plot, and draws the expression levels over the exons to highlight differential exon usage:
73 |
74 | ```{r}
75 | dxr = DEXSeqResults( dxd )
76 | plotMA( dxr, cex=0.8 )
77 | plotDEXSeq( dxr, "FBgn0010909", legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 )
78 | ```
79 |
80 | Again, drawing the expression levels, now showing the annotated transcripts below:
81 |
82 | ```{r}
83 | plotDEXSeq( dxr, "FBgn0010909", displayTranscripts=TRUE, legend=TRUE,
84 | cex.axis=1.2, cex=1.3, lwd=2 )
85 | ```
86 |
87 | For more details on the *DEXSeq* software, see the vignette and the paper, which is linked from the vignette page:
88 |
89 | ```{r eval=FALSE}
90 | browseVignettes("DEXSeq")
91 | ```
92 |
93 | We conclude by adding the session information:
94 |
95 | ```{r}
96 | sessionInfo()
97 | ```
98 |
99 |
--------------------------------------------------------------------------------
/rnaseq/rnaseq_isoform_cummerbund.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Exploring Cufflinks output with cummeRbund
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 |
12 | Here we show the exploratory plots offered by the [cummeRbund](http://www.bioconductor.org/packages/release/bioc/html/cummeRbund.html) package. These plots require loading a directory that contains the results of a [Cufflinks](http://cole-trapnell-lab.github.io/cufflinks/) analysis. Follow the vignette in the above link in order to perform a Cufflinks gene- and isoform-level analysis. From the vignette:
13 |
14 | > CummeRbund begins by re-organizing output files of a cuffdiff analysis, and storing these data in a local SQLite database. CummeRbund indexes the data to speed up access to specific feature data (genes, isoforms, TSS, CDS, etc.), and preserves the various relationships between these features.
15 |
16 | ```{r message=FALSE}
17 | library(cummeRbund)
18 | myDir <- system.file("extdata", package="cummeRbund")
19 | gtfFile <- system.file("extdata/chr1_snippet.gtf",package="cummeRbund")
20 | ```
21 |
22 | Read in the prepared Cufflinks files from the directory:
23 |
24 | ```{r warning=FALSE}
25 | cuff <- readCufflinks(dir=myDir,gtfFile=gtfFile,genome="hg19",rebuild=TRUE)
26 | ```
27 |
28 | Boxplots of expression (FPKM) at the gene and isoform level:
29 |
30 | ```{r}
31 | csBoxplot(genes(cuff))
32 | csBoxplot(genes(cuff),replicates=TRUE)
33 | csBoxplot(isoforms(cuff),replicates=TRUE)
34 | ```
35 |
36 | Scatterplot matrix of gene and isoform level expression:
37 |
38 | ```{r}
39 | csScatterMatrix(genes(cuff))
40 | csScatterMatrix(isoforms(cuff))
41 | ```
42 |
43 | Sample dendrograms using Jensen-Shannon distances:
44 |
45 | ```{r}
46 | csDendro(genes(cuff),replicates=TRUE)
47 | csDendro(isoforms(cuff),replicates=TRUE)
48 | ```
49 |
50 | MA-plot comparing two conditions:
51 |
52 | ```{r}
53 | MAplot(genes(cuff),"hESC","Fibroblasts")
54 | MAplot(isoforms(cuff),"hESC","Fibroblasts")
55 | ```
56 |
57 | A "volcano plot" matrix. Each volcano plot is the -log10(p-value) over the log fold change.
58 |
59 | ```{r}
60 | csVolcanoMatrix(genes(cuff))
61 | csVolcanoMatrix(isoforms(cuff))
62 | ```
63 |
64 | For all of these functions, see the help pages in the *cummeRbund* package for more details, and check the vignette for a sample workflow. The [Cufflinks homepage](http://cole-trapnell-lab.github.io/cufflinks/) has details about running the pipeline upstream of producing these figures.
65 |
66 | ```{r eval=FALSE}
67 | browseVignettes("cummeRbund")
68 | ```
69 |
--------------------------------------------------------------------------------
/rnaseq/rnaseq_pkgs.R:
--------------------------------------------------------------------------------
1 | # Installs the packages used in the RNA-seq labs: two CRAN packages,
2 | # rafalib from GitHub, and a set of Bioconductor packages.
3 | # Every step downloads and installs software, so network access is needed.
4 |
5 | # 2 CRAN packages
6 | cranpkgs <- c("ggplot2","pheatmap")
7 | install.packages(cranpkgs)
8 |
9 | # rafalib from github (not strictly necessary, but useful for plots)
10 | install.packages("devtools")
11 | library(devtools)
12 | install_github("ririzarr/rafalib")
13 |
14 | # BiocManager::install() below needs the BiocManager package itself;
15 | # install it from CRAN first when it is not available yet, so that the
16 | # script also works on a fresh R library
17 | if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
18 |
19 | # the rest are Bioconductor packages
20 | biocpkgs <- c("Rsamtools",
21 | "GenomicFeatures",
22 | "GenomicAlignments",
23 | "Rsubread",
24 | "airway",
25 | "pasilla",
26 | "DESeq2",
27 | "DEXSeq",
28 | "vsn",
29 | "sva",
30 | "org.Hs.eg.db",
31 | "cummeRbund",
32 | "pasillaBamSubset",
33 | "TxDb.Dmelanogaster.UCSC.dm3.ensGene")
34 | BiocManager::install(biocpkgs)
35 | # note that Rsubread does not have a binary for Windows. This package is not required for class.
36 |
--------------------------------------------------------------------------------
/rnaseq/storage/RNAseq_quiz.R:
--------------------------------------------------------------------------------
1 | link <- "http://bowtie-bio.sourceforge.net/recount/ExpressionSets/wang_eset.RData"
2 | if (!file.exists("wang_eset.RData")) download.file(link, "wang_eset.RData")
3 | load("wang_eset.RData")
4 | # the file is downloaded only if it is not already present; wang.eset used below presumably comes from it
5 | library(Biobase)
6 | library(GenomicRanges)
7 | # the SimpleList part is only necessary for Bioc <= 2.13
8 | se <- SummarizedExperiment(SimpleList(counts = exprs(wang.eset)))
9 | colData(se) <- DataFrame(pData(wang.eset))
10 | # tabulate the samples by cell.type before subsetting
11 | table(colData(se)$cell.type)
12 | # keep only the samples whose cell.type is one of these six tissues
13 | tissues <- c("cerebellum","breast","colon","heart","liver","skeletal.muscle")
14 | se <- se[,colData(se)$cell.type %in% tissues]
15 | # tabulate again after subsetting
16 | table(colData(se)$cell.type)
17 | # two-level factor: cerebellum samples vs. all others; "not" is given first so it is the first level
18 | test <- colData(se)$cell.type == "cerebellum"
19 | lvls <- c("not","cerebellum")
20 | condition <- factor(ifelse(test,"cerebellum","not"), levels=lvls)
21 | colData(se)$condition <- condition
22 | # run DESeq2 with condition as the only term in the design
23 | library(DESeq2)
24 | dds <- DESeqDataSet( se, design = ~ condition )
25 | dds <- DESeq( dds )
26 | res <- results( dds )
27 | # the ten rows of the results table with the smallest p-values
28 | res[order(res$pvalue)[1:10],]
29 | # reference output recorded under two Bioconductor releases (five rows each)
30 | # Bioc 2.13 baseMean log2FoldChange lfcSE stat pvalue
31 | # ENSG00000143858 1727.5839 7.937566 0.2794747 28.40173 1.924967e-177
32 | # ENSG00000176749 733.8875 6.352004 0.2298648 27.63366 4.387200e-168
33 | # ENSG00000187730 2766.0323 10.587430 0.3875275 27.32046 2.423703e-164
34 | # ENSG00000161509 1229.6361 8.146916 0.3313924 24.58390 1.878103e-133
35 | # ENSG00000170616 1288.3151 9.776143 0.4349104 22.47852 6.734571e-112
36 |
37 | # Bioc 2.14 baseMean log2FoldChange lfcSE stat pvalue
38 | # ENSG00000143858 1727.5839 8.003262 0.3027984 26.43099 6.035903e-154
39 | # ENSG00000176749 733.8875 6.391632 0.2502764 25.53830 7.407303e-144
40 | # ENSG00000187730 2766.0323 10.767926 0.4252847 25.31933 1.956629e-141
41 | # ENSG00000161509 1229.6361 8.244928 0.3546406 23.24868 1.466793e-119
42 | # ENSG00000170616 1288.3151 9.988089 0.4793187 20.83809 1.954636e-96
43 |
44 | # row names of the ten results with the smallest p-values; top[1] has the smallest
45 | top <- rownames(res)[order(res$pvalue)[1:10]]
46 | top[1]
47 | # log10(normalized count + 1) of top[1], one jittered strip per condition level
48 | stripchart(log10(counts(dds,normalized=TRUE)[top[1],] + 1) ~ se$condition,
49 | vertical=TRUE, method="jitter")
50 |
51 | # average normalized count of top[2] (second-smallest p-value) within each condition level
52 | tapply(counts(dds,normalized=TRUE)[top[2],],
53 | colData(dds)$condition,
54 | mean)
55 | # annotation lookups in org.Hs.eg.db, using the row names in top as ENSEMBL keys
56 | library(org.Hs.eg.db)
57 | keytypes(org.Hs.eg.db)
58 | columns(org.Hs.eg.db)
59 |
60 | # gene symbols and gene names for all ten entries of top
61 | map <- select(org.Hs.eg.db, keys=top,
62 | columns=c("SYMBOL", "GENENAME"), keytype="ENSEMBL")
63 | map
64 | # GO annotation for top[3] only; this overwrites the map object created above
65 | map <- select(org.Hs.eg.db, keys=top[3],
66 | columns=c("GO"), keytype="ENSEMBL")
67 |
68 | # The following gives the GO terms:
69 | map$GO
70 |
71 | # this gives the meaning of these:
72 | library(GO.db)
73 | as.list(GOTERM[map$GO])
74 |
--------------------------------------------------------------------------------
/rnaseq/storage/cufflinks.txt:
--------------------------------------------------------------------------------
1 | We will work with the Hammer et al dataset, as prepared by the ReCount
2 | website.
3 |
4 | http://bowtie-bio.sourceforge.net/recount/
5 |
6 | The Hammer et al paper:
7 |
8 | http://www.ncbi.nlm.nih.gov/pubmed?term=20452967
9 |
10 | The GEO page:
11 |
12 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE20895
13 |
14 | The sample I will align:
15 |
16 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM539553
17 |
18 | which points to the SRA:
19 |
20 | ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX%2FSRX020%2FSRX020088/SRR042499/
21 |
22 | fastq-dump SRR042499.sra
23 |
24 | The genome was downloaded from Illumina iGenomes
25 |
26 | http://cufflinks.cbcb.umd.edu/igenomes.html
27 |
28 | The tophat call to align the reads:
29 |
30 | tophat2 -o tophat_out -p 10 /path/to/Rattus_norvegicus/Ensembl/RGSC3.4/Sequence/Bowtie2Index/genome SRR042499.fastq
31 |
32 | cufflinks -o cufflinks -p 10 --GTF-guide /path/to/Rattus_norvegicus/Ensembl/RGSC3.4/Annotation/Genes/genes.gtf \
33 | tophat_out/accepted_hits.bam
34 |
35 | grep -v 'FPKM "0.0000000000"' transcripts.gtf | less
36 |
37 | For visualizing:
38 |
39 | ftp://ftp.ensembl.org/pub/release-69/fasta/rattus_norvegicus/dna/Rattus_norvegicus.RGSC3.4.69.dna.chromosome.1.fa.gz
40 | ftp://ftp.ensembl.org/pub/release-69/gtf/rattus_norvegicus/Rattus_norvegicus.RGSC3.4.69.gtf.gz
41 |
--------------------------------------------------------------------------------
/rnaseq/trancsript_align_RSEM.md:
--------------------------------------------------------------------------------
1 | # RSEM transcript alignment
2 |
3 | RSEM homepage:
4 |
5 | http://deweylab.biostat.wisc.edu/rsem/
6 |
7 | RSEM paper:
8 |
9 | http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3163565/
10 |
11 | prepare-reference help:
12 |
13 | http://deweylab.biostat.wisc.edu/rsem/rsem-prepare-reference.html
14 |
15 | calculate-expression help:
16 |
17 | http://deweylab.biostat.wisc.edu/rsem/rsem-calculate-expression.html
18 |
19 | RSEM expects a GTF file with only exons, which are each assigned to a `transcript_id`.
20 |
21 | Note that we only align to chromosome 1 for demonstration purposes.
22 |
23 | ```
24 | awk '$3 == "exon"' gtf/Homo_sapiens.GRCh38.79.chrom1.gtf > gtf/Homo_sapiens.GRCh38.79.chrom1.exons.gtf
25 | ```
26 |
27 | RSEM will then prepare a reference transcriptome against which to align reads.
28 |
29 | ```
30 | mkdir rsemGenome
31 | rsem-prepare-reference --gtf gtf/Homo_sapiens.GRCh38.79.chrom1.exons.gtf genome/Homo_sapiens.GRCh38.dna.chromosome.1.fa rsemGenome/GRCh38.79.chrom1
32 | ```
33 |
34 | ```
35 | rsem-calculate-expression -p 12 --paired-end fastq/SRR1039508_1.fastq fastq/SRR1039508_2.fastq rsemGenome/GRCh38.79.chrom1 SRR1039508
36 | ```
37 |
38 |
--------------------------------------------------------------------------------
/robust/ranktest.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Rank tests
4 | ---
5 |
6 | ```{r options, echo=FALSE}
7 | library(knitr)
8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-"))
9 | ```
10 |
11 | ## Wilcoxon Rank Sum Test
12 |
13 | We learned how the sample mean and SD are susceptible to outliers. The
14 | t-test is based on these measures and is susceptible as well. The
15 | Wilcoxon rank test (equivalent to the Mann-Whitney test) provides an
16 | alternative. In the code below, we perform a t-test on data for which
17 | the null is true. However, we change two observations in the first
18 | sample by mistake, entering two different incorrect values. Here
19 | we see that the t-test results in a small p-value, while the Wilcoxon
20 | test does not:
21 |
22 | ```{r}
23 | set.seed(779) ##779 picked for illustration purposes
24 | N=25
25 | x<- rnorm(N,0,1)
26 | y<- rnorm(N,0,1)
27 | ```
28 |
29 | Create outliers:
30 |
31 | ```{r}
32 | x[1] <- 5
33 | x[2] <- 7
34 | cat("t-test pval:",t.test(x,y)$p.value)
35 | cat("Wilcox test pval:",wilcox.test(x,y)$p.value)
36 | ```
37 |
38 | The basic idea is to 1) combine all the data, 2) turn the values into ranks, 3) separate them back into their groups, and 4) compute the sum or average rank and perform a test.
39 |
40 | ```{r rank-test-illustration, fig.cap="Data from two populations with two outliers. The left plot shows the original data and the right plot shows their ranks. The numbers are the w values ",fig.width=10.5,fig.height=5.25}
41 | library(rafalib)
42 | mypar(1,2)
43 |
44 | stripchart(list(x,y),vertical=TRUE,ylim=c(-7,7),ylab="Observations",pch=21,bg=1)
45 | abline(h=0)
46 |
47 | xrank<-rank(c(x,y))[seq(along=x)]
48 | yrank<-rank(c(x,y))[-seq(along=x)]
49 |
50 | stripchart(list(xrank,yrank),vertical=TRUE,ylab="Ranks",pch=21,bg=1,cex=1.25)
51 |
52 | ws <- sapply(x,function(z) rank(c(z,y))[1]-1)
53 | text( rep(1.05,length(ws)), xrank, ws, cex=0.8)
54 | W <-sum(ws)
55 | ```
56 |
57 | `W` is the sum of the ranks for the first group relative to the second
58 | group. We can compute an exact p-value for $W$ based on
59 | combinatorics. We can also use the CLT since
60 | statistical theory tells us that this `W` is approximated by the
61 | normal distribution. We can construct a z-score as follows:
62 |
63 | ```{r}
64 | n1<-length(x);n2<-length(y)
65 | Z <- (mean(ws)-n2/2)/ sqrt(n2*(n1+n2+1)/12/n1)
66 | print(Z)
67 | ```
68 |
69 | Here the `Z` is not large enough to give us a p-value less
70 | than 0.05. These are part of the calculations performed by the R function
71 | `wilcox.test`.
72 |
73 |
--------------------------------------------------------------------------------
/variants/SNP_quiz.R:
--------------------------------------------------------------------------------
1 | library(VariantTools)
2 | library(LungCancerLines)
3 | library(BSgenome.Hsapiens.UCSC.hg19)
4 | genome <- gmapR::TP53Genome()
5 | bams <- LungCancerLines::LungCancerBamFiles()
6 | bam <- bams$H1993
7 | # call variants on the H1993 BAM with high_base_quality = 23; report how many calls and their mean raw.count
8 | tally.param <- TallyVariantsParam(genome,
9 | high_base_quality = 23L)
10 | call23 <- callVariants(bam, tally.param)
11 | length(call23)
12 | mean(mcols(call23)$raw.count)
13 |
14 | # what's the average raw count with higher filter on quality
15 | tally.param <- TallyVariantsParam(genome,
16 | high_base_quality = 32L)
17 | call32 <- callVariants(bam, tally.param)
18 | length(call32)
19 | mean(mcols(call32)$raw.count)
20 |
21 |
22 |
23 | # second part: locate variants from the example chr22 VCF relative to hg19 known genes
24 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
25 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
26 | library(VariantAnnotation)
27 | fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
28 | vcf <- readVcf(fl, genome="hg19")
29 | seqlevels(vcf) <- paste0("chr", seqlevels(vcf))
30 | # the "chr" prefix added above presumably makes the VCF seqlevels match the names used by txdb -- confirm
31 |
32 | # what is the gene whose promoter overlaps the first variant in vcf?
33 | loc <- locateVariants(vcf, txdb, PromoterVariants())
34 | loc
35 |
36 | # ?promoters tells us that the promoter is 2000 bp upstream from the gene
37 | # start and 200 bp downstream. note that gene start
38 | # NOTE(review): the sentence above was left unfinished; presumably it was about the gene start being strand-dependent -- confirm
39 | # how far is this variant from this gene?
40 | rowData(vcf[197])
41 | g <- genes(txdb)
42 | idx <- as.character(mcols(g)$gene_id) == "79174"
43 | g[idx]
44 |
45 | # this gives us the distance to the gene start
46 | distance(rowData(vcf[197]), flank(g[idx], 0))
47 |
48 | # it gives the same as this...
49 | distance(rowData(vcf[197]), g[idx])
50 |
51 | # however, theoretically if the variant was less
52 | # than 200 bp downstream from the TSS, and the
53 | # gene was very short, the variant could end up
54 | # closer to the gene end. so the first line of code
55 | # is safer.
--------------------------------------------------------------------------------