├── .gitignore ├── .md ├── LICENSE ├── README.md ├── Rscripts ├── cheung.R ├── cheungSubset.R ├── cltmovie.R ├── dilution.R ├── gsea.R ├── limma_quiz.R ├── make_two_tables_for_class.R ├── microarraydata-eda-lab.R └── read_tcga_meth.R ├── advinference ├── eda_for_highthroughput.Rmd ├── inference_for_highthroughput.Rmd ├── intro_to_highthroughput_data.Rmd ├── multiple_testing.Rmd ├── quick_Bioc_intro.Rmd └── storage │ ├── confounding.Rmd │ ├── crossvalidation.Rmd │ ├── distance_lecture.Rmd │ ├── heatmaps.Rmd │ ├── hierarchical_modeling.R │ ├── justsvd_duplicate_content.txt │ ├── modeling.Rmd │ ├── multtest.Rmd │ ├── pca_svd.Rmd │ ├── prediction.Rmd │ ├── sva.Rmd │ ├── svacombat.Rmd │ └── transformations.Rmd ├── batch ├── adjusting_with_factor_analysis.Rmd ├── adjusting_with_linear_models.Rmd ├── confounding.Rmd ├── eda_with_pca.Rmd ├── factor_analysis.Rmd └── intro_to_batch_effects.Rmd ├── bioc ├── EDA_plots_for_NGS.Rmd ├── HPCami.Rmd ├── aaFinalSummary.Rmd ├── anno4liftover.Rmd ├── annoCheat.Rmd ├── archWk4basic.Rmd ├── background.Rmd ├── biocparallel.Rmd ├── biological_versus_technical_var.Rmd ├── biostrings.Rmd ├── c1.all.v4.0.entrez.gmt ├── eset.Rmd ├── eset_sumexp.Rmd ├── gene_set_analysis.Rmd ├── gene_set_analysis_in_R.Rmd ├── ggbioNote.Rmd ├── grangesERBSExample.Rmd ├── importBed.Rmd ├── import_NGS.Rmd ├── inference_with_bioc.Rmd ├── installing_Bioconductor_finding_help.Rmd ├── iranges_granges.Rmd ├── moreGR.Rmd ├── normalization.Rmd ├── operateGRanges.Rmd ├── read_counting.Rmd ├── reading_microarray_data.Rmd ├── seq4motif.Rmd ├── storage │ ├── EDA_plots_for_microarray.Rmd │ ├── GEOquery.Rmd │ ├── anno1refbuilds.Rmd │ ├── anno2Biostrings.Rmd │ ├── anno3GRanges.Rmd │ ├── anno4liftover.Rmd │ ├── anno5genes.Rmd │ ├── annoPhen.Rmd │ ├── basic_Bioconductor_infrastructure.Rmd │ ├── basic_inference_microarray.Rmd │ ├── chromComp.Rmd │ ├── chromIntro.Rmd │ ├── confounding.Rmd │ ├── mapping_features.Rmd │ ├── probeSearch.Rmd │ ├── svacombat.Rmd │ ├── using_limma.Rmd │ 
└── using_limma_old_no_comments.Rmd ├── tophat.md ├── using_limma.Rmd └── visualizing_NGS.Rmd ├── biocadv_6x ├── bioc2_HPCami.Rmd ├── bioc2_externData.Rmd ├── bioc2_ggbio.Rmd ├── bioc2_gvfeat.Rmd ├── bioc2_hybstor.Rmd ├── bioc2_integExamps.Rmd ├── bioc2_nosql.Rmd ├── bioc2_ov.Rmd ├── bioc2_parallel.Rmd ├── bioc2_rainfall.Rmd ├── bioc2_repro1.Rmd ├── bioc2_rpacks.Rmd ├── bioc2_shiny.Rmd ├── bioc2_vizNGS.Rmd ├── bioc2_vizOv.Rmd ├── esHclust.Rmd ├── finalViz.Rmd ├── multiOOM.Rmd └── tcga.Rmd ├── biocintro_5x ├── WhatWeMeas.Rmd ├── bioc1_align.Rmd ├── bioc1_annoCheat.Rmd ├── bioc1_annoOverview.Rmd ├── bioc1_btvari.Rmd ├── bioc1_geneset_1.Rmd ├── bioc1_grangeOps.Rmd ├── bioc1_igranges.Rmd ├── bioc1_liftOver.Rmd ├── bioc1_limma.Rmd ├── bioc1_mgt_gsd.Rmd ├── bioc1_multibed.Rmd ├── bioc1_roast.Rmd ├── bioc1_summex.Rmd ├── bioc1_t_mult.Rmd ├── biointro.Rmd ├── biomotiv.Rmd ├── dataman2017.Rmd ├── dataman2019.Rmd ├── dataman2022.Rmd └── optalign.Rmd ├── chipseq ├── ChIPseq.Rmd ├── ChIPseq_quiz.R └── MACS.txt ├── eda ├── exploratory_data_analysis.Rmd └── plots_to_avoid.Rmd ├── example.Rmd ├── footnotes.R ├── highdim ├── PCA.Rmd ├── distance.Rmd ├── images │ └── handmade │ │ ├── Heatmap.png │ │ ├── SVD1.png │ │ ├── SVD2.png │ │ └── animals.png ├── mds.Rmd ├── pca_motivation.Rmd ├── projections.Rmd ├── rotations.Rmd └── svd.Rmd ├── inference ├── R_refresher.Rmd ├── association_tests.Rmd ├── clt_and_t-distribution.Rmd ├── clt_in_practice.Rmd ├── confidence_intervals.Rmd ├── monte_carlo.Rmd ├── permutation_tests.Rmd ├── populations_and_samples.Rmd ├── power_calculations.Rmd ├── random_variables.Rmd └── t-tests_in_practice.Rmd ├── intro ├── dplyr_intro.Rmd ├── dplyr_tutorial.Rmd ├── getting_started.Rmd ├── github.Rmd ├── introduction.Rmd ├── math_notation.Rmd └── system_files.Rmd ├── linear ├── collinearity.Rmd ├── expressing_design_formula.Rmd ├── interactions_and_contrasts.Rmd ├── linear_models_going_further.Rmd ├── linear_models_in_practice.Rmd ├── linear_models_intro.Rmd ├── 
qr_and_regression.Rmd └── standard_errors.Rmd ├── list_libs.sh ├── makefile ├── matrixalg ├── intro_using_regression.Rmd ├── matrix_algebra_examples.Rmd ├── matrix_notation.Rmd └── matrix_operations.Rmd ├── methyl ├── epiviz.Rmd ├── inference_for_DNAmeth.Rmd ├── methylation.Rmd └── minfi.Rmd ├── ml ├── clustering_and_heatmaps.Rmd ├── conditional_expectation.Rmd ├── crossvalidation.Rmd ├── machine_learning.Rmd └── smoothing.Rmd ├── modeling ├── bayes-gif.R ├── bayes.Rmd ├── hierarchical_models.Rmd └── modeling.Rmd ├── renaming_map.md ├── rnaseq ├── airway_sample_table.csv ├── fastq.md ├── genome_align_STAR.md ├── r_bioc_links.md ├── rnaseq_exon_usage.Rmd ├── rnaseq_gene_level.Rmd ├── rnaseq_isoform_cummerbund.Rmd ├── rnaseq_pkgs.R ├── storage │ ├── RNAseq_quiz.R │ └── cufflinks.txt └── trancsript_align_RSEM.md ├── robust ├── ranktest.Rmd └── robust_summaries.Rmd └── variants ├── SNP.Rmd └── SNP_quiz.R /.gitignore: -------------------------------------------------------------------------------- 1 | footnotes.md 2 | */figure 3 | */cache 4 | */.cache 5 | */*.html 6 | */*.md 7 | */*.RData 8 | */*.txt 9 | */*.csv 10 | */*.tsv 11 | */*.tab 12 | */*.bam 13 | */*.bai 14 | */*.fasta 15 | */*.fai 16 | */*.bgz 17 | */*.tbi 18 | */.DS_Store 19 | */.Rhistory 20 | *_exercises 21 | *.Rproj 22 | .Rproj.user 23 | -------------------------------------------------------------------------------- /.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Rafael Irizarry and Michael Love 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), 
to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Data Analysis for the Life Sciences 2 | 3 | #### NEWS: 4 | 5 | September 16, 2015 : We are reorganizing the labs here for the new courses launching this Fall. 6 | We have decided to drop the `course1` style of directory structure, as the number of courses 7 | is still in flux. We are now using a modular structure. See `renaming_map.md` for how courses 8 | were remapped to new names. 9 | 10 | #### Book versions 11 | 12 | Compiled versions of this document as HTML can be found here: 13 | 14 | http://genomicsclass.github.io/book/ 15 | 16 | The ePub version of this document can be found on *Leanpub*: 17 | 18 | https://leanpub.com/dataanalysisforthelifesciences/ 19 | 20 | #### Pull requests and issues 21 | 22 | We greatly appreciate all of our readers who contribute pull requests! 
23 | 24 | If you want to contribute through pull request, please first clone a *new version* of the repo. If you have a version of the repo from 2014, it will contain some large data objects, which accidentally snuck in, and we won't be able to accept your pull request. 25 | 26 | Please do not add an issue which says "I couldn't knit the Rmd". This is nearly always because users are missing one or more of the libraries and datasets used within (we do not re-install libraries in each Rmd script as this would slow down our compilation of the book material). You will find the missing library if you step through the Rmd one chunk at a time. 27 | -------------------------------------------------------------------------------- /Rscripts/cheung.R: -------------------------------------------------------------------------------- 1 | library(GEOquery) 2 | gse <- getGEO("GSE5859") 3 | pd<-pData(gse[[1]]) 4 | library(affy) 5 | filenames<-file.path("GSE5859",basename(as.character(pd[,38]))) 6 | e=justRMA(filenames=filenames) 7 | dates<-vector("character",ncol(exprs(e))) 8 | for(i in seq(along=dates)){ 9 | tmp<-affyio::read.celfile.header(filenames[i],info="full") 10 | dates[i]<-strsplit(tmp$ScanDate,"\ ")[[1]][1] 11 | } 12 | dates<-as.Date(dates,"%m/%d/%y") 13 | 14 | ###ethnicity info obtained from Jeff Leek 15 | eth <- readLines("fulldata_reprecent.txt",n=1) 16 | eth=strsplit(eth,"\ ")[[1]] 17 | eth=t(sapply(strsplit(eth,"_"),function(x) x)) 18 | 19 | gmnames<-gsub("_rep[12]","",as.character(pd[,1])) 20 | 21 | eth2<-eth[match(gmnames,eth[,2]),1] 22 | eth2[is.na(eth2)]<-"HAN" ##from LA, checked here 23 | ##http://ccr.coriell.org/Sections/Search/Advanced_Search.aspx?PgId=175 24 | 25 | 26 | pd=data.frame(ethnicity=eth2,date=dates,filename=I(basename(filenames))) 27 | pData(e)<-pd 28 | save(e,file="GSE5859.rda") 29 | -------------------------------------------------------------------------------- /Rscripts/cheungSubset.R: 
-------------------------------------------------------------------------------- 1 | library(Biobase) 2 | library(GSE5859) 3 | library(hgfocus.db) ##get the gene chromosome 4 | data(GSE5859) 5 | annot <- select(hgfocus.db, keys=featureNames(e), keytype="PROBEID", 6 | columns=c("CHR", "CHRLOC", "SYMBOL"))[,-4] 7 | ##for genes with multiples, pick on 8 | annot <-annot[match(featureNames(e),annot$PROBEID),] 9 | annot$CHR <- ifelse(is.na(annot$CHR),NA,paste0("chr",annot$CHR)) 10 | y<- colMeans(exprs(e)[which(annot$CHR=="chrY"),]) 11 | sex <- ifelse(y<4.5,"F","M") 12 | 13 | sampleInfo <- pData(e) 14 | sampleInfo$group <- ifelse(sex=="F",1,0) 15 | 16 | batch <- format(pData(e)$date,"%y%m") 17 | ind<-which(batch%in%c("0506","0510")) 18 | set.seed(1) 19 | N <- 12; N1 <-3; M<-12; M1<-9 20 | ind <- c(sample(which(batch=="0506" & sex=="F"),N1), 21 | sample(which(batch=="0510" & sex=="F"),N-N1), 22 | sample(which(batch=="0506" & sex=="M"),M1), 23 | sample(which(batch=="0510" & sex=="M"),M-M1)) 24 | 25 | geneExpression <- exprs(e)[,ind] 26 | sampleInfo <- sampleInfo[ind,] 27 | geneAnnotation <- annot 28 | 29 | save(geneExpression,sampleInfo,geneAnnotation,file="GSE5859Subset.rda") 30 | -------------------------------------------------------------------------------- /Rscripts/cltmovie.R: -------------------------------------------------------------------------------- 1 | dat<-read.csv("http://www.biostat.jhsph.edu/bstcourse/bio751/data/USheights_subsample.csv") 2 | 3 | library(animation) 4 | saveGIF({ 5 | set.seed(1) 6 | N=10 7 | L<-1000 8 | means<-vector("numeric",L) 9 | LIM=seq(69.28-4,69.28+4,0.33/sqrt(N)*sqrt(10)) 10 | LIM2=seq(69.28-4,69.28+4,0.1) 11 | for(i in 1:L){ 12 | means[i]<-mean(sample(dat$Height[dat$Gender==1],N)) 13 | if(i%%20==1){ 14 | dd=dnorm(LIM2,mean(dat$Height[dat$Gender==1]),sd(dat$Height[dat$Gender==1])/sqrt(N)) 15 | tmp=hist(means[1:i],ylim=c(0,150),xlim=range(LIM),breaks=LIM,freq=TRUE,xlab="average height",ylab="Density",main=paste0("N=",N)) 16 | 
k=sum(tmp$counts)/sum(dd)*length(dd)/length(tmp$counts) ##this is a normalizing constant to assure same area on plot 17 | lines(LIM2,dd*k,type="l",col=2,lwd=2) 18 | } 19 | } 20 | },'clt10.gif', interval = .05) 21 | 22 | 23 | -------------------------------------------------------------------------------- /Rscripts/dilution.R: -------------------------------------------------------------------------------- 1 | library(affy) 2 | fns <- list.celfiles(path="CEL",full=TRUE) 3 | pData <- read.table("dilution_pdata.txt",header=TRUE) 4 | stopifnot(all(pData$filename == sub("CEL\\/(.+)\\.cel","\\1",fns))) ##sub() extracts each CEL basename so the phenotype rows are checked against the file order; the previous grep() call never compared them 5 | celData <- ReadAffy(filenames=fns,phenoData=pData,verbose=TRUE) 6 | dilution <- rma(celData,verbose=TRUE) 7 | save(dilution, file="dilution.RData") 8 | -------------------------------------------------------------------------------- /Rscripts/gsea.R: -------------------------------------------------------------------------------- 1 | tab <- read.delim("gseacelfiles/reannotate_select_cal.gct",as.is=TRUE,skip=2) 2 | library(affy) 3 | fns <- list.celfiles(path="gseacelfiles") 4 | 5 | sns <- gsub("\\.CEL\\.gz","",fns) 6 | tmp<-strsplit(names(tab)[-c(1,2)],"\\.") 7 | sns2 <- sapply(tmp,function(x)x[2]) 8 | tmp2 <- data.frame(t(sapply(tmp,function(x) strsplit(x[1],"_")[[1]]))) 9 | filenames <- fns[match(sns2,sns)] 10 | ab<- ReadAffy(filenames=filenames,celfile.path="gseacelfiles") 11 | e<-rma(ab) 12 | dates<-vector("character",ncol(exprs(e))) 13 | for(i in seq(along=dates)){ 14 | tmp<-affyio::read.celfile.header(file.path("gseacelfiles",filenames[i]),info="full") 15 | dates[i]<-strsplit(tmp$ScanDate,"\ ")[[1]][1] 16 | } 17 | dates<-as.Date(dates,"%m/%d/%y") 18 | 19 | tmp2$dates <- dates 20 | 21 | pData(e)<-tmp2 22 | 23 | 24 | save(e,file="gsea.rda") 25 | 26 | ##adding MAS 5.0 27 | library(simpleaffy) 28 | m <- justMAS(ab) 29 | pData(m)<-tmp2 30 | save(m,file="gseamas5.rda") 31 | -------------------------------------------------------------------------------- 
/Rscripts/limma_quiz.R: -------------------------------------------------------------------------------- 1 | # biocLite("gaschYHS") 2 | library(gaschYHS) 3 | data(gaschYHS) 4 | e <- gaschYHS 5 | head(pData(e)[,c("time","status")],12) 6 | 7 | e <- e[,1:8] 8 | e <- e[!apply(exprs(e), 1, anyNA),] 9 | 10 | # question 1 11 | 12 | condition <- factor(rep(1:2,c(5,3))) 13 | design <- model.matrix(~ condition) 14 | library(limma) 15 | fit <- lmFit(e, design) 16 | fit <- eBayes(fit) 17 | (tt <- topTable(fit, coef=2)) 18 | tt["YDR171W",] 19 | 20 | # question 2 21 | 22 | e = e[,1:5] 23 | time <- pData(e)$time 24 | design <- model.matrix(~ time + I(time^2)) 25 | 26 | fit <- lmFit(e, design) 27 | fit <- eBayes(fit) 28 | (tt <- topTable(fit, coef=2:3)) 29 | tt["YGR211W",] 30 | 31 | -------------------------------------------------------------------------------- /Rscripts/make_two_tables_for_class.R: -------------------------------------------------------------------------------- 1 | library(minfi) 2 | datadir="/home/bst/other/hcorrada/methyl/exps/tcga/raw_data/colon" 3 | 4 | clinicalDir=file.path(datadir,"Clinical/Biotab") 5 | sample_tab=read.delim(file.path(clinicalDir,"biospecimen_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE) 6 | keep=sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal") 7 | sample_tab=sample_tab[keep,] 8 | 9 | patient_id=unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-"))) 10 | 11 | tumor_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"] 12 | normal_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Solid Tissue Normal"] 13 | 14 | # read tumor data 15 | tumor_tab=read.delim(file.path(clinicalDir,"biospecimen_tumor_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE) 16 | 17 | ###make it slightly harder by changing a name 18 | write.csv(tumor_tab,file="tumor_tab.csv",row.names=FALSE) 19 | write.csv(sample_tab,file="sample_tab.csv",row.names=FALSE) 20 | 
21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Rscripts/microarraydata-eda-lab.R: -------------------------------------------------------------------------------- 1 | # biocLite("SpikeIn") 2 | library(SpikeIn) 3 | data("SpikeIn95") 4 | ##from previous data exploration we know that array 55 is bad. we pick 5 | ##two groups with same 52-55 and 56-59 6 | int=pm(SpikeIn95)[,52:59] ###int for intensity 7 | spikeInDesign=pData(SpikeIn95)[52:59,] 8 | 9 | cdfname <- getCdfInfo(SpikeIn95) 10 | psets <- as.list(cdfname) 11 | psets <- psets[order(names(psets))] 12 | index <- unlist(sapply(psets, function(x) x[, 1]), use.names = FALSE) 13 | locations <- indices2xy(index,cdf="hgu95acdf") 14 | 15 | save(int,spikeInDesign,locations,file="spikeInSubset.rda") 16 | -------------------------------------------------------------------------------- /Rscripts/read_tcga_meth.R: -------------------------------------------------------------------------------- 1 | library(minfi) 2 | ## DOwnload 3 | ## colon/DNA_Methylation/JHU_USC__HumanMethylation450/Level_1/ 4 | ## from tcga and put in datadir 5 | datadir="/datadir" 6 | 7 | 8 | clinicalDir=file.path(datadir,"Clinical/Biotab") 9 | sample_tab=read.delim(file.path(clinicalDir,"biospecimen_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE) 10 | keep=sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal") 11 | sample_tab=sample_tab[keep,] 12 | 13 | patient_id=unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-"))) 14 | 15 | tumor_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"] 16 | normal_sample_id=sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Solid Tissue Normal"] 17 | 18 | # read tumor data 19 | tumor_tab=read.delim(file.path(clinicalDir,"biospecimen_tumor_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE) 20 | 21 | tab=merge(sample_tab, tumor_tab, by="bcr_sample_uuid", 
suffixes=c(".sample",".tumor"),all.x=TRUE) 22 | 23 | # read normal data 24 | normal_tab=read.delim(file.path(clinicalDir,"biospecimen_normal_control_coad.txt"),sep="\t",stringsAsFactors=FALSE) 25 | tab=merge(tab, normal_tab, by="bcr_sample_uuid", suffixes=c(".tumor",".normal"),all.x=TRUE) 26 | 27 | tab$bcr_patient_barcode=tab$bcr_patient_barcode.tumor 28 | 29 | ii=is.na(tab$bcr_patient_barcode) 30 | tab$bcr_patient_barcode[ii]=tab$bcr_patient_barcode.normal[ii] 31 | 32 | # read patient data 33 | patient_tab=read.delim(file.path(clinicalDir,"clinical_patient_coad.txt"),sep="\t",stringsAsFactors=FALSE) 34 | names(patient_tab)=paste("patient",names(patient_tab),sep=".") 35 | tmp=merge(tab,patient_tab,by.x="bcr_patient_barcode",by.y="patient.bcr_patient_barcode",all.x=TRUE,suffixes=c(".sample",".patient")) 36 | tab=tmp 37 | 38 | # read meth metadata 39 | methMetaDir=file.path(datadir,"METADATA/JHU_USC__HumanMethylation450") 40 | methMeta_tab=read.delim(file.path(methMetaDir,"jhu-usc.edu_COAD.HumanMethylation450.1.4.0.sdrf.txt"),sep="\t",stringsAsFactors=FALSE) 41 | 42 | sample_barcode=sapply(strsplit(methMeta_tab$Comment..TCGA.Barcode.,split="-"),function(x) paste(x[1:4],collapse="-")) 43 | m=match(tab$bcr_sample_barcode,sample_barcode) 44 | tab$Basename=gsub("_Grn\\.idat","",methMeta_tab$Array.Data.File[m]) 45 | tab=tab[!is.na(tab$Basename),] 46 | 47 | basedir=file.path(datadir,"DNA_Methylation/JHU_USC__HumanMethylation450/Level_1") 48 | tab$Basename=file.path(basedir,tab$Basename) 49 | keep=file.exists(paste(tab$Basename,"_Grn.idat",sep="")) 50 | colon_targets=tab 51 | objs=grep("tab",ls(),value=TRUE) 52 | rm(list=objs) 53 | objs=grep("dir",ls(),value=TRUE,ignore=TRUE) 54 | rm(list=objs) 55 | 56 | nms=names(colon_targets) 57 | targets=colon_targets[nms] 58 | 59 | targets$Status=factor(ifelse(targets$sample_type=="Primary Tumor","cancer","normal"),levels=c("normal","cancer")) 60 | targets$Tissue=tolower(targets$patient.tumor_tissue_site) 61 | 
targets$Sex=targets$patient.gender 62 | 63 | datadir="rdas" 64 | save(targets,colon_targets,file=file.path(datadir,"targets.rda")) ##only objects created by this script are saved; breast_targets and lung_targets are never defined here and made save() stop with an error 65 | 66 | ##read raw data 67 | rgset <- read.450k(targets$Basename,verbose=TRUE) 68 | pData(rgset)<-targets 69 | 70 | #normalize with illumina default 71 | mset1<-preprocessIllumina(rgset) 72 | mset1<-mapToGenome(mset1) 73 | meth <- getBeta(mset1,type="Illumina") 74 | gr <- granges(mset1) 75 | pd <- pData(mset1) 76 | 77 | save(meth,gr,pd,file="coloncancermeth.rda") 78 | -------------------------------------------------------------------------------- /advinference/quick_Bioc_intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to Advanced Statistics for the Life Sciences" 3 | author: "Rafa" 4 | date: "January 31, 2015" 5 | output: html_document 6 | layout: page 7 | --- 8 | 9 | ```{r options, echo=FALSE} 10 | library(knitr) 11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 12 | ``` 13 | 14 | # Installing Bioconductor 15 | 16 | Many of the datasets we will use in this chapter require packages made available via the Bioconductor project. Bioconductor is similar to CRAN but uses a different set of functions for downloads. It also includes many more data packages as well as _annotation_ packages that store information about either high-throughput products or information about molecular endpoints such as genes. We will need some of these packages in this chapter. Here we show how to install the Biobase package. 
17 | 18 | ```{r,eval=FALSE} 19 | source("http://bioconductor.org/biocLite.R") 20 | biocLite("Biobase") 21 | ``` 22 | 23 | You can install a suite of recommended packages by simply typing `biocLite()`. 24 | 25 | # Data organized in three tables 26 | 27 | One of the great advantages of using Bioconductor for high throughput data is that it provides object classes specifically designed to keep high throughput data organized. Below we show an example of how the three tables that are needed to conduct data analysis are available from Bioconductor data objects. For example, for gene expression we can use the ExpressionSet object. 28 | 29 | ```{r,message=FALSE} 30 | library(Biobase) 31 | ##can be installed like this: devtools::install_github("genomicsclass/GSE5859") 32 | library(GSE5859) 33 | data(GSE5859) 34 | class(e) 35 | ``` 36 | 37 | 38 | These objects were originally designed for gene expression data so the methods to extract the high throughput measurements have related names: 39 | ```{r} 40 | dat <- exprs(e) 41 | dim(dat) 42 | ``` 43 | 44 | The information about samples is also stored in this object and the functions to create it try to guarantee that the columns of `exprs(e)` match the rows of the sample information table. `pData` is used as shorthand for _phenotype_ data: 45 | 46 | ```{r} 47 | sampleInfo <- pData(e) 48 | dim(sampleInfo) 49 | head(sampleInfo) 50 | ``` 51 | 52 | A final table, which we will cover in much more detail in the Bioconductor chapter, is a table that describes the rows, in this case genes. Because each product will have a different table, these have already been created in Bioconductor. Because there are certain products that are widely used, Bioconductor makes databases available from which you can extract this information. 
This way, every object does not have to carry around this information: 54 | 55 | ```{r} 56 | library(hgfocus.db) 57 | annot <- select(hgfocus.db, keys=featureNames(e), keytype="PROBEID", columns=c("CHR", "CHRLOC", "SYMBOL")) 58 | ##pick one 59 | annot <-annot[match(featureNames(e),annot$PROBEID),] 60 | head(annot) 61 | dim(annot) 62 | ``` 63 | -------------------------------------------------------------------------------- /advinference/storage/crossvalidation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Cross-validation 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | In this lab, we will explore a method for picking parameters in a 12 | prediction / machine learning task, which is called 13 | *cross-validation*. 14 | 15 | Suppose we have a prediction algorithm which is going to predict the 16 | class of some observations using a number of features. For example, we 17 | will use the gene expression values to predict the tissue type in our 18 | tissues gene expression dataset. 19 | 20 | If this algorithm has a parameter which controls the behavior, we 21 | might pick the value of this parameter which minimizes the 22 | classification error. However, trying to classify the same 23 | observations as we use to *train* the model can be misleading. 24 | In lecture, we saw that for K-nearest neighbors, using k=1 will always 25 | give 0 classification error in the training set (because we use the 26 | single observation to classify itself). Instead, it's better to pick 27 | the parameter using the algorithm's performance on a set of 28 | observations which the algorithm has never seen, a *test* set. 29 | 30 | Cross-validation is simply a method which splits the data into a 31 | number of *folds*. 
If we have N folds, then the algorithm typically 32 | trains on (N-1) of the folds, and tests the algorithm's performance on 33 | the left-out single fold. This is then repeated N times until each 34 | fold has been used as a *test* set. 35 | 36 | Let's load in the tissue gene expression dataset: 37 | 38 | ```{r} 39 | # library(devtools) 40 | # install_github("dagdata","genomicsclass") 41 | library(dagdata) 42 | data(tissuesGeneExpression) 43 | library(Biobase) 44 | rownames(tab) <- tab$filename 45 | t <- ExpressionSet(e, AnnotatedDataFrame(tab)) 46 | t$Tissue <- factor(t$Tissue) 47 | colnames(t) <- paste0(t$Tissue, seq_len(ncol(t))) 48 | ``` 49 | 50 | Let's drop one of the tissues which doesn't have many samples: 51 | 52 | ```{r} 53 | library(class) 54 | table(t$Tissue) 55 | t <- t[,t$Tissue != "placenta"] 56 | t$Tissue <- droplevels(t$Tissue) 57 | table(t$Tissue) 58 | x <- t(exprs(t)) 59 | ``` 60 | 61 | We will use the `createFolds` function from the `caret` 62 | package to make 5 folds of the data, which are 63 | balanced over the tissues. Don't be confused that the 64 | `createFolds` function uses the same letter 'k' as the k in 65 | K-nearest neighbors. These 'k' are unrelated. 66 | The caret function `createFolds` is 67 | asking for how many folds to create, the 'N' from above. The `knn` 68 | function is asking how many closest observations to use to classify 69 | the test observations. 70 | 71 | ```{r} 72 | # install.packages("caret") 73 | library(caret) 74 | set.seed(1) 75 | idx <- createFolds(t$Tissue, k=5) 76 | sapply(idx, function(i) table(t$Tissue[i])) 77 | ``` 78 | 79 | Now we can try out the K-nearest neighbors method on a single fold: 80 | 81 | ```{r} 82 | pred <- knn(train = x[ -idx[[1]], ], test = x[ idx[[1]], ], cl=t$Tissue[ -idx[[1]] ], k=5) 83 | table(true=t$Tissue[ idx[[1]] ], pred) 84 | ``` 85 | 86 | As the prediction is looking too good in the space of all the genes, 87 | let's make it more difficult for the K-nearest neighbors algorithm. 
88 | We will use a reduced dimension representation of the dataset, using 89 | the *multi-dimensional scaling* algorithm used in the previous section. 90 | 91 | ```{r} 92 | xsmall <- cmdscale(dist(x)) 93 | ``` 94 | 95 | Now we will create a loop, which tries out each value of k from 1 to 96 | 12, and runs the K-nearest neighbors algorithm on each fold. We then 97 | ask for the proportion of errors for each fold, and report the average 98 | from the 5 cross-validation folds: 99 | 100 | ```{r} 101 | set.seed(1) 102 | ks <- 1:12 103 | res <- sapply(ks, function(k) { 104 | # try out each version of k from 1 to 12 105 | 106 | res.k <- sapply(seq_along(idx), function(i) { 107 | # loop over each of the 5 cross-validation folds 108 | 109 | # predict the held-out samples using k nearest neighbors 110 | pred <- knn(train = xsmall[ -idx[[i]], ], 111 | test = xsmall[ idx[[i]], ], 112 | cl = t$Tissue[ -idx[[i]] ], k = k) 113 | 114 | # the ratio of misclassified samples 115 | mean(t$Tissue[ idx[[i]] ] != pred) 116 | }) 117 | 118 | # average over the 5 folds 119 | mean(res.k) 120 | }) 121 | ``` 122 | 123 | Now we can plot the mean misclassification rate for each value of k: 124 | 125 | ```{r} 126 | plot(ks, res, type="o") 127 | ``` 128 | -------------------------------------------------------------------------------- /advinference/storage/heatmaps.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Dimension reduction and heatmaps 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Dimension reduction 12 | 13 | We start loading the tissue gene expression dataset: 14 | 15 | ```{r} 16 | # library(devtools) 17 | # install_github("dagdata","genomicsclass") 18 | library(dagdata) 19 | data(tissuesGeneExpression) 20 | library(Biobase) 21 | rownames(tab) <- tab$filename 
22 | t <- ExpressionSet(e, AnnotatedDataFrame(tab)) 23 | t$Tissue <- factor(t$Tissue) 24 | colnames(t) <- paste0(t$Tissue, seq_len(ncol(t))) 25 | ``` 26 | 27 | As we noticed in the end of the clustering section, we weren't able to 28 | *see* why the k-means algorithm defined a certain set of clusters 29 | using only the first two genes. 30 | 31 | ```{r} 32 | x <- t(exprs(t)) 33 | km <- kmeans(x, centers=3) 34 | library(rafalib) 35 | mypar() 36 | plot(x[,1], x[,2], col=km$cluster, pch=16) 37 | ``` 38 | 39 | Instead of the first two genes, let's use the *multi-dimensional 40 | scaling* algorithm which Rafa introduced in the lectures. This is a 41 | projection from the space of all genes to a two dimensional space, 42 | which mostly preserves the inter-sample distances. The `cmdscale` 43 | function in R takes a distance object and returns a matrix which has 44 | two dimensions (columns) for each sample. 45 | 46 | ```{r} 47 | mds <- cmdscale(dist(x)) 48 | plot(mds, col=km$cluster, pch=16) 49 | ``` 50 | 51 | We can also plot the names of the tissues with the color of the cluster. 52 | 53 | ```{r} 54 | plot(mds, type="n") 55 | text(mds, colnames(t), col=km$cluster) 56 | ``` 57 | 58 | ...or the names of the tissues with the color of the tissue. 59 | 60 | ```{r} 61 | plot(mds, type="n") 62 | text(mds, colnames(t), col=as.fumeric(t$Tissue)) 63 | ``` 64 | 65 | ## Heatmaps 66 | 67 | Heatmaps are useful plots for visualizing the expression values for a 68 | subset of genes over all the samples. The *dendrogram* on top and on 69 | the side is a hierarchical clustering as we saw before. First we will 70 | use the `heatmap` available in base R. First define a color palette. 
71 | 72 | ```{r} 73 | # install.packages("RColorBrewer") 74 | library(RColorBrewer) 75 | hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) 76 | ``` 77 | 78 | Now, pick the genes with the top variance over all samples: 79 | 80 | ```{r} 81 | library(genefilter) 82 | rv <- rowVars(exprs(t)) 83 | idx <- order(-rv)[1:40] 84 | ``` 85 | 86 | Now we can plot a heatmap of these genes: 87 | 88 | ```{r} 89 | heatmap(exprs(t)[idx,], col=hmcol) 90 | ``` 91 | 92 | The `heatmap.2` function in the `gplots` package on CRAN is a bit more 93 | customizable, and stretches to fill the window. Here we add colors to 94 | indicate the tissue on the top: 95 | 96 | ```{r} 97 | # install.packages("gplots") 98 | library(gplots) 99 | cols <- brewer.pal(8, "Dark2")[t$Tissue] # index the Dark2 colors directly: palette(value) returns the previous palette, not the new one 100 | cbind(colnames(t),cols) 101 | heatmap.2(exprs(t)[idx,], trace="none", ColSideColors=cols, col=hmcol) 102 | ``` 103 | 104 | -------------------------------------------------------------------------------- /advinference/storage/hierarchical_modeling.R: -------------------------------------------------------------------------------- 1 | # The following script produces the plots seen in the hierarchical modeling lecture. 2 | # These are also produced in the using_limma.Rmd file. 
3 | 4 | # biocLite("SpikeInSubset") 5 | library(SpikeInSubset) 6 | data(rma95) 7 | library(genefilter) 8 | fac <- factor(rep(1:2,each=3)) 9 | tt <- rowttests(exprs(rma95),fac) 10 | mask <- with(tt, abs(dm) < .2 & p.value < .01) 11 | spike <- rownames(rma95) %in% colnames(pData(rma95)) 12 | cols <- ifelse(mask,"red",ifelse(spike,"dodgerblue","black")) 13 | 14 | with(tt, plot(-dm, -log10(p.value), cex=.8, pch=16, 15 | xlim=c(-1,1), ylim=c(0,5), 16 | xlab="difference in means", 17 | col=cols)) 18 | abline(h=2,v=c(-.2,.2), lty=2) 19 | 20 | tt$s <- apply(exprs(rma95), 1, function(row) sqrt(.5 * (var(row[1:3]) + var(row[4:6])))) 21 | with(tt, plot(s, -log10(p.value), cex=.8, pch=16, 22 | log="x",xlab="estimate of standard deviation", 23 | col=cols)) 24 | 25 | library(limma) 26 | fit <- lmFit(rma95, model.matrix(~ fac)) 27 | ebfit <- ebayes(fit) 28 | limmares <- data.frame(dm=coef(fit)[,"fac2"], p.value=ebfit$p.value[,"fac2"]) 29 | with(limmares, plot(dm, -log10(p.value),cex=.8, pch=16, 30 | col=cols,xlab="difference in means", 31 | xlim=c(-1,1), ylim=c(0,5))) 32 | abline(h=2,v=c(-.2,.2), lty=2) 33 | 34 | 35 | n <- 40 36 | qs <- seq(from=0,to=.2,length=n) 37 | idx <- sapply(seq_len(n),function(i) which(as.integer(cut(tt$s^2,qs)) == i)[1]) 38 | idx <- idx[!is.na(idx)] 39 | par(mar=c(5,5,2,2)) 40 | plot(1,1,xlim=c(0,.21),ylim=c(0,1),type="n", 41 | xlab="variance estimates",ylab="",yaxt="n") 42 | axis(2,at=c(.1,.9),c("before","after"),las=2) 43 | segments((tt$s^2)[idx],rep(.1,n), 44 | ebfit$s2.post[idx],rep(.9,n)) 45 | 46 | -------------------------------------------------------------------------------- /advinference/storage/justsvd_duplicate_content.txt: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Principal component analysis and Singular value decomposition 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | opts_chunk$set(fig.path=paste0("figure/", 
sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 8 | ``` 9 | 10 | We have measurements for $m$ genes and $n$ samples in a matrix $Y_{m\times n}$. Suppose we 11 | suspect that a batch effect is responsible for most of the variability. We know that some samples fall in one batch and the rest in another, but we don't know which sample is in which batch. Can we discover the batch? If we assume that many genes will have a different average in one batch compared to the other, then we can quantify this problem as searching for the separation that makes many of these differences in average large. To simplify and illustrate further, assume $n/2$ samples are in one batch and $n/2$ in the other, but we don't know which. Can we find the separation? 12 | 13 | Assume the gene in row $i$ is affected by batch. Then 14 | $$ 15 | (Y_{i1}, \dots, Y_{in}) (v_1,\dots,v_n)' = \sum_{j=1}^n v_j Y_{ij} 16 | $$ 17 | with each $v_j$ either $1/(n/2)$ or $-1/(n/2)$ will give us the average difference between the two batches for gene $i$, call it $m_i$. Because we think the batch affects many genes, we want to find the vector $v=(v_1,\dots,v_n)$ that maximizes the variance of the $m_i$ across the $m$ genes. 18 | 19 | There is actually a nice mathematical result that can help us find this vector. In fact, if we let $v$ be any vector with standard deviation 1, then the $v$ that maximizes the variance of $Y_i v$ is called the first _principal component_ direction or eigenvector. The vector of "differences" $Y_i v$, $i=1,\dots,m$, is the first principal component, and below we will refer to the corresponding direction as $v_1$. 20 | 21 | Now, suppose we think there is more unwanted variability affecting several genes. We can subtract the first principal component from $Y_{m\times n}$: $r_{m\times n}=Y_{m \times n} - Y_{m \times n} v_1 v_1'$. We can then find the vector $v_2$ that results in the most variable vector $r_{m\times n} v_2$. We continue this way until we obtain $n$ eigenvectors $V_{n\times n} = (v_1,\dots,v_n)$. 
22 | 23 | ## Singular value decomposition (SVD) 24 | 25 | The SVD is a very powerful mathematical result that gives us an algorithm to write a matrix in the following way: 26 | 27 | $$ 28 | Y_{m\times n} = U_{m\times n} D_{n \times n} V'_{n \times n} 29 | $$ 30 | 31 | where $V$ is the matrix whose columns are the eigenvectors defined above. The matrices $U$ and $V$ are _orthogonal_, meaning that 32 | $U_i'U_i=1$ and $U_i'U_j=0$ for $i \neq j$, where $U_i$ and $U_j$ are the $i$th and $j$th columns of $U$ (and likewise for the columns of $V$). 33 | 34 | Notice that this matrix: 35 | $$ 36 | Y_{m\times n} V = U_{m \times n} D_{n\times n} 37 | $$ 38 | has the principal components as columns and that the standard deviation of the $i$th principal component is proportional to $D_{i,i}$, because the sum of squares of the $i$th column is $D_{i,i}^2$: 39 | $$ 40 | (Y_{m\times n} V)'(Y_{m\times n} V) = D_{n\times n} U'_{m\times n} U_{m\times n} D_{n\times n} = D^2_{n\times n} 41 | $$ 42 | 43 | ## Example 44 | Let's consider a simple example. Suppose we have the heights of identical twin pairs in an $m\times 2$ matrix. We are asked to transmit these data. 45 | 46 | ```{r} 47 | library(MASS) 48 | set.seed(1) 49 | y=mvrnorm(1000,c(0,0),3^2*matrix(c(1,.9,.9,1),2,2)) 50 | mypar(1,1) 51 | plot(y,xlab="Twin 1 (inches away from avg)",ylab="Twin 2 (inches away from avg)") 52 | ``` 53 | 54 | 55 | Transmitting the two heights seems inefficient given how correlated they are. If we transmit the principal components instead, we save money. Let's see how: 56 | 57 | ```{r} 58 | s=svd(y) 59 | plot(s$u[,1]*s$d[1],s$u[,2]*s$d[2],ylim=range(s$u[,1]*s$d[1]),xlab="First PC",ylab="Second PC") 60 | ``` 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /advinference/storage/multtest.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Multiple testing 4 | --- 5 | 6 | The following code reproduces the images in the Multiple testing lecture. 
7 | 8 | ```{r options, echo=FALSE} 9 | library(knitr) 10 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 11 | ``` 12 | 13 | First we make a function for drawing sections of a uniform distribution. 14 | 15 | ```{r} 16 | drawU <- function(a,y,boxes=FALSE,xmax=1) { 17 | plot(1,1,type="n",xlim=c(0,xmax),ylim=c(0,1.2*y),bty="L",xlab="p",ylab="",las=1) 18 | lines(c(0,1,1,0,0),c(0,0,y,y,0)) 19 | polygon(c(0,a,a,0),c(0,0,y,y),col=rgb(1,0,0,.5)) 20 | polygon(c(a,1,1,a),c(0,0,y,y),col=rgb(0,0,0,.1)) 21 | x <- 1/a 22 | if (boxes) { 23 | segments(0:x/x, 0, 0:x/x, y) 24 | } 25 | } 26 | ``` 27 | 28 | Here we draw a uniform distribution, and show how many p-values we expect at different cutoffs. 29 | 30 | ```{r} 31 | drawU(.3,1) 32 | drawU(1/20,1) 33 | drawU(1/20,20000/20,TRUE) 34 | drawU(1/100,20000/100,TRUE) 35 | drawU(1/1000,20000/1000,TRUE,.01) 36 | ``` 37 | 38 | The distribution of p-values using a z-score. 39 | 40 | ```{r} 41 | z <- rnorm(100) 42 | brks <- 0:20/20 43 | hist(pnorm(z),col="grey",main="",xlab="p",breaks=brks) 44 | ``` 45 | 46 | The same as above, but with now many more z-scores. Now the distribution looks more uniform. 47 | 48 | ```{r} 49 | z <- rnorm(10000) 50 | hist(pnorm(z),col="grey",main="",xlab="p",breaks=brks) 51 | ``` 52 | 53 | What happens if we spike in 500 small z-scores? 54 | 55 | ```{r} 56 | z <- c(rnorm(10000), rep(-3.72,500)) 57 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=brks) 58 | abline(h=10000/20,col="red",lwd=4,lty=3) 59 | abline(h=h$counts[1],col="blue",lwd=4,lty=3) 60 | ``` 61 | 62 | Now, making the bins in the histogram smaller, i.e. looking for a smaller p-value threshold: 63 | 64 | ```{r} 65 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=0:50/50) 66 | abline(h=10000/50,col="red",lwd=4,lty=3) 67 | abline(h=h$counts[1],col="blue",lwd=4,lty=3) 68 | ``` 69 | 70 | Even smaller... 
71 | 72 | ```{r} 73 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=0:100/100) 74 | abline(h=10000/100,col="red",lwd=4,lty=3) 75 | abline(h=h$counts[1],col="blue",lwd=4,lty=3) 76 | ``` 77 | 78 | Even smaller... 79 | 80 | ```{r} 81 | h <- hist(pnorm(z),col="grey",main="",xlab="p",breaks=0:10000/10000) 82 | abline(h=10000/10000,col="red",lwd=4,lty=3) 83 | abline(h=sum(h$counts[1]),col="blue",lwd=4,lty=3) 84 | ``` 85 | 86 | This visualizes the [Benjamini Hochberg method](#foot). 87 | 88 | ```{r} 89 | set.seed(1) 90 | pvals <- c(runif(90),runif(10,0,.001)) 91 | plot(sort(pvals),xlab="i",ylab="p-value",ylim=c(0,1)) 92 | abline(0, .05/length(pvals)) 93 | legend("top",expression(slope~alpha/m)) 94 | ``` 95 | 96 | ```{r} 97 | plot(sort(pvals),xlab="i",ylab="p-value",ylim=c(0,.03),xlim=c(0,13)) 98 | abline(0, .05/length(pvals)) 99 | legend("top",expression(slope~alpha/m)) 100 | ``` 101 | 102 | ## Footnotes 103 | 104 | Yoav Benjamini and Yosef Hochberg, "Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing". Journal of the Royal Statistical Society. 1995. 105 | 106 | -------------------------------------------------------------------------------- /advinference/storage/pca_svd.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Running PCA and SVD in R 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | In this unit, we will show how to perform principal component analysis (PCA) and singular value decomposition (SVD) in R, and how the two are related to each other. We will use the tissue gene expression dataset from the week 5 lectures and labs. 
12 | 13 | ```{r} 14 | # library(devtools) 15 | # install_github("dagdata","genomicsclass") 16 | library(dagdata) 17 | data(tissuesGeneExpression) 18 | library(rafalib) 19 | group <- as.fumeric(tab$Tissue) 20 | ``` 21 | 22 | 23 | First, the typical principal component analysis on the samples would be to transpose the data such that the samples are rows of the data matrix. The `prcomp` function can be used to return the principal components and other variables. 24 | 25 | ```{r} 26 | x <- t(e) 27 | pc <- prcomp(x) 28 | # ?prcomp 29 | names(pc) 30 | plot(pc$x[,1], pc$x[,2], col=group, main="PCA", xlab="PC1", ylab="PC2") 31 | ``` 32 | 33 | This PCA is equivalent to performing the SVD on the centered data, where the centering occurs on the columns (here genes). We can use the `sweep` function to perform arbitrary operations on the rows and columns of a matrix. The second argument specifies we want to operate on the columns (1 would be used for rows), and the third and fourth arguments specify that we want to subtract the column means. 34 | 35 | ```{r} 36 | cx <- sweep(x, 2, colMeans(x), "-") 37 | sv <- svd(cx) 38 | names(sv) 39 | plot(sv$u[,1], sv$u[,2], col=group, main="SVD", xlab="U1", ylab="U2") 40 | ``` 41 | 42 | So the columns of U from the SVD correspond to the principal components `x` in the PCA. Furthermore, the matrix V from the SVD is equivalent to the `rotation` matrix returned by `prcomp`. 43 | 44 | ```{r} 45 | sv$v[1:5,1:5] 46 | pc$rotation[1:5,1:5] 47 | ``` 48 | 49 | The diagonal elements of D from the SVD are proportional to the standard deviations returned by PCA. The difference is that the standard deviations from `prcomp` are sample standard deviations (`prcomp` returns unbiased estimates of sample variance, so with the $n / (n - 1)$ correction). The elements of D are formed by taking the sum of the squares of the principal components but not dividing by the sample size. 
50 | 51 | ```{r} 52 | head(sv$d^2) 53 | head(pc$sdev^2) 54 | head(sv$d^2 / (ncol(e) - 1)) 55 | ``` 56 | 57 | By dividing the variances by the sum, we get a plot of the ratio of variance explained by each principal component. 58 | 59 | ```{r} 60 | plot(sv$d^2 / sum(sv$d^2), xlim=c(0,15), type="b", pch=16, 61 | xlab="principal components", 62 | ylab="variance explained") 63 | plot(sv$d^2 / sum(sv$d^2), type="b", pch=16, 64 | xlab="principal components", 65 | ylab="variance explained") 66 | ``` 67 | 68 | Note that, not centering the data before running `svd` results in a slightly different plot: 69 | 70 | ```{r} 71 | svNoCenter <- svd(x) 72 | plot(pc$x[,1], pc$x[,2], col=group, main="PCA", xlab="PC1", ylab="PC2") 73 | points(0,0,pch=3,cex=4,lwd=4) 74 | plot(svNoCenter$u[,1], svNoCenter$u[,2], col=group, main="SVD not centered", xlab="U1", ylab="U2") 75 | ``` 76 | 77 | # SVD on (genes vs samples) and (samples vs genes) 78 | 79 | Finally, we show that the SVD on the data matrix where samples are columns -- as used in the Surrogate Variable Analysis SVA -- is equivalent to the SVD on the data matrix where the samples are rows, if no centering has been done. 80 | 81 | 82 | ```{r} 83 | sv2 <- svd(t(e)) 84 | plot(sv2$u[,1], sv2$u[,2], col=group, main="samples vs genes (typical PCA)", xlab="U1", ylab="U2") 85 | sv1 <- svd(e) 86 | plot(sv1$v[,1], sv1$v[,2], col=group, main="genes vs samples (SVA)", xlab="V1", ylab="V2") 87 | ``` 88 | 89 | The question of which direction to center depends on what the focus of the analysis is. For comparing sample distances, as in the typical PCA plot, the rows are samples and the genes are centered. For finding genes which contribute to batch, as in the SVA model, the rows are genes and the samples are centered. 
90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /advinference/storage/sva.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Batch adjustment 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | To illustrate how we can adjust for batch effects using statistical methods, we will create a data example in which the outcome of interest is confounded with batch but not completely. We will also select an outcome for which we have an expectation of what genes should be differentially expressed. Namely, we make sex the outcome of interest and expect genes on the Y chromosome to be differentially expressed. Note that we may also see genes from the X chromosome as differentially expressed as some escape X inactivation. 12 | 13 | We start by loading the data and centering each gene (the genes on the Y chromosome are identified below). 14 | ```{r} 15 | library(rafalib) 16 | library(GSE5859Subset) 17 | data(GSE5859Subset) 18 | y <- geneExpression-rowMeans(geneExpression) 19 | ``` 20 | 21 | 22 | To illustrate the confounding we will pick some genes to show in a heatmap plot. We pick all Y chromosome genes, some genes that we see correlate with batch, and then some randomly selected genes. 
23 | ```{r} 24 | ind1 <- which(geneAnnotation$CHR=="chrY") ##real differences 25 | month <- factor(format(sampleInfo$date,"%m")) 26 | tt<-genefilter::rowttests(y,month) 27 | ind2 <- setdiff(c(order(tt$dm)[1:25],order(-tt$dm)[1:25]),ind1) 28 | ###now pick at random from rest: 29 | set.seed(1) 30 | ind0 <- setdiff(sample(seq_along(tt$dm),50),c(ind2,ind1)) 31 | geneindex<-c(ind2,ind0,ind1) 32 | 33 | mat<-geneExpression[geneindex,] 34 | mat <- mat-rowMeans(mat) 35 | ``` 36 | 37 | Here are the data for the selected genes: 38 | ```{r} 39 | icolors <- rev(RColorBrewer::brewer.pal(11,"RdYlBu")) ## RColorBrewer is never attached in this file, so qualify the call 40 | mypar(1,1) 41 | image(t(mat),xaxt="n",yaxt="n",col=icolors) 42 | ``` 43 | 44 | So what follows is like the analysis we would do in practice. We don't know there is a batch and we are interested in finding genes that are different between males and females. We start by computing t-statistics and p-values comparing males and females. We use histograms to notice the problem introduced by the batch. 45 | 46 | The batch effect adjustment methods are best described with linear models, so we start by writing down the linear model for this particular case: 47 | 48 | 49 | 50 | ## SVA 51 | 52 | ```{r} 53 | library(sva) 54 | library(limma) 55 | sex <- sampleInfo$group 56 | mod <- model.matrix(~sex) 57 | cind <- order( as.Date(sampleInfo$date) ) 58 | dates <- gsub("2005-","",sampleInfo$date) 59 | weights=rep(1,nrow(y)) 60 | for(b in 1:5){ 61 | mypar2(1,1) 62 | par(mar = c(4.1, 5.1, 3.5, 2.1)) 63 | layout(matrix(c(1:3),nrow=1),widths=c(5,1.5,5)) 64 | image(1:ncol(mat),1:nrow(mat),t(mat[,cind]*weights[geneindex]),xaxt="n",yaxt="n",col=icolors,xlab="",ylab="") 65 | axis(side=1,seq_along(dates),dates[cind],las=2) 66 | abline(v=12.5) 67 | 68 | 69 | svafit <- sva(y,mod,B=b,n.sv=5) 70 | weights = svafit$pprob.gam*(1-svafit$pprob.b) 71 | ## Weighted SVD 72 | surrogate <- svd( y*weights)$v[,1] 73 | 74 | image(matrix(weights[geneindex],nrow=1),,xaxt="n",yaxt="n",col=RColorBrewer::brewer.pal(9,"Blues")) 75 | 
plot(surrogate[cind],bg=sex[cind]+1,pch=21,xlab="",xaxt="n",ylab="Surrogate variable",ylim=c(-.5,.5),cex=1.5) 76 | axis(side=1,seq(along=dates),dates[cind],las=2) 77 | abline(v=12.5) 78 | text(1,0.5,"June") 79 | text(13.5,0.5,"Oct") 80 | legend("bottomright",c("0","1"),col=c(1,2),pch=16) 81 | } 82 | ``` 83 | 84 | 85 | ```{r} 86 | lmfit <- lmFit(dat,svaX) 87 | tt<-lmfit$coef[,2]*sqrt(lmfit$df.residual)/(2*lmfit$sigma) 88 | mypar(1,2) 89 | pval<-2*(1-pt(abs(tt),lmfit$df.residual[1])) 90 | hist(pval[!chr%in%c("chrX","chrY")],xlab="p-values",ylim=HLIM,main="") 91 | hist(pval[chr%in%c("chrY")],nc=20,xlab="p-value",ylim=c(0,9),main="") 92 | ``` 93 | 94 | Decompose the data 95 | ```{r} 96 | Batch<- lmfit$coef[geneindex,3:7]%*%t(svaX[,3:7]) 97 | Signal<-lmfit$coef[geneindex,1:2]%*%t(svaX[,1:2]) 98 | error <- dat[geneindex,]-Signal-Batch 99 | ##demean for plot 100 | Signal <-Signal-rowMeans(Signal) 101 | mat <- dat[geneindex,]-rowMeans(dat[geneindex,]) 102 | mypar(1,4,mar = c(2.75, 4.5, 2.6, 1.1)) 103 | image(t(mat),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n") 104 | image(t(Signal),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n") 105 | image(t(Batch),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n") 106 | image(t(error),col=icolors,zlim=c(-5,5),xaxt="n",yaxt="n") 107 | ``` 108 | 109 | ## Footnotes 110 | 111 | ### Principal Components Analysis (PCA) 112 | 113 | Jolliffe, Ian. Principal component analysis. John Wiley & Sons, Ltd, 2005. 114 | 115 | Dunteman, George H. Principal components analysis. No. 69. Sage, 1989. 
116 | -------------------------------------------------------------------------------- /advinference/storage/transformations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Transformation 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Transformations 12 | 13 | ### Mean-variance relationship 14 | 15 | In microarrays and RNAseq data we observe strong variance dependence on mean. 16 | ```{r} 17 | if (!file.exists("bottomly_eset.RData")) download.file("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData","bottomly_eset.RData") 18 | load("bottomly_eset.RData") 19 | library("Biobase") 20 | ind <- which(pData(bottomly.eset)$strain=="C57BL/6J") 21 | Y <- exprs(bottomly.eset)[,ind] 22 | avgs<-rowMeans(Y) 23 | sds <-genefilter::rowSds(Y) 24 | mypar(1,1) 25 | splot(avgs,sds,log="xy",subset=which(avgs>0),xlab="Average",ylab="SD") 26 | ``` 27 | 28 | This means that the larger values, vary the most. If we need to compute a mean to, say, normalize, it will be highly sensitive to the variation of the max: 29 | ```{r} 30 | maxs <- apply(Y,2,max) 31 | sampleavgs <- colMeans(Y) 32 | plot(maxs,sampleavgs/min(sampleavgs),xlab="Max",ylab="Sample average increase",pch=21,bg=1,cex=1.5) 33 | ``` 34 | The log transformation can remove the strong dependence. 
35 | 36 | ```{r} 37 | lY <- log2(Y+0.5) 38 | lavgs<-rowMeans(lY) 39 | lsds <-genefilter::rowSds(lY) 40 | splot(lavgs,lsds,xlab="Average of log counts",ylab="SD of log counts") 41 | ``` 42 | 43 | ```{r} 44 | lsampleavgs <- colMeans(lY) 45 | plot(maxs,sampleavgs/min(sampleavgs),xlab="Max",ylab="Sample average increase",bg=1,pch=21,cex=1.5) 46 | points(maxs,lsampleavgs/min(lsampleavgs),xlab="Max",ylab="Sample average",bg=2,pch=21,cex=1.5) 47 | legend("topleft",c("Original","Log"),pch=16,col=1:2,box.lwd=0) 48 | ``` 49 | 50 | -------------------------------------------------------------------------------- /bioc/EDA_plots_for_NGS.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Exploratory Data Analysis for NGS 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | This is a dataset produced by Bottomly et al., sequencing two strains of mouse with many biological replicates. This dataset and a number of other sequencing datasets have been compiled from raw data into read count tables by Frazee, Langmead, and Leek as part of the ReCount project. These datasets are made publicly available at the following website: 12 | 13 | http://bowtie-bio.sourceforge.net/recount/ 14 | 15 | Unlike many sequencing studies, Bottomly et al., realizing that such information is important for downstream analysis, provided the experiment number for all samples. Below we can see that the experimental batch explains more variation than the condition of interest: the strain of mouse. 16 | 17 | We can make similar figures for NGS to the ones shown in the previous sections. However, the log transform does not work because RNAseq data contains many 0s. One quick way to get around this is by adding a constant before taking the log. 
A typical one is 0.5 which gives us a log2 value of -1 for 0s. 18 | 19 | ```{r} 20 | if (!file.exists("bottomly_eset.RData")) download.file("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bottomly_eset.RData","bottomly_eset.RData") 21 | load("bottomly_eset.RData") 22 | library("Biobase") 23 | exprs(bottomly.eset)[1,] 24 | pData(bottomly.eset) 25 | ``` 26 | 27 | ```{r} 28 | Y <- log2(exprs(bottomly.eset) + 0.5) 29 | # library(devtools) 30 | # install_github("rafalib","ririzarr") 31 | library("rafalib") 32 | mypar(1,1) 33 | for(i in seq_len(ncol(Y))){ 34 | shist(Y[,i],unit=0.25,col=i,plotHist=FALSE,add=i!=1) 35 | } 36 | ``` 37 | 38 | If we get rid of the zeros (i.e., those with log2 value of -1), we can more easily see the shape of the distribution for the expressed genes: 39 | 40 | ```{r} 41 | mypar(1,1) 42 | for(i in seq_len(ncol(Y))){ 43 | idx <- Y[,i] > -1 44 | shist(Y[idx,i],unit=0.25,col=i,plotHist=FALSE,add=i!=1) 45 | } 46 | ``` 47 | 48 | Plotting two samples against each other shows the spreading of points at the low end of expression from the log transformation. This can also be seen with randomly generated Poisson data. 49 | 50 | ```{r} 51 | mypar(1,2) 52 | idx <- rowSums(Y[,1:2]) > 0 53 | plot(Y[idx,1], Y[idx,2], cex=.1) 54 | rm <- rowMeans(2^Y[idx,1:2]) 55 | simulated1 <- rpois(length(rm), rm) # one draw per selected gene; length(idx) counts every row and would recycle rm 56 | simulated2 <- rpois(length(rm), rm) 57 | plot(log2(simulated1 + .5), log2(simulated2 + .5), cex=.1) 58 | ``` 59 | 60 | The MA plot is again easier to look at, in that we don't have to rotate our heads sideways by 45 degrees to see deviations from the diagonal. 
61 | 62 | ```{r} 63 | mypar(1,1) 64 | maplot(Y[idx,1],Y[idx,2]) 65 | ``` 66 | 67 | 68 | -------------------------------------------------------------------------------- /bioc/anno4liftover.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Chromosomes and their substructures 4: Translating addresses between genome builds" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | 16 | ```{r setup,echo=FALSE,results="hide"} 17 | suppressPackageStartupMessages({ 18 | library(BSgenome.Hsapiens.UCSC.hg19) 19 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 20 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 21 | library(Biostrings) 22 | library(GenomicRanges) 23 | library(IRanges) 24 | library(ph525x) 25 | library(Homo.sapiens) 26 | library(rtracklayer) 27 | }) 28 | ``` 29 | 30 | # Translating addresses between genome builds: liftOver 31 | 32 | The rtracklayer package includes an interface to the 33 | liftOver utilities developed for the UCSC genome browser. 34 | The idea is that a collection of local alignments 35 | can be defined and used to remap coordinates from 36 | one reference build to another. 37 | 38 | We can illustrate this with gene addresses created for hg38, 39 | the current reference build. We want to translate them 40 | for comparison to addresses asserted for hg19. 41 | 42 | We need a "chain file", uncompressed. You can 43 | get it from the following URL, and use gunzip on your 44 | system to uncompress in your home dir, if you would 45 | like to emulate the commands below. 
46 | 47 | "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz" 48 | 49 | ```{r domyimport} 50 | library(rtracklayer) 51 | ch = import.chain("~/hg38ToHg19.over.chain") 52 | ch 53 | str(ch[[1]]) 54 | ``` 55 | 56 | Let's get the addresses for genes on chromosome 1 57 | in hg38. 58 | 59 | ```{r get38} 60 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 61 | tx38 = TxDb.Hsapiens.UCSC.hg38.knownGene 62 | seqlevels(tx38, force=TRUE) = "chr1" 63 | g1_38 = genes(tx38) 64 | ``` 65 | 66 | Now execute the liftOver: 67 | 68 | ```{r doli} 69 | g1_19L = liftOver(g1_38, ch) 70 | ``` 71 | 72 | The result is a list of GRanges, one for 73 | each translation event. 74 | 75 | ```{r lktx} 76 | g1_19L 77 | ``` 78 | 79 | Verification of accuracy of translation is covered in exercises. 80 | -------------------------------------------------------------------------------- /bioc/annoCheat.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Genomic annotation in Bioconductor: Cheat sheet" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | # Summarizing the key genome annotation resources in Bioconductor 16 | 17 | ## Executive summary 18 | 19 | ### Organism-oriented annotation 20 | 21 | For biological annotation, generally sequence or gene based, there 22 | are three key types of package 23 | 24 | * Reference sequence packages: BSgenome.[Organism].[Curator].[BuildID] 25 | * Gene model database packages: TxDb.[Organism].[Curator].[BuildID].[Catalog] 26 | * Annotation map package: org.[Organism2let].[Institution].db 27 | 28 | wherever brackets are used, you must substitute an appropriate token. 
29 | You can survey all annotation packages at [the annotation page](http://bioconductor.org/packages/release/BiocViews.html#___AnnotationData). 30 | 31 | Packages Homo.sapiens, Mus.musculus and Rattus.norvegicus are specialized 32 | integrative annotation resources with an evolving interface. We have 33 | illustrated their use in lectures and labs. 34 | 35 | ### Systems biology oriented annotation 36 | 37 | Packages GO.db, KEGG.db, KEGGREST, and reactome.db are primarily 38 | intended as organism-independent resources organizing genes into 39 | groups. However, there are organism-specific mappings between 40 | gene-oriented annotation and these resources, that involve specific 41 | abbreviations and symbol conventions. These are described 42 | when these packages are used. 43 | 44 | ## Names for organisms and their abbreviations 45 | 46 | The standard Linnaean taxonomy is used very generally. So you 47 | need to know that 48 | 49 | * Human = *Homo sapiens* 50 | * Mouse = *Mus musculus* 51 | * Rat = *Rattus norvegicus* 52 | * Yeast = *Saccharomyces cerevisiae* 53 | * Zebrafish = *Danio rerio* 54 | * Cow = *Bos taurus* 55 | 56 | and so on. We use two sorts of abbreviations. For 57 | Biostrings-based packages, the contraction of first 58 | and second names is used 59 | 60 | * Human = Hsapiens 61 | * Mouse = Mmusculus 62 | * Rat = Rnorvegicus 63 | * Yeast = Scerevisiae ... 64 | 65 | For NCBI-based annotation maps, we contract further 66 | 67 | * Human = Hs 68 | * Mouse = Mm 69 | * Rat = Rn 70 | * Yeast = Sc ... 
71 | 72 | ## Genomic sequence 73 | 74 | These packages have four-component names that specify the reference build used 75 | 76 | * Human = BSgenome.Hsapiens.UCSC.hg19 77 | * Mouse = BSgenome.Mmusculus.UCSC.mm10 78 | * Rat = BSgenome.Rnorvegicus.UCSC.rn5 79 | * Yeast = BSgenome.Scerevisiae.UCSC.sacCer3 80 | 81 | ## Gene models 82 | 83 | These packages have five-component names that specify the reference build used and 84 | the gene catalog 85 | 86 | * Human = TxDb.Hsapiens.UCSC.hg19.knownGene 87 | * Mouse = TxDb.Mmusculus.UCSC.mm10.knownGene 88 | * Rat = TxDb.Rnorvegicus.UCSC.rn5.knownGene 89 | * Yeast = TxDb.Scerevisiae.UCSC.sacCer3.sgdGene 90 | 91 | ## Annotation maps 92 | 93 | These packages have four component names, with two components fixed. The 94 | variable components indicate organism and curating institution. 95 | 96 | * Human = org.Hs.eg.db 97 | * Mouse = org.Mm.eg.db 98 | * Rat = org.Rn.eg.db 99 | * Yeast = org.Sc.sgd.db 100 | 101 | ## Additional options 102 | 103 | There are often alternative curating institutions available such as 104 | Ensembl. 105 | -------------------------------------------------------------------------------- /bioc/eset.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The ExpressionSet container" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | --- 8 | 9 | ```{r options, echo=FALSE} 10 | library(knitr) 11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 12 | ``` 13 | 14 | # Overview 15 | 16 | We'll work with the basic representation of expression experiments 17 | in Bioconductor. An example is in package Biobase. 
18 | 19 | ```{r do1} 20 | library(Biobase) 21 | data(sample.ExpressionSet) 22 | sample.ExpressionSet 23 | ``` 24 | 25 | We'll abbreviate the name: 26 | 27 | ```{r do2} 28 | samp = sample.ExpressionSet 29 | ``` 30 | 31 | # Queries and extractors 32 | 33 | ```{r do3} 34 | dim(samp) 35 | exprs(samp)[1:5,1:6] # extract expression values 36 | pData(samp) # extract sample level data 37 | experimentData(samp) 38 | abstract(samp) # special accessor 39 | ``` 40 | 41 | Have a look at annotation package pmid2MIAME function to see 42 | how to extract abstracts of papers from pubmed. These can be 43 | bound into ExpressionSets with experimentData(). 44 | 45 | # Matrix-like subscripting 46 | 47 | We can use matrix-like syntax directly to restrict the 48 | ExpressionSet, getting back a new ExpressionSet 49 | ```{r doex} 50 | samp[1:4,3:20] 51 | samp[, samp$sex=="Male"] 52 | ``` 53 | 54 | 55 | -------------------------------------------------------------------------------- /bioc/eset_sumexp.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: ExpressionSet and SummarizedExperiment 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ```{r message=FALSE} 12 | library(Biobase) 13 | library(GEOquery) 14 | ``` 15 | 16 | ```{r} 17 | geoq <- getGEO("GSE9514") # download a dataset from GEO 18 | names(geoq) 19 | e <- geoq[[1]] # extract ExpressionSet 20 | ``` 21 | 22 | ### ExpressionSet 23 | 24 | ```{r} 25 | # exprs gives expression matrix 26 | dim(e) # number of features and samples in ExpressionSet 27 | exprs(e)[1:3,1:3] 28 | dim(exprs(e)) # rows are features, columns are samples 29 | 30 | # pData gives phenotype data (sample information) 31 | pData(e)[1:3,1:6] 32 | dim(pData(e)) # rows of pData correspond to columns of exprs 33 | names(pData(e)) 34 | 
pData(e)$characteristics_ch1 35 | 36 | # fData gives feature data (probe information) 37 | fData(e)[1:3,1:3] 38 | dim(fData(e)) # rows of fData correspond to rows of exprs 39 | names(fData(e)) 40 | head(fData(e)$"Gene Symbol") 41 | head(rownames(e)) 42 | 43 | # additional annotation tied to ExpressionSet 44 | experimentData(e) 45 | annotation(e) 46 | ``` 47 | 48 | ### Summarized Experiment 49 | 50 | ```{r message=FALSE} 51 | library(parathyroidSE) 52 | ``` 53 | 54 | 55 | ```{r} 56 | data(parathyroidGenesSE) 57 | se <- parathyroidGenesSE 58 | se 59 | ``` 60 | 61 | 62 | ```{r} 63 | # assay contains results of the assay 64 | dim(se) 65 | assay(se)[1:3,1:3] 66 | dim(assay(se)) # rows = features (ranges), columns = samples 67 | 68 | # colData contains sample information 69 | colData(se)[1:3,1:6] 70 | dim(colData(se)) 71 | names(colData(se)) 72 | colData(se)$treatment 73 | 74 | # rowRanges contains feature information 75 | rowRanges(se)[1] 76 | class(rowRanges(se)) 77 | length(rowRanges(se)) 78 | head(rownames(se)) 79 | metadata(rowRanges(se)) 80 | 81 | # additional metadata, including sample information 82 | metadata(se)$MIAME 83 | abstract(metadata(se)$MIAME) 84 | ``` 85 | 86 | ## Footnotes 87 | 88 | For more information about the `GenomicRanges` package, check out the PLOS Comp Bio paper, which the authors of GenomicRanges published: 89 | 90 | http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118 91 | 92 | For more information on *SummarizedExperiment*: 93 | 94 | http://www.nature.com/nmeth/journal/v12/n2/abs/nmeth.3252.html 95 | 96 | Also the software vignettes have a lot of details about the functionality. Check out "An Introduction to Genomic Ranges Classes". 
All of the vignette PDFs are available here: 97 | 98 | http://www.bioconductor.org/packages/release/bioc/html/GenomicRanges.html 99 | 100 | -------------------------------------------------------------------------------- /bioc/ggbioNote.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "A note on visualization options" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | There are many interesting approaches to visualizing genome-scale data. 12 | Two major packages in Bioconductor are Gviz and ggbio. Both represent 13 | significant efforts at bridging the gap between graphics facilities 14 | and various genomic data structures. 15 | 16 | ggbio's `autoplot` method can be very useful for broad overviews. 17 | For a GRanges instance, each range for which data exists can be 18 | depicted as a band on the chromosome. The karyogram layout 19 | gives a genome-wide view, but it can be important to control 20 | the handling of extra-chromosomal sequence levels. 21 | 22 | ```{r getl,echo=FALSE,results="hide"} 23 | library(ERBS) 24 | library(GenomeInfoDb) 25 | library(ggbio) 26 | ``` 27 | ```{r lkd, fig=TRUE} 28 | library(ERBS) 29 | data(HepG2) 30 | library(GenomeInfoDb) # trim all but autosomal chroms; pruning.mode="coarse" drops ranges on removed seqlevels (replaces the removed force=TRUE argument) 31 | seqlevels(HepG2, pruning.mode="coarse") = paste0("chr", 1:22) 32 | data(GM12878) 33 | seqlevels(GM12878, pruning.mode="coarse") = paste0("chr", 1:22) 34 | library(ggbio) 35 | autoplot(HepG2, layout="karyogram", main="ESRRA binding on HepG2") 36 | ``` 37 | 38 | Notice that the title is not printed, currently a bug. 
39 | 40 | ```{r lkm,fig=TRUE} 41 | autoplot(GM12878, layout="karyogram", main="ESRRA binding on GM12878") 42 | ``` 43 | -------------------------------------------------------------------------------- /bioc/importBed.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Importing genomic regions from files" 3 | author: "Mike Love and Rafa" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | # Introduction 16 | 17 | A common way you will be accessing genomic regions data is through files. The standard used by most software applications is [BED](http://genome.ucsc.edu/FAQ/FAQformat.html#format1). The ENCODE project has created its own format: [NarrowPeak](http://genome.ucsc.edu/FAQ/FAQformat.html#format12). Here we demonstrate the `import` function from the `rtracklayer` package that facilitates the creation of `GRanges` objects from these files. 18 | 19 | 20 | # Finding the files 21 | 22 | Here we use as an example the original files used to create the objects in the `ERBS` library. Once you install this package the files can be found here: 23 | 24 | ```{r} 25 | dir <- file.path( system.file(package="ERBS"), "extdata") 26 | ``` 27 | 28 | We included a `sampleInfo` file that includes the names and provenance of the data. 29 | 30 | ```{r} 31 | sampleInfo <- read.table(file.path(dir,"sampleInfo.txt"), stringsAsFactors=FALSE) 32 | sampleInfo 33 | ``` 34 | 35 | As an example we will read in just the first file: 36 | 37 | ```{r} 38 | filename <- file.path(dir,sampleInfo[1,1]) 39 | ``` 40 | 41 | # Import 42 | 43 | To import the files we can now use the `import` function. Note that `import` does not support NarrowPeak files directly, but it does support BED-like files and is able to read them in. 
44 | 45 | 46 | ```{r} 47 | library(rtracklayer) 48 | HepG2 <- import(filename, format="bedGraph") 49 | HepG2 50 | ``` 51 | We do successfully create a `GRanges` object but note the metadata names are missing. We can add these by hand. 52 | ```{r} 53 | names(mcols(HepG2)) <- c("name","score","col","signalValue","pValue","qValue","peak") 54 | ``` 55 | 56 | # Adding metadata 57 | 58 | A much more important piece of information that is missing here relates to provenance and genome annotation. Where do the files originally come from? What build of the human genome was used? What chromosomes were considered? 59 | 60 | We highly recommend that you add this information to your object even if it is not included in the file. Here is how we constructed the objects in `ERBS` (the code below assumes a `GM12878` object has been imported from its own file in the same way as `HepG2`; that step is not shown here). 61 | 62 | Add data provenance: 63 | ```{r} 64 | metadata(HepG2) <- list("ENCODE accession: ENCSR000EEW. ESRRA ChIP-seq peaks of HepG2 cell line https://www.encodeproject.org/experiments/ENCSR000EEW/") 65 | metadata(GM12878) <- list("ENCODE accession: ENCSR000DYQ. ESRRA ChIP-seq peaks of GM12878 cell line https://www.encodeproject.org/experiments/ENCSR000DYQ/") 66 | ``` 67 | 68 | Next, we can add the genome build that was used: 69 | 70 | ```{r} 71 | # add simple text descriptor for genome 72 | genome(HepG2) <- "hg19" 73 | genome(GM12878) <- "hg19" 74 | ``` 75 | 76 | Finally we denote the chromosome annotation that should be used by copying it from one of the `BSgenome` objects. 
77 | 78 | We start by checking that they are in fact the same style 79 | 80 | ```{r} 81 | # import chromosomal length information as well (also UCSC) 82 | library(BSgenome.Hsapiens.UCSC.hg19) 83 | seqlevelsStyle(HepG2) 84 | seqlevelsStyle(Hsapiens) 85 | ``` 86 | 87 | and that all the chromosome names in `HepG2` are in `Hsapiens` 88 | 89 | ```{r} 90 | seqlevels(HepG2)%in% seqlevels(Hsapiens) 91 | ``` 92 | 93 | Once we see that they are, we simply borrow the information from the `Hsapiens` object: 94 | ```{r} 95 | seqinfo(HepG2) <- seqinfo(Hsapiens) 96 | ``` -------------------------------------------------------------------------------- /bioc/inference_with_bioc.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Inference with bioc 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | 12 | # Introduction 13 | 14 | In this section we will cover inference in the context of genomics experiments. We apply some of the concepts we have covered in previous sections including t-tests, multiple comparisons and standard deviation estimates from hierarchical models. 
15 | 16 | We start by loading the pooling experiment data 17 | 18 | 19 | ```{r,message=FALSE} 20 | library(Biobase) 21 | library(maPooling) 22 | data(maPooling) 23 | pd=pData(maPooling) 24 | individuals=which(rowSums(pd)==1) 25 | ``` 26 | 27 | And extracting the individual mice as well as their strain 28 | 29 | ```{r} 30 | individuals=which(rowSums(pd)==1) 31 | individuals=individuals[-grep("tr",names(individuals))] 32 | y=exprs(maPooling)[,individuals] 33 | g=factor(as.numeric(grepl("b",names(individuals)))) 34 | ``` 35 | 36 | 37 | # T-test 38 | 39 | We can now apply a t-test to each gene using the `rowttests` function in the `genefilter` package 40 | 41 | ```{r} 42 | library(genefilter) 43 | tt=rowttests(y,g) 44 | ``` 45 | 46 | Now which genes do we report as statistically significant? For somewhat arbitrary reasons, in science p-values of 0.01 and 0.05 are used as cutoffs. In this particular example we get 47 | 48 | ```{r} 49 | sum(tt$p.value<0.01) 50 | sum(tt$p.value<0.05) 51 | ``` 52 | 53 | 54 | # Multiple testing 55 | We described multiple testing in detail in course 3. Here we provide a quick summary. 56 | 57 | Do we report all these genes? Let's explore what happens if we split the first group into two, forcing the null hypothesis to be true 58 | 59 | ```{r} 60 | set.seed(0) 61 | shuffledIndex <- factor(sample(c(0,1),sum(g==0),replace=TRUE )) 62 | nulltt <- rowttests(y[,g==0],shuffledIndex) 63 | sum(nulltt$p.value<0.01) 64 | sum(nulltt$p.value<0.05) 65 | ``` 66 | 67 | If we use the 0.05 cutoff we will be reporting 840 false positives. We have described several ways to adjust for this, including the `qvalue` method available in the `qvalue` package. After this adjustment we include a smaller list of genes. 
68 | 69 | ```{r} 70 | library(qvalue) 71 | qvals = qvalue(tt$p.value)$qvalue 72 | sum(qvals<0.05) 73 | sum(qvals<0.01) 74 | ``` 75 | 76 | And now the null case generates fewer false positives: 77 | 78 | ```{r} 79 | library(qvalue) 80 | nullqvals = qvalue(nulltt$p.value)$qvalue 81 | sum(nullqvals<0.05) 82 | sum(nullqvals<0.01) 83 | ``` 84 | 85 | -------------------------------------------------------------------------------- /bioc/moreGR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "GRanges operations related to gene model, TSS, and promoter region identification" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | 16 | 17 | ```{r setup,echo=FALSE,results="hide"} 18 | suppressPackageStartupMessages({ 19 | library(BSgenome.Hsapiens.UCSC.hg19) 20 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 21 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 22 | library(Biostrings) 23 | library(GenomicRanges) 24 | library(IRanges) 25 | library(ph525x) 26 | library(Homo.sapiens) 27 | library(Gviz) 28 | }) 29 | ``` 30 | # Overview 31 | 32 | In this document we work with a small set of ranges and 33 | illustrate basic intra-range operations reduce, disjoin, gaps. 34 | We then add strand and seqname information and show how 35 | resize and flank are useful for identifying TSS and promoter regions. 36 | 37 | ## A simple set of ranges 38 | 39 | ```{r newr} 40 | ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40), 41 | width = c(12, 6, 6, 15, 6, 2, 7)) 42 | ``` 43 | 44 | ```{r plotr,echo=FALSE} 45 | plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), 46 | col = "black", sep = 0.5, ...) 
47 | { 48 | height <- 1 49 | if (is(xlim, "Ranges")) 50 | xlim <- c(min(start(xlim)), max(end(xlim))) 51 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) 52 | plot.new() 53 | plot.window(xlim, c(0, max(bins)*(height + sep))) 54 | ybottom <- bins * (sep + height) - height 55 | rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...) 56 | title(main) 57 | axis(1) 58 | } 59 | 60 | plotGRanges = function (x, xlim = x, col = "black", sep = 0.5, xlimits = c(0, 61 | 60), ...) 62 | { 63 | main = deparse(substitute(x)) 64 | ch = as.character(seqnames(x)[1]) 65 | x = ranges(x) 66 | height <- 1 67 | if (is(xlim, "Ranges")) 68 | xlim <- c(min(start(xlim)), max(end(xlim))) 69 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) 70 | plot.new() 71 | plot.window(xlim = xlimits, c(0, max(bins) * (height + sep))) 72 | ybottom <- bins * (sep + height) - height 73 | rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, 74 | col = col, ...) 75 | title(main, xlab = ch) 76 | axis(1) 77 | } 78 | ``` 79 | 80 | Let's visualize `ir` and several intra-range operations. 81 | ```{r lkir,fig=TRUE, out.height="1100px"} 82 | par(mfrow=c(4,1), mar=c(4,2,2,2)) 83 | plotRanges(ir, xlim=c(0,60)) 84 | plotRanges(reduce(ir), xlim=c(0,60)) 85 | plotRanges(disjoin(ir), xlim=c(0,60)) 86 | plotRanges(gaps(ir), xlim=c(0,60)) 87 | ``` 88 | 89 | reduce(x) produces a set of 90 | nonoverlapping ranges that cover all positions covered by x. 91 | This can be used to reduce complexity of a gene model 92 | with many transcripts, where we may just want the addresses 93 | of intervals known to be transcribed, regardless of transcript 94 | of residence. 95 | 96 | disjoin(x) produces a set of ranges that cover all positions 97 | covered by x, such that none of the ranges in the 98 | disjoin output overlaps any end points of intervals in x. 
99 | This gives us the largest possible collection of contiguous 100 | intervals that are separated wherever the original set 101 | of intervals had an endpoint. 102 | 103 | gaps(x) produces a set of ranges covering the positions 104 | in [start(x), end(x)] that are not covered by any range in x. 105 | Given coding sequence addresses and exon intervals, this can 106 | be used to enumerate introns. 107 | 108 | # Extension to GRanges 109 | 110 | We add chromosome and strand information. 111 | 112 | ```{r dogr} 113 | library(GenomicRanges) 114 | gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3))) 115 | ``` 116 | 117 | Let's assume the intervals represent genes. 118 | The following plots illustrate the identification of 119 | transcription start sites (green), upstream promoter 120 | regions (purple), downstream promoter regions (brown). 121 | 122 | ```{r dopr,fig=TRUE, out.height="1100px", out.width="1100px"} 123 | par(mfrow=c(4,1), mar=c(4,2,2,2)) 124 | plotGRanges(gir, xlim=c(0,60)) 125 | plotGRanges(resize(gir,1), xlim=c(0,60), col="green") 126 | plotGRanges(flank(gir,3), xlim=c(0,60), col="purple") 127 | plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown") 128 | ``` 129 | 130 | Note that we do not need to take special steps to 131 | deal with the differences in strand. 
132 | -------------------------------------------------------------------------------- /bioc/operateGRanges.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "GRanges operations related to gene model, TSS, and promoter region identification" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | 16 | 17 | ```{r setup,echo=FALSE,results="hide"} 18 | suppressPackageStartupMessages({ 19 | library(BSgenome.Hsapiens.UCSC.hg19) 20 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 21 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 22 | library(Biostrings) 23 | library(GenomicRanges) 24 | library(IRanges) 25 | library(ph525x) 26 | library(Homo.sapiens) 27 | library(Gviz) 28 | }) 29 | ``` 30 | # Overview 31 | 32 | In this document we work with a small set of ranges and 33 | illustrate basic intra-range operations reduce, disjoin, gaps. 34 | We then add strand and seqname information and show how 35 | resize and flank are useful for identifying TSS and promoter regions. 36 | 37 | ## A simple set of ranges 38 | 39 | ```{r newr} 40 | ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40), 41 | width = c(12, 6, 6, 15, 6, 2, 7)) 42 | ``` 43 | 44 | ```{r plotr,echo=FALSE} 45 | plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), 46 | col = "black", sep = 0.5, ...) 47 | { 48 | height <- 1 49 | if (is(xlim, "Ranges")) 50 | xlim <- c(min(start(xlim)), max(end(xlim))) 51 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) 52 | plot.new() 53 | plot.window(xlim, c(0, max(bins)*(height + sep))) 54 | ybottom <- bins * (sep + height) - height 55 | rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...) 
56 | title(main) 57 | axis(1) 58 | } 59 | 60 | plotGRanges = function (x, xlim = x, col = "black", sep = 0.5, xlimits = c(0, 61 | 60), ...) 62 | { 63 | main = deparse(substitute(x)) 64 | ch = as.character(seqnames(x)[1]) 65 | x = ranges(x) 66 | height <- 1 67 | if (is(xlim, "Ranges")) 68 | xlim <- c(min(start(xlim)), max(end(xlim))) 69 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) 70 | plot.new() 71 | plot.window(xlim = xlimits, c(0, max(bins) * (height + sep))) 72 | ybottom <- bins * (sep + height) - height 73 | rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, 74 | col = col, ...) 75 | title(main, xlab = ch) 76 | axis(1) 77 | } 78 | ``` 79 | 80 | Let's visualize `ir` and several intra-range operations. 81 | ```{r lkir,fig=TRUE, out.height="1100px"} 82 | par(mfrow=c(4,1), mar=c(4,2,2,2)) 83 | plotRanges(ir, xlim=c(0,60)) 84 | plotRanges(reduce(ir), xlim=c(0,60)) 85 | plotRanges(disjoin(ir), xlim=c(0,60)) 86 | plotRanges(gaps(ir), xlim=c(0,60)) 87 | ``` 88 | 89 | reduce(x) produces a set of 90 | nonoverlapping ranges that cover all positions covered by x. 91 | This can be used to reduce complexity of a gene model 92 | with many transcripts, where we may just want the addresses 93 | of intervals known to be transcribed, regardless of transcript 94 | of residence. 95 | 96 | disjoin(x) produces a set of ranges that cover all positions 97 | covered by x, such that none of the ranges in the 98 | disjoin output overlaps any end points of intervals in x. 99 | This gives us the largest possible collection of contiguous 100 | intervals that are separated wherever the original set 101 | of intervals had an endpoint. 102 | 103 | gaps(x) produces a set of ranges covering the positions 104 | in [start(x), end(x)] that are not covered by any range in x. 105 | Given coding sequence addresses and exon intervals, this can 106 | be used to enumerate introns. 107 | 108 | # Extension to GRanges 109 | 110 | We add chromosome and strand information. 
111 | 112 | ```{r dogr} 113 | library(GenomicRanges) 114 | gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3))) 115 | ``` 116 | 117 | Let's assume the intervals represent genes. 118 | The following plots illustrate the identification of 119 | transcription start sites (green), upstream promoter 120 | regions (purple), downstream promoter regions (brown). 121 | 122 | ```{r dopr,fig=TRUE, out.height="1100px", out.width="1100px"} 123 | par(mfrow=c(4,1), mar=c(4,2,2,2)) 124 | plotGRanges(gir, xlim=c(0,60)) 125 | plotGRanges(resize(gir,1), xlim=c(0,60),col="green") 126 | plotGRanges(flank(gir,3), xlim=c(0,60), col="purple") 127 | plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown") 128 | ``` 129 | 130 | Note that we do not need to take special steps to 131 | deal with the differences in strand. 132 | -------------------------------------------------------------------------------- /bioc/reading_microarray_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Reading in microarray data 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Affymetrix CEL files 12 | 13 | We start by reading in the sample information table. This is usually created by the person who performed the experiment. 14 | 15 | The raw data files for this lab are in the `rawdata` repository, available here: 16 | 17 | <https://github.com/genomicsclass/rawdata> 18 | 19 | Click Download ZIP in order to download all the files, then unzip this file, which should result in a `rawdata-master` folder. Make sure this folder is in your current working directory. 20 | 21 | First we save the initial working directory, so we can return to it. 
22 | 23 | ```{r} 24 | wd <- getwd() 25 | ``` 26 | 27 | Now we can start reading in the files: 28 | 29 | ```{r} 30 | datadir <- paste0(wd, "/rawdata-master") 31 | basedir <- paste0(datadir, "/celfiles") 32 | setwd(basedir) 33 | library(affy) 34 | tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE) 35 | rownames(tab) <- tab$filenames 36 | tab 37 | fns <- list.celfiles(basedir) 38 | fns 39 | fns %in% tab[,1] ##check 40 | ab <- ReadAffy(phenoData=tab) 41 | ``` 42 | 43 | This creates an AffyBatch object which contains the information you need. (These commands may download some annotation packages to interpret the arrays.) 44 | 45 | ```{r} 46 | dim(pm(ab)) 47 | dim(pData(ab)) 48 | annotation(ab) 49 | ``` 50 | 51 | You can then preprocess this object with RMA: 52 | ```{r} 53 | e <- rma(ab) 54 | ``` 55 | 56 | Now we go back to the previous working directory. 57 | 58 | ```{r} 59 | setwd(wd) 60 | ``` 61 | 62 | If you are not interested in probe level data you can use this function: 63 | 64 | ```{r} 65 | setwd(basedir) 66 | ejust <- justRMA(filenames=tab[,1],phenoData=tab) 67 | dim(ejust) 68 | ``` 69 | 70 | 71 | ## Agilent data 72 | 73 | ```{r} 74 | library(limma) 75 | library(rafalib) 76 | basedir <- paste0(datadir, "/agilent") 77 | setwd(basedir) 78 | targets <- readTargets("TargetBeta7.txt") 79 | RG <- read.maimages(targets$FileName, source="genepix") 80 | MA <- MA.RG(RG,bc.method="none") 81 | mypar(1,1) 82 | imageplot(MA$M[,2], RG$printer, zlim=c(-3,3)) 83 | dev.off() 84 | ``` 85 | 86 | 87 | Now we go back to the previous working directory. 
88 | 89 | ```{r} 90 | setwd(wd) 91 | ``` 92 | 93 | 94 | 95 | ## oligo 96 | We can also use oligo to read affy arrays 97 | 98 | ```{r} 99 | detach("package:affy") 100 | library(oligo) 101 | basedir <- paste0(datadir,"/celfiles") 102 | setwd(basedir) 103 | tab <- read.delim("sampleinfo.txt",check.names=FALSE,as.is=TRUE) 104 | fns <- list.celfiles(listGzipped=TRUE) 105 | fns %in% tab[,1] ##check 106 | pd <- as(tab, "AnnotatedDataFrame") 107 | efs <- read.celfiles(filenames=tab[,1],phenoData=pd,sampleNames=sampleNames(pd)) 108 | ``` 109 | 110 | ```{r} 111 | e <- rma(efs) 112 | ``` 113 | -------------------------------------------------------------------------------- /bioc/seq4motif.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Genomic sequence -- utility for motif checking" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | ```{r setup,echo=FALSE,results="hide",message=FALSE} 16 | library(GenomicFeatures) 17 | library(GenomicRanges) 18 | library(IRanges) 19 | library(devtools) 20 | library(ERBS) 21 | library(Homo.sapiens) 22 | ``` 23 | 24 | # Overview 25 | 26 | In this document we'll show how to look for occurrences 27 | of a binding motif in genomic sequence underlying binding peaks. 28 | 29 | Recall that we have the ER binding peaks for two cell 30 | lines in the ERBS package. We'll focus on HepG2 31 | 32 | ```{r pkgs} 33 | library(ERBS) 34 | data(HepG2) 35 | HepG2 36 | ``` 37 | 38 | We'd like to look at the genomic sequence underneath the peaks 39 | and inspect it for the binding motif "TCAAGGTCA". This is 40 | easy to do with the Biostrings and BSGenome infrastructure. 41 | 42 | # Reference genomic sequence for humans 43 | 44 | We'll work with hg19. 
The BSgenome... package will 45 | create variable `Hsapiens` on attachment. 46 | This variable gives a metadata report. 47 | 48 | ```{r gethg} 49 | library(BSgenome.Hsapiens.UCSC.hg19) 50 | Hsapiens 51 | ``` 52 | 53 | The reference sequence for a chromosome can be obtained 54 | with the $ operator. 55 | 56 | ```{r getch} 57 | Hsapiens$chr17 58 | ``` 59 | 60 | # Targeted retrieval of reference sequence 61 | 62 | The getSeq function obtains sequence 63 | corresponding to addresses listed in GRanges. 64 | We'll obtain the sequence under the peaks as 65 | `hepseq`, and a set of control sequences of similar 66 | lengths obtained by shifting the binding peak intervals 67 | by 2500 bases and obtaining the reference sequence in the 68 | shifted intervals. 69 | 70 | ```{r getsq} 71 | hepseq = getSeq(Hsapiens, HepG2) 72 | rhepseq = getSeq(Hsapiens, shift(HepG2,2500)) 73 | hepseq 74 | ``` 75 | 76 | # Counting motif occurrences 77 | 78 | We count the occurrences of the ESRRA 79 | binding motif 80 | "TCAAGGTCA" in the bound intervals (and their reverse complement 81 | representation). This is compared to the frequency of occurrence in the 82 | control sequences. We'll use the `vcountPattern` function of the 83 | Biostrings package to carry this out. 84 | 85 | ```{r lk1} 86 | sum(vcountPattern("TCAAGGTCA", hepseq))+sum(vcountPattern("TCAAGGTCA", 87 | reverseComplement(hepseq))) 88 | sum(vcountPattern("TCAAGGTCA", rhepseq))+sum(vcountPattern("TCAAGGTCA", 89 | reverseComplement(rhepseq))) 90 | ``` 91 | 92 | We see a 9-fold increase in occupancy in the bound regions compared 93 | to the shifted regions. This is not the way one assesses motif occurrences. 94 | First, the motif is generally represented as a model and not a string. 95 | The model is typically expressed as a position weight matrix (PWM). 96 | Second, the most common software tools for evaluating motif enrichment are 97 | MEME and FIMO; matchPWM of the Biostrings package can perform similar analyses. 
98 | package can also 99 | -------------------------------------------------------------------------------- /bioc/storage/EDA_plots_for_microarray.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Exploratory Data Analysis for microarray 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | # EDA for Microarray data 12 | Here we are analyzing microarray data from eight samples: two groups of four. A first step in any analysis of genomics data is to learn its general properties and search for problematic samples. By viewing the data from the first sample we immediately notice that over 90% of data is below 1,000 and the remaining 10% spans values up to 40,000. By taking the log we get a better picture of the distribution. We use base 2 because memorizing the powers of 2 is easy. It gives us a friendly range: 4-16. 13 | 14 | ```{r,fig.width=6, fig.height=3} 15 | # BiocManager::install("genomicsclass/SpikeInEDA") 16 | library(SpikeInEDA) 17 | data(SpikeInEDA) 18 | par(mfrow=c(1,2)) 19 | hist(int[,1]) 20 | hist(log2(int[,1])) 21 | ``` 22 | 23 | Next we look at all eight histograms simultaneously. To facilitate this we introduce the _density estimator_ or _smooth histogram_. Basically we create a histogram, draw a smooth curve through the top of the bars, and keep that curve. This permits us to put several histograms on the same page: 24 | 25 | ```{r,fig.width=3, fig.height=3} 26 | par(mfrow=c(1,1)) 27 | for(i in 1:ncol(int)) 28 | if(i==1) plot(density(log2(int[,i])),col=(i==4)+1) else lines(density(log2(int[,i])),col=(i==4)+1) 29 | ``` 30 | Note that one histogram (we higlighted it by making it red) looks different: it has a different shape from the rest. So is this sample different from the rest in any significant way? 
If we compute the correlation between this sample and the rest, the values are not very different and are all very high. 31 | ```{r} 32 | signif(cor(int),2) 33 | ``` 34 | The problem is not immediately obvious from a scatter plot. 35 | ```{r,fig.width=6, fig.height=3} 36 | ##we don't need to show all the points so we take samples (at most 10000, fewer if x is shorter) 37 | library(rafalib) 38 | splot<-function(x,y,...){ 39 | ind<-sample(length(x),min(length(x),10000)) 40 | x=x[ind];y=y[ind] 41 | plot(x,y,...) 42 | } 43 | mypar(1,2) 44 | splot(log2(int[,1]),log2(int[,2])) 45 | splot(log2(int[,1]),log2(int[,4])) 46 | ``` 47 | Note that samples 1 through 4 are replicates and should produce the same values up to measurement error. Scatterplots and correlation are not the best tools to detect problems. Note for example that 1,2,3,4 and 100,200,300,400, two lists with very different values, have perfect correlation. A better measure is the differences between the values and therefore a better plot is a rotation of the scatter plot containing the differences (log ratios) on the y-axis and the averages (in the log scale) on the x-axis. This plot is referred to as an MA-plot. 48 | ```{r,fig.width=6, fig.height=3} 49 | maplot<- function(x,y,...) splot((x+y)/2,y-x,...) 50 | mypar(1,3) 51 | maplot(log2(int[,1]),log2(int[,2]),xlab="A",ylab="M",ylim=c(-2,2)) 52 | maplot(log2(int[,1]),log2(int[,3]),xlab="A",ylab="M",ylim=c(-2,2)) 53 | maplot(log2(int[,1]),log2(int[,4]),xlab="A",ylab="M",ylim=c(-2,2)) 54 | ``` 55 | Now the problem is obvious. It turns out this sample comes from an array for which a spatial problem can be detected at the original image level. We actually have the grid locations for these measurements and can recreate the image. 
56 | 57 | ```{r, fig.width=6, fig.height=3} 58 | ##we are doing this for two arrays 1 and 4 59 | library(matrixStats) ##need rowMedians 60 | library(RColorBrewer) 61 | for(i in c(1,4)){ 62 | r=log2(int[,i])-rowMedians(log2(int)) 63 | ## r are residuals from median array 64 | ## to avoid outliers taking over colors of image 65 | ### define a MAX 66 | MAX<-1 67 | r[r>MAX]<-MAX 68 | r[r< -MAX] <- -MAX 69 | ##we know that every other column is skipped 70 | mat=matrix(NA,max(locations[,1]),max(locations[,2]+1)/2) 71 | for(j in 1:nrow(locations)){ 72 | mat[locations[j,1],(locations[j,2]+1)/2]<-r[j] 73 | } 74 | image(mat,col=brewer.pal(11,"RdBu")) 75 | } 76 | ``` 77 | 78 | On the second image we can clearly see the spatial pattern (blue are positive residuals, red are negative) 79 | -------------------------------------------------------------------------------- /bioc/storage/GEOquery.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Downloading data from GEO using GEOquery 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Example of how to download CEL files from GEO 12 | 13 | ## contributed by Stephanie Hicks 14 | 15 | If the `GEOquery` R/Bioconductor package is not installed, use `BiocManager::install()` to install the package: 16 | ```{r, eval=FALSE} 17 | if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager") 18 | BiocManager::install("GEOquery") 19 | ``` 20 | 21 | Load the `GEOquery` R/Bioconductor package: 22 | ```{r, message=FALSE} 23 | library(GEOquery) 24 | ``` 25 | 26 | 27 | ### Access the GEO Series Data 28 | To access the GEO Sample (GSM), GEO Series (GSE) (lists of GSM files that together form a single experiment) or GEO Dataset (GDS), use the function `getGEO()` which returns a list of ExpressionSets: 29 | ```{r, message=FALSE, eval=FALSE} 30 | ###This will download a 20 
Mb 31 | gse <- getGEO("GSE21653", GSEMatrix=TRUE) 32 | show(gse) 33 | ``` 34 | 35 | 36 | ### Accessing raw data from GEO 37 | If raw data such as .CEL files exist on GEO, you can easily access this data using the `getGEOSuppFiles()` function. The function takes in a GEO accession as the argument and will download all the raw data associated with that accession. By default the `getGEOSuppFiles()` function will create a directory within the current working directory to store the raw data. Here, the file paths of the downloaded files (often with a .tar extension) are stored in a data frame called `filePaths`. 38 | 39 | ```{r,eval=FALSE} 40 | filePaths = getGEOSuppFiles('GSE21653') 41 | filePaths 42 | ``` 43 | From here you can use, for example, `ReadAffy()` to read in the CEL files. 44 | 45 | 46 | ### Access GSE Data Tables from GEO 47 | To access the phenotypic information about the samples, the best way is to use the `getGEO()` function to obtain the GSE object and then extract the phenoData object from that. Unfortunately this means downloading the entire GSE Matrix file. 48 | 49 | ```{r,eval=FALSE} 50 | dim(pData(gse[[1]])) 51 | head(pData(gse[[1]])[,1:3]) 52 | ``` 53 | 54 | Sometimes GSEs include separate data tables with the sample information. If these exist, you can use the `getGSEDataTables()` function. For example here is the phenoData object from a different GSE accession GSE3494 with a Data Table. 
55 | ```{r} 56 | df1 <- getGSEDataTables("GSE3494") 57 | lapply(df1,head) 58 | ``` 59 | -------------------------------------------------------------------------------- /bioc/storage/anno1refbuilds.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Chromosomes and their substructures 1: Reference genomes" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | ```{r setup,echo=FALSE,results="hide"} 16 | suppressPackageStartupMessages({ 17 | library(BSgenome.Hsapiens.NCBI.GRCh38) 18 | library(Biostrings) 19 | library(GenomicRanges) 20 | library(IRanges) 21 | }) 22 | ``` 23 | 24 | # Genomic sequence, reference builds 25 | 26 | ## Human 27 | 28 | The genomic sequence for humans has recently 29 | been revised. We can use the most recent major 30 | revision as follows: 31 | 32 | ```{r hg38} 33 | library(BSgenome.Hsapiens.NCBI.GRCh38) 34 | Hsapiens 35 | h38 = Hsapiens # for later 36 | ``` 37 | 38 | Notice the number of sequences reported, and their names. We can 39 | get the sequence for a chromosome by using list-like 40 | syntax with `Hsapiens`. 41 | 42 | ```{r lkc22} 43 | h38$"22" 44 | ``` 45 | 46 | This shows that the starting and ending bases are indeterminate. 47 | We can obtain the overall nucleotide frequencies as 48 | 49 | ```{r lkf} 50 | alphabetFrequency(Hsapiens$"22") 51 | ``` 52 | 53 | A great deal of reference data in use are annotated to 54 | build hg19 (also known as GRCh37). 55 | 56 | ```{r lk19} 57 | library(BSgenome.Hsapiens.UCSC.hg19) 58 | Hsapiens 59 | h19 = Hsapiens 60 | ``` 61 | 62 | Note that there is a different sequence naming convention 63 | and a different number of sequences managed in this build. 
64 | 65 | 66 | ## Other organisms 67 | 68 | If you have an internet connection, the `available.genomes` function 69 | will list packages that contain reference sequences. 70 | 71 | ```{r lkav, eval=FALSE} 72 | available.genomes() 73 | ``` 74 | 75 | For organisms not covered at present by the project, tools 76 | for building compatible packages are available in the 77 | BSgenome package (see the BSgenomeForge vignette). 78 | 79 | 80 | -------------------------------------------------------------------------------- /bioc/storage/anno2Biostrings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Chromosomes and their substructures 2: Biostrings" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | ```{r setup,echo=FALSE,results="hide"} 16 | suppressPackageStartupMessages({ 17 | library(BSgenome.Hsapiens.NCBI.GRCh38) 18 | library(Biostrings) 19 | library(GenomicRanges) 20 | library(IRanges) 21 | }) 22 | ``` 23 | 24 | 25 | # Biostrings: basic infrastructure for computing on sequences 26 | 27 | ## Construction, sets, restricted alphabets 28 | 29 | Very large strings like chromosome sequences receive 30 | special handling in Bioconductor. We use a general container 31 | class called `BString` for "big" strings that are 32 | distinguished from R character vectors in that BStrings a) obey 33 | different rules for copying and b) do not contain multiple 34 | strings (see the man page for BString). Classes `DNAString` 35 | and `AAString` have restrictions on the characters that can be 36 | managed in instances. 
37 | 38 | ```{r lkbs} 39 | library(Biostrings) 40 | bdemo = BString("BCDEF") 41 | ddemo = try(DNAString("BCDEF")) 42 | cat(ddemo) 43 | ademo = try(AAString("BCDEF")) 44 | ``` 45 | 46 | Efficient management of multiple strings employs classes with 47 | "Set" as suffix. 48 | ```{r lkds} 49 | ddem2 = DNAStringSet(c("ACTG", "GTCAG")) 50 | ddem2 51 | ``` 52 | 53 | The restrictions on contents of genomic strings are defined 54 | in constant vectors in `Biostrings`. For example 55 | ```{r lkcon} 56 | AA_ALPHABET 57 | IUPAC_CODE_MAP 58 | ``` 59 | 60 | ## Operations 61 | 62 | There are over 200 functions defined in the Biostrings package, 63 | all devoted to computation on sequence data. Here's an 64 | example illustrating basic notions. 65 | 66 | ```{r doop} 67 | D = DNAString("ACTGACGTACGTAGGCTAGCGATCGATATACGATATACG") 68 | translate(D) 69 | codons(D) 70 | ``` 71 | 72 | Notice that the output of codons is printed as a `Views` instance. 73 | This is a very efficient approach to creating references to 74 | subsequences of a sequence, without copying any data. 
75 | -------------------------------------------------------------------------------- /bioc/storage/anno4liftover.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Chromosomes and their substructures 4: Translating addresses between genome builds" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | 16 | ```{r setup,echo=FALSE,results="hide"} 17 | suppressPackageStartupMessages({ 18 | library(BSgenome.Hsapiens.UCSC.hg19) 19 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 20 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 21 | library(Biostrings) 22 | library(GenomicRanges) 23 | library(IRanges) 24 | library(ph525x) 25 | library(Homo.sapiens) 26 | library(rtracklayer) 27 | }) 28 | ``` 29 | 30 | # Translating addresses between genome builds: liftOver 31 | 32 | The rtracklayer package includes an interface to the 33 | liftOver utilities developed for the UCSC genome browser. 34 | The idea is that a collection of local alignments 35 | can be defined and used to remap coordinates from 36 | one reference build to another. 37 | 38 | We can illustrate this with gene addresses created for hg38, 39 | the current reference build. We want to translate them 40 | for comparison to addresses asserted for hg19. 41 | 42 | We need a "chain file", uncompressed. You can 43 | get it from the following URL, and use gunzip on your 44 | system to uncompress in your home dir, if you would 45 | like to emulate the commands below. 
46 | 47 | "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz" 48 | 49 | ```{r domyimport} 50 | library(rtracklayer) 51 | ch = import.chain("~/hg38ToHg19.over.chain") 52 | ch 53 | str(ch[[1]]) 54 | ``` 55 | 56 | Let's get the addresses for genes on chromosome 1 57 | in hg38. 58 | 59 | ```{r get38} 60 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 61 | tx38 = TxDb.Hsapiens.UCSC.hg38.knownGene 62 | seqlevels(tx38, force=TRUE) = "chr1" 63 | g1_38 = genes(tx38) 64 | ``` 65 | 66 | Now execute the liftOver: 67 | 68 | ```{r doli} 69 | g1_19L = liftOver(g1_38, ch) 70 | ``` 71 | 72 | The result is a list of GRanges, one for 73 | each translation event. 74 | 75 | ```{r lktx} 76 | g1_19L 77 | ``` 78 | 79 | Verification of accuracy of translation is covered in exercises. 80 | -------------------------------------------------------------------------------- /bioc/storage/annoPhen.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Annotating phenotypes and molecular function" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | --- 8 | 9 | ```{r options, echo=FALSE} 10 | library(knitr) 11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 12 | ``` 13 | 14 | # The phenotype concept 15 | 16 | 17 | - "Phenotype" is an extremely broad term 18 | - In this course, it connotes low-dimensional representation of observable characteristics of an organism 19 | - Representation can be numerical or categorical 20 | * units of measurement should be recorded 21 | * "codes" for categorical items should be clear 22 | 23 | ## Example: ExperimentalData package on COPD 24 | 25 | We cross-tabulate gender and disease status for individuals in a study of chronic obstructive pulmonary disease 26 | 27 | ```{r dodim} 28 | library(COPDSexualDimorphism.data) 29 | data(lgrc.expr.meta) 30 | with(expr.meta, table(gender, 
diagmaj)) 31 | ``` 32 | ## Continuous by categorical 33 | 34 | Here's a boxplot of pack-years distributions, stratified by 35 | gender and disease status. The stratum labels become clumsy. 36 | 37 | ```{r lkbx, fig.width=8,fig.height=4.5,dpi=300,out.width="1920px",height="1080px",} 38 | gd = with(expr.meta, factor(paste(gender,diagmaj))) 39 | expr.meta$gd = gd 40 | library(ggplot2) 41 | ggplot(expr.meta, aes(x=gd, y=pkyrs)) + geom_boxplot() 42 | #plot(pkyrs~gd, data=expr.meta) 43 | ``` 44 | ## Phenotype carefully and record faithfully 45 | 46 | - Validated questionnaires and protocols 47 | - Standardized terminology, units 48 | - Precise phenotypic characterization fosters more accurate mechanistic modeling 49 | - Caveat: molecular "basis" suggests causal directionality, but phenotype and environment can influence molecular state 50 | 51 | # Computing tools for inference on molecular mechanisms 52 | 53 | - "Molecular basis" is likewise a broad notion 54 | - Systematic terminologies exist to help clarify what is asserted in a given hypothesis or finding 55 | - At the boundaries of scientific knowledge, disagreement is common and terminologies diverge 56 | - Two examples: 57 | * What is a gene? 58 | * What is a gene's function? 
59 | 60 | # Gene: A concrete computational definition 61 | 62 | - ORMDL3 is a gene implicated in genome-wide association 63 | studies as a factor in risk of asthma 64 | - Here's a view of its "structure" according to human reference build hg19 (use ph525x::modPlot) 65 | ```{r domo,fig.height=4,fig.width=7} 66 | library(ph525x) 67 | modPlot("ORMDL3", collapse=FALSE, useGeneSym=FALSE) 68 | ``` 69 | * This will change with new reference build GRCh38 70 | 71 | # Characterizing ORMDL3 functionality 72 | 73 | ```{r dohum} 74 | library(Homo.sapiens) 75 | orfunc = select(Homo.sapiens, key="ORMDL3", keytype="SYMBOL", 76 | columns=c("GO", "TERM")) 77 | orfunc[,c("ONTOLOGY", "TERM")] 78 | ``` 79 | - Gene Ontology standardizes terminology for biological processes, cellular components, and molecular functions 80 | 81 | # Summary 82 | 83 | * Phenotype characterization is challenging and frequently non-standard 84 | * Tokens available for data analysis in R are fairly simple and are used in ad hoc ways to characterize sample phenotype and condition 85 | * Reasoning about molecular processes underlying phenotype and disease states is intrinsically complex 86 | * Standardized vocabularies and models exist and are available in Bioconductor, but limitations must be admitted 87 | 88 | -------------------------------------------------------------------------------- /bioc/storage/chromComp.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Computing with chromosomes and variants" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | --- 8 | 9 | ```{r options, echo=FALSE} 10 | library(knitr) 11 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 12 | ``` 13 | 14 | # Overview 15 | 16 | We will consider how to do various very high-level tasks with chromosomes and variants in Bioconductor. 
17 | 18 | - listing packages representing reference builds for humans and model organisms 19 | - acquiring human reference genome sequence 20 | - finding views of genes as sequences 21 | - examining the dbSNP catalog of small variants in populations of human genomes 22 | - examining the NHGRI GWAS catalog of associations between variants and phenotypes 23 | 24 | # BSgenome and available genomes 25 | 26 | ```{r sillib,echo=FALSE,results="hide"} 27 | suppressPackageStartupMessages({ 28 | library(IRanges) 29 | library(BSgenome) 30 | library(BSgenome.Hsapiens.UCSC.hg19) 31 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 32 | library(SNPlocs.Hsapiens.dbSNP.20120608) 33 | library(gwascat) 34 | library(ggbio) 35 | }) 36 | ``` 37 | 38 | ```{r lkbs} 39 | library(BSgenome) 40 | head(available.genomes()) # requires internet access 41 | grep("Hsapiens", available.genomes(), value=TRUE) 42 | ``` 43 | 44 | # The human reference sequence, build hg19; gene sequences 45 | 46 | ```{r lkhs} 47 | library(BSgenome.Hsapiens.UCSC.hg19) 48 | Hsapiens 49 | c17 = Hsapiens$chr17 50 | c17 51 | ``` 52 | 53 | The class of `c17` is `r class(c17)`. This is a full in-memory representation of all the bases of the chromosome. We can work with substructures of interest without duplicating the contents of memory devoted to the sequence. We'll obtain a view of coding sequences of genes on chromosome 17. To do this we will employ a special transcript database structure. 54 | 55 | ```{r gettx} 56 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 57 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene 58 | txdb 59 | ``` 60 | 61 | We are only interested in information on chr17 at the moment. We establish chr17 as the active sequence in this transcript database 62 | ```{r settx} 63 | tmp = isActiveSeq(txdb) 64 | tmp[] = FALSE # turn all off 65 | tmp[17] = TRUE # turn 17 on 66 | isActiveSeq(txdb) = tmp 67 | g17 = genes(txdb) 68 | g17 69 | ``` 70 | 71 | Now we make a structure that has addresses and sequences of genes. 
72 | 73 | ```{r getv} 74 | gs17 = getSeq(Hsapiens, g17) 75 | gs17 76 | ``` 77 | 78 | In the next version of Bioconductor this can be accomplished somewhat 79 | more efficiently using "Views()". 80 | 81 | # dbSNP 82 | 83 | We have an image of the dbSNP variant catalog for hg19. The information retained is limited to the dbSNP identifier, chromosome location, and variant content. 84 | 85 | ```{r dodb} 86 | library(SNPlocs.Hsapiens.dbSNP.20120608) 87 | sl17 = getSNPlocs("ch17", as.GRanges=TRUE) 88 | sl17 89 | ``` 90 | 91 | The allele codes are translated by the IUPAC map. 92 | ```{r lkal} 93 | IUPAC_CODE_MAP 94 | ``` 95 | 96 | # GWAS catalog 97 | 98 | National Human Genome Research Institute maintains a listing of genetic association studies that have found significant associations between DNA variants and major phenotypes and diseases. Inclusion in the catalog requires that the findings be replicated in an independent population. 99 | 100 | ```{r lkgw} 101 | library(gwascat) 102 | data(gwrngs19) # for hg19 103 | gwrngs19 104 | ``` 105 | 106 | A simple display of associations and phenotypes is available 107 | with the `traitsManh` function. 108 | 109 | ```{r lkg2,fig=TRUE} 110 | example(traitsManh) 111 | ``` 112 | -------------------------------------------------------------------------------- /bioc/storage/chromIntro.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | --- 4 | layout: page 5 | title: "Introductory problems with chromosomes and variants" 6 | Author: "Vince Carey" 7 | --- 8 | 9 | # The composition of a gene 10 | 11 | We can obtain chromosomal sequence for all genes 12 | on chr17 as follows. 
13 | 14 | ```{r getviews} 15 | library(BSgenome.Hsapiens.UCSC.hg19) 16 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 17 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene 18 | aseq = isActiveSeq(txdb) 19 | aseq[] = FALSE 20 | aseq["chr17"] = TRUE 21 | isActiveSeq(txdb) = aseq 22 | gs17 = genes(txdb) 23 | gsv17 = Views(Hsapiens, gs17) 24 | gsv17 25 | ``` 26 | 27 | What is the distribution of nucleotide counts for gene 28 | ORMDL3? We need to obtain the ENTREZID: 29 | ```{r gettag} 30 | library(Homo.sapiens) 31 | eid = select(Homo.sapiens, keys="ORMDL3", keytype="SYMBOL", 32 | columns="ENTREZID")$ENTREZID 33 | ``` 34 | Now tabulate nucleotides: 35 | ```{r dotab} 36 | alphabetFrequency( gsv17[ which(mcols(gsv17)$gene_id == eid) ] ) 37 | ``` 38 | 39 | # Determination of the alternate allele 40 | 41 | rs145615430 is a SNP on chr17. What is the alternate allele? 42 | 43 | ```{r dolo} 44 | library(SNPlocs.Hsapiens.dbSNP.20120608) 45 | s17 = getSNPlocs("ch17") 46 | head(s17) 47 | ``` 48 | 49 | We see that it is at base 56 on chr17. 50 | 51 | ```{r getch} 52 | c17 = Hsapiens$chr17 53 | substr(c17, 56, 56) 54 | ``` 55 | The IUPAC code is Y, indicating a CT diallele, so the alternate 56 | is T. 57 | 58 | The associated dbSNP record indicates no frequency data available. 59 | 60 | # SNPs in ORMDL3 61 | 62 | Are there population-level polymorphisms in the coding region 63 | of ORMDL3? We can use GRanges to investigate. 64 | ```{r dopo} 65 | orgr = granges( gsv17[ which(mcols(gsv17)$gene_id == eid) ] ) 66 | s17r = getSNPlocs("ch17", as.GRanges=TRUE) 67 | seqlevelsStyle(s17r) = "UCSC" 68 | genome(s17r) = genome(orgr) 69 | seqlevels(s17r) = seqlevels(orgr) = "chr17" 70 | fo = findOverlaps(s17r, orgr, ignore.strand=TRUE) 71 | s17r[queryHits(fo)] # s17r is the query in findOverlaps(), so its overlapping SNPs are selected with queryHits; subjectHits would index orgr 72 | ``` 73 | 74 | # GWAS hits for ORMDL3 75 | 76 | We can see the traits of GWAS in which ORMDL3 was implicated. 
77 | 78 | ```{r lkgw} 79 | library(gwascat) 80 | data(gwrngs19) 81 | gwrngs19[ grep("ORMDL3", mcols(gwrngs19)$Reported.Gene.s) ] 82 | ``` 83 | -------------------------------------------------------------------------------- /bioc/storage/mapping_features.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Mapping features to genes 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Using Bioconductor annotation packages 12 | 13 | This unit will focus on mapping features to genes, i.e., getting annotation information from one format to another. We start by loading in the `maPooling` dataset from previous lectures. 14 | 15 | ```{r} 16 | # library(devtools) 17 | # install_github("dagdata","genomicsclass") 18 | library(dagdata) 19 | library(Biobase) 20 | data(maPooling) 21 | e <- maPooling 22 | head(rownames(e)) 23 | annotation(e) 24 | ``` 25 | 26 | The annotation for this ExpressionSet is *rae230a*. Many platforms will have database annotation packages already existing on Bioconductor. We can access these, first by installing, and then loading the library. We will use the `AnnotationDbi` package to query the information in the library. 27 | 28 | While in this unit we will use a microarray annotation package as an example, the same commands can be used for an organism package, such as the Homo sapiens annotation package `org.Hs.eg.db`, which lets one query from one kind of gene annotation to another. 29 | 30 | ```{r} 31 | # biocLite(paste0(annotation(e),".db")) 32 | library(rae230a.db) 33 | # biocLite("AnnotationDbi") 34 | library(AnnotationDbi) 35 | ``` 36 | 37 | Annotation packages have *columns*, some of which may be *keys*. You can query the database using a *key*, and ask for one or more *columns* in return. 
We will use the rownames of the ExpressionSet as keys. 38 | 39 | ```{r} 40 | columns(rae230a.db) 41 | keytypes(rae230a.db) 42 | head(keys(rae230a.db, keytype="PROBEID")) 43 | head(rownames(e)) 44 | ``` 45 | 46 | The following `select` call will return the Entrez ID, ENSEMBL ID, and gene symbol for each Probe ID, which are the rownames of the ExpressionSet. 47 | 48 | ```{r} 49 | res <- select(rae230a.db, keys=rownames(e), 50 | columns=c("ENTREZID","ENSEMBL","SYMBOL"), 51 | keytype="PROBEID") 52 | head(res) 53 | idx <- match(rownames(e), res$PROBEID) 54 | ``` 55 | 56 | We need to align the `res` object so that we pull out, in order, one row for each row of the ExpressionSet. 57 | 58 | ```{r} 59 | head(rownames(e)) 60 | head(res$PROBEID,7) 61 | head(idx) 62 | ``` 63 | 64 | Here we add the new information to the `fData` of `e`. If there were already information in `fData`, we would have used `cbind` to add the new columns. Note here that, since we have a one-to-many mapping, the `match` function gave us the first match that it found. You could also collapse all possible matches of the Probe ID to the Genes using `split` and `paste` with the `collapse` argument. However, here we keep it simple and just take the first match in the `res` object. 65 | 66 | ```{r} 67 | fData(e) <- res[idx,] 68 | head(fData(e),10) 69 | all.equal(fData(e)$PROBEID, rownames(e)) 70 | ``` 71 | 72 | ## Using Biomart 73 | 74 | An alternate way to map from one annotation to another is using the `biomaRt` package. For more information on which Biomarts are available and how to access them, see the `biomaRt` vignette. 
75 | 76 | ```{r} 77 | # biocLite("biomaRt") 78 | library(biomaRt) 79 | # vignette("biomaRt") 80 | m <- useMart( "ensembl", dataset = "rnorvegicus_gene_ensembl") 81 | map <- getBM(mart = m, 82 | attributes = c("ensembl_gene_id", "entrezgene"), 83 | filters = "ensembl_gene_id", 84 | values = fData(e)$ENSEMBL) 85 | head(map) 86 | ``` 87 | 88 | Finally, we need to align the new information with the old information using the `match` function as before, again picking the first match from a one-to-many mapping. We see that for the most part the new and the old Entrez IDs are the same, though some differences occur when we pick one from the one-to-many mappings that exist. 89 | 90 | 91 | ```{r} 92 | idx <- match(fData(e)$ENSEMBL, map$ensembl_gene_id) 93 | fData(e)$NEW_ENTREZID <- map$entrezgene[idx] 94 | head(fData(e)) 95 | mean(fData(e)$ENTREZID == fData(e)$NEW_ENTREZID, na.rm=TRUE) 96 | ``` 97 | 98 | -------------------------------------------------------------------------------- /bioc/storage/probeSearch.Rmd: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | layout: page 4 | title: "Searching the reference genome for array probe sequences" 5 | Author: "Vince Carey" 6 | --- 7 | 8 | # Introduction 9 | 10 | Classic Affymetrix expression arrays are known as "3'-biased". 11 | This is because the probe sequences used were selected primarily 12 | from 13 | sequences constituting the 3' untranslated region of mammalian 14 | genes. In this document we'll see how Bioconductor's 15 | annotation facilities can be used to check asserted locations 16 | of array probes. 17 | 18 | # The probe packages; sequence for a gene 19 | 20 | With Affymetrix expression arrays 21 | the primary unit intended for analysis is the probe 22 | set used for mRNA abundance quantification. 23 | Probe sequences are provided in Bioconductor's *probe 24 | package series. 
25 | 26 | ```{r quietatt, echo=FALSE, results="hide"} 27 | options(width=90) 28 | suppressPackageStartupMessages({ 29 | library(hgu133plus2probe) # probe package 30 | library(hgu133plus2.db) # ChipDb package, annotation mapping 31 | library(dplyr) 32 | library(Biostrings) 33 | library(BSgenome.Hsapiens.UCSC.hg19) 34 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 35 | library(lumiHumanAll.db) 36 | library(lumi) 37 | library(SNPlocs.Hsapiens.dbSNP.20120608) 38 | library(GenomeInfoDb) 39 | }) 40 | ``` 41 | ```{r lkp} 42 | library(hgu133plus2probe) # probe package 43 | library(hgu133plus2.db) # ChipDb package, annotation mapping 44 | ``` 45 | 46 | We'll use the `select` method to find identifiers for 47 | a gene of interest in various disease processes, BCL2L2. 48 | ```{r getid} 49 | AnnotationDbi::select(hgu133plus2.db, 50 | key="BCL2L2", keytype="SYMBOL", columns=c("PROBEID", "ENTREZID", "CHRLOC", 51 | "CHRLOCEND")) 52 | ``` 53 | 54 | Now we will obtain the probe sequences for one of these 55 | probe sets 56 | ```{r gets} 57 | library(dplyr) 58 | bs = hgu133plus2probe %>% 59 | filter(Probe.Set.Name == "209311_at") 60 | bs 61 | ``` 62 | 63 | # Matching the sequences to the reference genome 64 | 65 | First we convert the character data on probe sequence 66 | to Biostrings DNAStrings. 67 | ```{r conv} 68 | ss = bs[,"sequence"] 69 | library(Biostrings) 70 | sss = DNAStringSet(ss) 71 | ``` 72 | Obtain the reference sequence for chr14 73 | ```{r getbsg} 74 | library(BSgenome.Hsapiens.UCSC.hg19) 75 | c14 = Hsapiens$chr14 76 | c14 77 | ``` 78 | Biostrings can match fairly large numbers of 79 | sequences (called query sequences) to a 80 | subject sequence using the Aho-Corasick approach 81 | (see ?PDict for a reference.) 82 | ```{r dodi} 83 | pd = PDict(sss) 84 | pd 85 | ``` 86 | 87 | We now carry out the search. 88 | ```{r doma} 89 | mats = matchPDict(pd, c14) 90 | mats 91 | ``` 92 | 93 | # Are the probes in 3'UTR? 94 | 95 | We will use TranscriptDb to conclude this study. 
96 | ```{r getlk} 97 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 98 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene # shorten 99 | ``` 100 | For simplicity, we'll restrict attention to chr14. 101 | ```{r doac} 102 | ii = isActiveSeq(txdb) 103 | ii[] = FALSE 104 | ii["chr14"] = TRUE 105 | isActiveSeq(txdb) = ii 106 | utrs3 = threeUTRsByTranscript(txdb, use.names=TRUE) 107 | utrs3 108 | ``` 109 | 110 | We can structure the record of matches of probe sequences 111 | to reference genome as a GRanges: 112 | ```{r restr} 113 | mats = GRanges("chr14", unlist(mats)) 114 | fo = findOverlaps(mats, utrs3) 115 | table(subjectHits(fo)) 116 | ufo = unique(subjectHits(fo)) 117 | utrs3[ufo] 118 | ``` 119 | 120 | # Analogous work with Illumina probe sequences 121 | 122 | You can perform a similar check with illumina probes. 123 | ```{r lkl} 124 | library(lumiHumanAll.db) 125 | library(lumi) 126 | sel = AnnotationDbi::select(lumiHumanAll.db, key="BCL2L2", 127 | keytype="SYMBOL", columns="PROBEID") 128 | sel 129 | id2seq(sel) 130 | ``` 131 | 132 | Search and verify. 133 | 134 | # SNPs in probes? 135 | 136 | There is a slight complication because dbSNP 137 | uses an unusual chromosome naming convention. 138 | ```{r lksn} 139 | library(SNPlocs.Hsapiens.dbSNP.20120608) 140 | library(GenomeInfoDb) 141 | s14 = getSNPlocs("ch14", as.GRanges=TRUE) 142 | seqlevelsStyle(s14) = "UCSC" 143 | findOverlaps(s14, mats) 144 | ``` 145 | 146 | We find that there are population level polymorphisms 147 | within the sequences for two of the Affy probes. 148 | Under what conditions 149 | would this cause a problem for accurate expression quantification? 150 | What sorts of statistical methods could ameliorate this? 
151 | -------------------------------------------------------------------------------- /bioc/storage/using_limma_old_no_comments.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Using limma 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | 12 | # Using limma for microarray analysis 13 | 14 | Here we provide the code shown in the video 15 | ```{r} 16 | biocLite("SpikeInSubset") 17 | library(SpikeInSubset) 18 | data(rma95) 19 | library(genefilter) 20 | fac <- factor(rep(1:2,each=3)) 21 | tt <- rowttests(exprs(rma95),fac) 22 | mask <- with(tt, abs(dm) < .2 & p.value < .01) 23 | spike <- rownames(rma95) %in% colnames(pData(rma95)) 24 | cols <- ifelse(mask,"red",ifelse(spike,"dodgerblue","black")) 25 | 26 | with(tt, plot(-dm, -log10(p.value), cex=.8, pch=16, 27 | xlim=c(-1,1), ylim=c(0,5), 28 | xlab="difference in means", 29 | col=cols)) 30 | abline(h=2,v=c(-.2,.2), lty=2) 31 | 32 | tt$s <- apply(exprs(rma95), 1, function(row) sqrt(.5 * (var(row[1:3]) + var(row[4:6])))) 33 | with(tt, plot(s, -log10(p.value), cex=.8, pch=16, 34 | log="x",xlab="estimate of standard deviation", 35 | col=cols)) 36 | 37 | library(limma) 38 | fit <- lmFit(rma95, model.matrix(~ fac)) 39 | ebfit <- ebayes(fit) 40 | limmares <- data.frame(dm=coef(fit)[,"fac2"], p.value=ebfit$p.value[,"fac2"]) 41 | with(limmares, plot(dm, -log10(p.value),cex=.8, pch=16, 42 | col=cols,xlab="difference in means", 43 | xlim=c(-1,1), ylim=c(0,5))) 44 | abline(h=2,v=c(-.2,.2), lty=2) 45 | 46 | 47 | n <- 40 48 | qs <- seq(from=0,to=.2,length=n) 49 | idx <- sapply(seq_len(n),function(i) which(as.integer(cut(tt$s^2,qs)) == i)[1]) 50 | idx <- idx[!is.na(idx)] 51 | par(mar=c(5,5,2,2)) 52 | plot(1,1,xlim=c(0,.21),ylim=c(0,1),type="n", 53 | xlab="variance estimates",ylab="",yaxt="n") 54 | 
axis(2,at=c(.1,.9),c("before","after"),las=2) 55 | segments((tt$s^2)[idx],rep(.1,n), 56 | ebfit$s2.post[idx],rep(.9,n)) 57 | ``` 58 | -------------------------------------------------------------------------------- /bioc/tophat.md: -------------------------------------------------------------------------------- 1 | # Short video of mapping RNA-Seq reads 2 | 3 | Note that the commands used in this lab require that you have a lot of free disk space (the FASTQ files alone are 28 GB) and many cores available for running the alignment program. We do not expect students to replicate the commands in this video. We do not expect students to install the alignment software on their machines. Much of the software for processing NGS data is designed for Linux systems. Note that the case studies (in particular the variant discovery and genotyping case study) will go into more depth on using Linux for processing NGS data. 4 | 5 | The FASTQ file we are looking at in the beginning of this screencast was downloaded from: 6 | 7 | http://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1177756 8 | 9 | This is a human RNA-Seq sample from a [study](http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP032775) of naturally acquired immunity to malaria. 10 | 11 | We discuss the following software in the screencast: 12 | 13 | * fastq-dump from the [SRA toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software) 14 | * [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) 15 | * [Tophat2](http://ccb.jhu.edu/software/tophat/index.shtml) 16 | * [Samtools](http://samtools.sourceforge.net/) 17 | 18 | To extract the FASTQ files from the SRA file, we used the following line. The `--split-files` argument is used to extract two files for the two paired-ends of the fragments which were sequenced. 
19 | 20 | ``` 21 | fastq-dump --split-files SRR1177756.sra 22 | ``` 23 | 24 | The call for running Tophat2 was: 25 | 26 | ``` 27 | export BOWTIE2_INDEXES=/path/to/your/Bowtie2Index 28 | 29 | tophat2 -o SRR1177756_tophat_out -p 10 genome SRR1177756_1.fastq SRR1177756_2.fastq 30 | ``` 31 | 32 | To view the reads we used Samtools: 33 | 34 | ``` 35 | samtools view accepted_hits.bam | head -1000 | less 36 | ``` 37 | 38 | For demonstration purposes (you wouldn't necessarily repeat these lines in a typical workflow), we merged the mapped and unmapped reads into a single sorted file. For this we used the following calls: 39 | 40 | ``` 41 | samtools sort -n accepted_hits.bam accepted_hits_name_sorted 42 | samtools sort -n unmapped.bam unmapped_name_sorted 43 | samtools merge -n all_reads.bam accepted_hits_name_sorted.bam unmapped_name_sorted.bam 44 | ``` 45 | 46 | -------------------------------------------------------------------------------- /biocadv_6x/bioc2_ggbio.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Sketching the binding landscape over chromosomes with ggbio's karyogram layout" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | There are many interesting approaches to visualizing genome-scale data. 12 | Two major packages in Bioconductor are Gviz and ggbio. Both represent 13 | significant efforts at bridging the gap between graphics facilities 14 | and various genomic data structures. 15 | 16 | ggbio's `autoplot` method can be very useful for broad overviews. 17 | For a GRanges instance, each range for which data exists can be 18 | depicted as a band on the chromosome. The karyogram layout 19 | gives a genome-wide view, but it can be important to control 20 | the handling of extra-chromosomal sequence levels. 
21 | 22 | ```{r getl,echo=FALSE,results="hide"} 23 | suppressWarnings({ 24 | suppressPackageStartupMessages({ 25 | library(ERBS) 26 | library(GenomeInfoDb) 27 | library(ggbio) 28 | }) 29 | }) 30 | ``` 31 | 32 | Here is the layout for the liver cell line: 33 | ```{r lkd, fig=TRUE} 34 | library(ERBS) 35 | data(HepG2) 36 | library(GenomeInfoDb) # used below to drop non-standard sequences (unplaced/alternate scaffolds); sex chromosomes are kept, not only autosomes 37 | HepG2 = keepStandardChromosomes(HepG2) 38 | data(GM12878) 39 | GM12878 = keepStandardChromosomes(GM12878) 40 | library(ggbio) 41 | autoplot(HepG2, layout="karyogram", main="ESRRA binding on HepG2") 42 | ``` 43 | 44 | And for the B-cell line: 45 | 46 | ```{r lkm,fig=TRUE} 47 | autoplot(GM12878, layout="karyogram", main="ESRRA binding on GM12878") 48 | ``` 49 | -------------------------------------------------------------------------------- /biocadv_6x/bioc2_gvfeat.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Gviz for plotting data with genomic features" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | It is often of interest to display observed data in the 12 | context of genomic reference information. We'll examine how to 13 | do this with the ESRRA binding data and Gviz. 14 | 15 | First we load up relevant data and annotation packages along with 16 | Gviz. 
17 | 18 | ```{r getl,echo=FALSE,results="hide"} 19 | suppressWarnings({ 20 | suppressPackageStartupMessages({ 21 | library(ERBS) 22 | library(Gviz) 23 | library(Homo.sapiens) 24 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 25 | }) 26 | }) 27 | ``` 28 | ```{r getp} 29 | library(ERBS) 30 | library(Gviz) 31 | library(Homo.sapiens) 32 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 33 | txdb = TxDb.Hsapiens.UCSC.hg19.knownGene 34 | ``` 35 | 36 | ## Genes in the vicinity of ESRRA 37 | 38 | How can we identify a slice of the human genome containing 39 | ESRRA and some neighboring genes? There are various approaches; 40 | we'll start by obtaining the ENTREZ identifier. 41 | 42 | ```{r getid} 43 | library(Homo.sapiens) 44 | eid = select(Homo.sapiens, keys="ESRRA", keytype="SYMBOL", columns="ENTREZID") 45 | eid 46 | ``` 47 | 48 | Now we obtain the addresses for the ESRRA gene body, 49 | collect addresses of neighboring genes, and bind in the 50 | symbols for these genes. 51 | 52 | ```{r done} 53 | allg = genes(txdb) 54 | esrraAddr = genes(txdb, filter=list(gene_id=2101)) # redundant... 55 | esrraNeigh = subsetByOverlaps(allg, esrraAddr+500000) 56 | esrraNeigh$symbol = mapIds(Homo.sapiens, keys=esrraNeigh$gene_id, keytype="ENTREZID", 57 | column="SYMBOL") 58 | ``` 59 | 60 | A quick check on the task with Gviz: 61 | ```{r lknei,fig=TRUE} 62 | plotTracks(GeneRegionTrack(esrraNeigh, showId=TRUE)) 63 | ``` 64 | 65 | ## The ESRRA binding peaks in this region 66 | 67 | We obtain the ESRRA binding data for the GM12878 EBV-transformed 68 | B-cell and subset to events near our group of genes. 69 | ```{r gete} 70 | data(GM12878) 71 | sc = subsetByOverlaps(GM12878, range(esrraNeigh)) 72 | sc 73 | ``` 74 | 75 | ## Computing an ideogram to give context on the chromosome 76 | 77 | This computation is slow. 
78 | ```{r doid,cache=TRUE} 79 | idxTrack = IdeogramTrack(genome="hg19", chr="chr11") 80 | ``` 81 | 82 | ## Putting it all together 83 | 84 | We start at the top with the ideogram to identify chromosome and 85 | region on chromosome to which we are zooming with observational 86 | and structural information. 87 | 88 | ```{r dofull,fig=TRUE} 89 | plotTracks(list(idxTrack, GenomeAxisTrack(), 90 | DataTrack(sc[,7], name="ESRRA peak values"), 91 | GeneRegionTrack(esrraNeigh, showId=TRUE, 92 | name="genes near ESRRA"), GenomeAxisTrack())) 93 | ``` 94 | -------------------------------------------------------------------------------- /biocadv_6x/bioc2_hybstor.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Sharded GRanges: a hybrid in/out of memory strategy for large sets of ranges" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | suppressPackageStartupMessages({ 10 | library(Biobase) 11 | library(geuvStore2) 12 | library(gQTLBase) 13 | library(gQTLstats) 14 | library(foreach) 15 | library(doParallel) 16 | library(ph525x) 17 | }) 18 | ``` 19 | 20 | 21 | ## Introduction 22 | 23 | We've looked at a number of approaches to 24 | working with data external to R: 25 | 26 | * HDF5, which manages groups of multidimensional arrays on disk 27 | * sqlite, a zero-configuration relational database 28 | * tabix, a simple approach to indexing records on genomic coordinates 29 | 30 | Here I want to describe an approach that seems useful for millions 31 | of ranges annotated in the course of searching for variants that 32 | affect gene expression at the population level. The approach 33 | is based on a concept of storing data in "shards", homogeneous small 34 | fragments that can be quickly loaded and unloaded, discoverable 35 | by index and traversable in parallel. 
36 | 37 | ## Motivation: An integrative view of associations in GEUVADIS 38 | 39 | The [GEUVADIS study](http://www.nature.com/nature/journal/v501/n7468/full/nature12531.html) is an intensive multiomic study of gene expression in multiple 40 | populations. We want to make use of the data from this study to 41 | investigate variants affecting genes of interest, with one tool 42 | an interactive graphical utility illustrated in the video: 43 | 44 | ```{r lkgg,fig=TRUE} 45 | library(ph525x) 46 | ggshot() 47 | ``` 48 | 49 | We want to be able to select genes by symbol and explore names 50 | and epigenetic contexts of variants whose content is associated with 51 | expression variation. It is useful to have the variants annotated 52 | using GRanges, but a very large GRanges object (there are hundreds 53 | of millions of SNP-gene associations recorded) can be unwieldy. 54 | Solutions using RDBMS or HDF5 may be viable but more infrastructure 55 | for rapidly searching such stores using genomic coordinates, 56 | and for converting query results to GRanges will be needed. 57 | 58 | BatchJobs was used to generate the association tests, and it 59 | produces 60 | an organized system of "sharded" GRanges recording the 61 | associations along with metadata about the associated features. 62 | This system can be stored in a package, exemplified by geuvStore. 63 | 64 | ## A quick look at geuvStore 65 | 66 | The association test results are organized using a BatchJobs 67 | registry that is wrapped in an S4 class called ciseStore. 68 | ```{r lkgv} 69 | library(geuvStore2) 70 | m = makeGeuvStore2() 71 | class(m) 72 | m 73 | ``` 74 | 75 | The show method for m probes into the store and retrieves one record 76 | from one GRanges instance. 77 | 78 | ## Scalable traversal 79 | 80 | The traversal of all GRanges available in this selection is 81 | governed by foreach loops. 
82 | ```{r lksca, cache=TRUE} 83 | library(gQTLBase) 84 | ut1 = system.time(l1 <- storeApply(m, length)) 85 | ut1 86 | library(doParallel) 87 | registerDoParallel(cores=2) 88 | ut2 = system.time(l2 <- storeApply(m, length)) 89 | ut2 90 | print(sum(unlist(l2))) 91 | all.equal(unlist(l1), unlist(l2)) 92 | ``` 93 | We see that doubling the number of processors reduces the 94 | time required to get the length of each component of the archive. 95 | With large numbers of cores, we can quickly assemble information 96 | about many variants. 97 | 98 | ## Scalable histogram construction 99 | 100 | When the histogram bins are fixed, divide and conquer can be 101 | used to assemble a histogram in parallel over many chunks. 102 | 103 | ```{r lkhhh,cache=TRUE} 104 | registerDoParallel(cores=1) 105 | system.time(ll <- storeToHist(m, getter=function(x)log(mcols(x)$chisq+1), breaks=c(0,seq(.1,5,.1),10))) 106 | registerDoParallel(cores=2) 107 | system.time(ll <- storeToHist(m, getter=function(x)log(mcols(x)$chisq+1), breaks=c(0,seq(.1,5,.1),10))) 108 | ``` 109 | 110 | ## Indexing for targeted retrievals 111 | 112 | The ciseStore class includes two maps: one from range to shard number, 113 | another from gene identifier to shard number. This allows rapid 114 | retrievals. 115 | 116 | ```{r lkex} 117 | myr = GRanges(2, IRanges(1975.7e5, width=50000)) 118 | extractByRanges(m, myr) 119 | ``` 120 | 121 | ## Conclusions 122 | 123 | geuvStore2 is a complex architecture that aims to provide a 124 | partly baked representation of quantities from genome-scale 125 | surveys that can be scalably surveyed and integrated. This 126 | is accomplished by keeping ranges for association scores 127 | and metadata in small sharded GRanges with some simple indexes, 128 | retrieval utilities, and with support for parallelized traversal 129 | and summary. 
It would be very nice to achieve these aims with 130 | a more homogeneous underlying architecture such as HDF5, and 131 | this may be possible as file-backed SummarizedExperiments come 132 | on line. 133 | 134 | -------------------------------------------------------------------------------- /biocadv_6x/bioc2_ov.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Bioconductor for genome-scale data -- quick intro 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ```{r setup,echo=FALSE,results="hide"} 12 | suppressWarnings({ 13 | suppressPackageStartupMessages({ 14 | library(Biobase) 15 | library(GSE5859) 16 | library(annotate) 17 | library(BiocParallel) 18 | library(VariantAnnotation) 19 | library(BSgenome.Hsapiens.UCSC.hg19) 20 | }) 21 | }) 22 | ``` 23 | 24 | # PH525.6x: Basic premise and overview 25 | 26 | You know how to manipulate and analyze data using R, and 27 | you understand a considerable amount about statistical modeling. 28 | If you've taken PH525.5x, you've gotten significant background 29 | on current agendas in computational biology, and have learned 30 | how to deal with genomic data from the management, annotation, 31 | and analysis perspectives. 32 | 33 | In this course, we will use Bioconductor as the foundation 34 | of demonstrations and exercises in 35 | * methods for genome-scale data visualization including interactive graphics with the shiny and ggvis packages; 36 | * programming strategies for scalable bioinformatics with multicore and cluster computing infrastructure; 37 | * integrative management and analysis of multiassay experiments, with illustrations from The Cancer Genome Atlas (TCGA); 38 | * approaches to improving reproducibility of genome-scale analyses. 39 | 40 | One week will be devoted to each of these topics. 
41 | -------------------------------------------------------------------------------- /biocadv_6x/bioc2_rainfall.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "A view of genetic heterogeneity between and within cancer types" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | suppressPackageStartupMessages({ 10 | library(ph525x) 11 | library(RTCGAToolbox) 12 | }) 13 | ``` 14 | 15 | 16 | ## Introduction 17 | 18 | We will use data in the ph525x package on mutations in 19 | breast cancer and rectal adenocarcinoma to illustrate 20 | some issues in dealing with mutations data from TCGA. 21 | A basic objective is construction of a "rainfall plot". 22 | An example is Figure 6 from [Alexandrov et al. 2013](http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3776390&tool=pmcentrez&rendertype=abstract): 23 | 24 | ```{r lkkat,fig=TRUE,echo=FALSE} 25 | kataegis() 26 | ``` 27 | 28 | These plots include data from deeply sequenced individual tumors, 29 | and we'd like to understand how to construct them using 30 | tools from Bioconductor. 31 | 32 | ## The mutation data frames from RTCGAToolbox 33 | 34 | The `readMuts` data are from the 20150402 TCGA production. 
35 | ```{r lkread} 36 | library(ph525x) 37 | data(readMuts) 38 | dim(readMuts) 39 | data(brcaMuts) 40 | dim(brcaMuts) 41 | ``` 42 | 43 | ## Mutation types and their contents 44 | 45 | ```{r lkrmut} 46 | table(readMuts$Variant_Type) 47 | with(readMuts, head(Reference_Allele[Variant_Type=="DEL"])) 48 | ``` 49 | 50 | ## Tabulating substitution types 51 | 52 | The following function enumerates substitutions according to 53 | the [COSMIC convention](http://cancer.sanger.ac.uk/cosmic/signatures): 54 | "The profile of each signature is displayed using the six substitution subtypes: C>A, C>G, C>T, T>A, T>C, and T>G (all substitutions are referred to by the pyrimidine of the mutated Watson–Crick base pair)." 55 | 56 | ```{r dosubt} 57 | subt = function(ref, a1, a2) { # label each SNV as "ref>alt", read from the strand whose reference base is a pyrimidine (C or T) 58 | alt = ifelse(a1 != ref, a1, a2) # the tumor allele that differs from the reference 59 | comp = c(A = "T", C = "G", G = "C", T = "A") # Watson-Crick complements 60 | needsw = which(ref %in% c("A", "G")) # purine reference: report the change on the opposite strand 61 | ref[needsw] = unname(comp[ref[needsw]]) 62 | alt[needsw] = unname(comp[alt[needsw]]) 63 | paste(ref, alt, sep = ">") 64 | } 65 | with(readMuts[readMuts$Variant_Type=="SNP",], 66 | table(subt(Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2))) 67 | ``` 68 | 69 | Substitutions whose reference base is a purine are counted on the complementary strand (for example, G>A is tabulated as C>T), so only the six subtypes above appear. 70 | 71 | To define the colors used for substitutions: 72 | 73 | ```{r lkkac} 74 | ph525x:::kataColors 75 | ``` 76 | 77 | ## Total genomic distance 78 | 79 | The mutation locations reported are not particularly convenient for genome-wide 80 | plotting as the distances are all relative to chromosome start. 81 | The following hidden function computes total distance relative 82 | to start of chr1, assuming that the data are held in GRanges. 83 | ```{r lktg} 84 | ph525x:::totalgd 85 | ``` 86 | 87 | ## A demo plot for four tumors 88 | 89 | The rainfall function will organize the input data by sample, and 90 | samples can, in the present version, be selected according to 91 | their position in an ordering based on number of mutations reported. 92 | The default plots the sample with the greatest number of mutations. 
93 | The oind parameter allows selection of samples further down in the 94 | ordering. We embellish the plot with a simple kernel estimate 95 | of the density of mutations along the chromosomes. The 96 | function invisibly returns a list of items related to the plot. 97 | 98 | ```{r do4f,fig=TRUE,fig.height=8} 99 | rainouts = list() 100 | par(mfrow=c(4,1),mar=c(4,5,1,1)) 101 | for (i in 1:4) rainouts[[i]] = rainfall(readMuts, oind=i) 102 | ``` 103 | 104 | ```{r lkrao} 105 | str(rainouts[[1]]) 106 | ``` 107 | 108 | 109 | -------------------------------------------------------------------------------- /biocadv_6x/finalViz.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Some final comments on genome-scale visualization" 4 | --- 5 | 6 | ```{r options, echo=FALSE, message=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | suppressPackageStartupMessages({ 10 | suppressWarnings({ 11 | library(BiocStyle) 12 | library(Biobase) 13 | library(randomForest) 14 | library(MLInterfaces) 15 | library(tissuesGeneExpression) 16 | library(limma) 17 | library(ph525x) 18 | library(RCircos) 19 | }) 20 | }) 21 | ``` 22 | 23 | 24 | ## RCircos 25 | 26 | RCircos is not distributed in Bioconductor, but can 27 | be useful for developing compact displays of interactions 28 | among genomic elements. I am unaware of any interfaces between 29 | Bioconductor data classes and RCircos, and this topic deserves 30 | attention. 31 | 32 | In the ph525x package we have added a selection of trans-eQTL 33 | findings from Westra et al. Nature Genetics 2013 (doi: 10.1038/ng.2756). 
34 | We show a few SNP-gene associations from this study: 35 | ```{r lksn,fig=TRUE,message=FALSE,fig.height=9,fig.width=9} 36 | library(ph525x) 37 | data(westraTransSel) 38 | westraTransSel[1:3] 39 | sglToCircos(westraTransSel[1:5]) 40 | ``` 41 | 42 | ## ComplexHeatmap 43 | 44 | `r Biocpkg("ComplexHeatmap")` has a very nice vignette addressing many 45 | issues in combining heatmaps and repurposing the heatmap 46 | concept. The oncoprint example in the vignette is particularly 47 | compelling. To use this interactively with TCGA, contact 48 | [the ISB](http://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/FAQ.html) and obtain a cloud platform account. 49 | Then obtain the `r Biocpkg("cgcR")` 50 | package, load it, and run `isbApp()`. You will have to authenticate 51 | with Google to get access to the BigQuery representation of TCGA. 52 | 53 | ## WebGL and interaction with data 54 | 55 | In the short concluding video we use the MLInterfaces plspinHcube 56 | function to illustrate several aspects of interactivity: GUI for 57 | tuning, mouse-controlled rotation, and mouseover for point interrogation. 58 | 59 | ## EpiViz 60 | 61 | The `r Biocpkg("epivizr")` package interacts with the 62 | [epiviz](https://epiviz.github.io/) system and is capable of substantial feats of data integration and 63 | higher-level data interactivity. 
64 | -------------------------------------------------------------------------------- /biocadv_6x/multiOOM.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Benchmarking multiple out-of-memory strategies" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | suppressPackageStartupMessages({ 10 | suppressWarnings({ 11 | library(ph525x) 12 | library(microbenchmark) 13 | library(bigmemory) 14 | library(rhdf5) 15 | library(RSQLite) 16 | }) 17 | }) 18 | ``` 19 | 20 | 21 | ## Introduction 22 | 23 | In many large-data situations, it is impractical to load and retain 24 | data in R's working memory space. We have had a look at 25 | HDF5, SQLite and tabix-indexed text as possible solutions 26 | to problems arising with memory constraints. We'll call 27 | these "out-of-memory" (OOM) approaches 28 | 29 | How can we obtain data on which approach will be most effective 30 | for a given task? Comparative benchmarking is a very useful skill and 31 | we give a very rudimentary account of this here. 32 | 33 | ## The harness 34 | 35 | It is common to speak of a program that drives other programs 36 | as a "harness" (see [wikipedia](https://en.wikipedia.org/wiki/Test_harness) 37 | for related discussion). We have such a program in ph525x: 38 | 39 | ```{r lkph} 40 | benchOOM 41 | ``` 42 | 43 | This program is going to help us assess performance of various 44 | OOM approaches. We consider a very limited problem, that of 45 | managing data that could reside in an R matrix. 46 | The main parameters are 47 | 48 | * `NR` and `NC`: row and column dimensions 49 | * `times`: number of benchmark replications for averaging 50 | * `inseed`: a seed for random number generation to ensure reproducibility 51 | * `methods`: a list of methods 52 | 53 | The `methods` parameter is most complex. 
Each element of the list 54 | is assumed to be a function with the matrix to 55 | be managed via OOM as the first argument, some additional 56 | parameters, and a parameter `intimes` that gives the number 57 | of benchmark replicates. 58 | 59 | Our objective is to produce a table that looks like 60 | 61 | ``` 62 | > b1 63 | NR NC times meth wr ingFull ing1K 64 | 1 5000 100 5 hdf5 10.71714 9.4100810 14.2984402 65 | 2 5000 100 5 ff 25.34365 63.0977338 4.4320688 66 | 3 5000 100 5 sqlite 174.89003 105.1254638 28.4717496 67 | 4 5000 100 5 data.table 49.35190 7.9871552 13.9007588 68 | 5 5000 100 5 bigmemory 23.39697 0.9660878 0.9950034 69 | ``` 70 | 71 | where each method listed in `meth` is asked to perform the same 72 | task a fixed number of times for averaging. The construction of 73 | the table occurs by binding together metadata about the task and 74 | method to the result of `getStats`. We'll leave the details 75 | of `getStats` to independent investigation. 76 | 77 | 78 | ## An example method (OOM benchmarker) 79 | 80 | Let's look at the method for HDF5: 81 | ```{r lkme} 82 | ph525x:::.h5RoundTrip 83 | ``` 84 | 85 | The program has three main phases 86 | 87 | * HDF5-related setup, cleaning out any previous archives and establishing 88 | the basic target file 89 | * Benchmarking of data export via `h5write` 90 | * Benchmarking of ingestion via `h5read` with various restrictions 91 | 92 | The results of `microbenchmark` are assembled in a list. 
93 | -------------------------------------------------------------------------------- /biocintro_5x/bioc1_align.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Notes on video of mapping RNA-seq reads" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | # Short video of mapping RNA-Seq reads 12 | 13 | Note that the commands used in this lab require that you have a lot of free disk space (the FASTQ files alone are 28 GB) and many cores available for running the alignment program. We do not expect students to replicate the commands in this video. We do not expect students to install the alignment software on their machines. Much of the software for processing NGS data is designed for Linux systems. Note that the case studies (in particular the variant discovery and genotyping case study) will go into more depth on using Linux for processing NGS data. 14 | 15 | The FASTQ file we are looking at in the beginning of this screencast was downloaded from: 16 | 17 | http://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1177756 18 | 19 | This is a human RNA-Seq sample from a [study](http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP032775) of naturally acquired immunity to malaria. 20 | 21 | We discuss the following software in the screencast: 22 | 23 | * fastq-dump from the [SRA toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software) 24 | * [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) 25 | * [Tophat2](http://ccb.jhu.edu/software/tophat/index.shtml) 26 | * [Samtools](http://samtools.sourceforge.net/) 27 | 28 | To extract the FASTQ files from the SRA file, we used the following line. The `--split-files` argument is used to extract two files for the two paired-ends of the fragments which were sequenced. 
29 | 30 | ``` 31 | fastq-dump --split-files SRR1177756.sra 32 | ``` 33 | 34 | The call for running Tophat2 was: 35 | 36 | ``` 37 | export BOWTIE2_INDEXES=/path/to/your/Bowtie2Index 38 | 39 | tophat2 -o SRR1177756_tophat_out -p 10 genome SRR1177756_1.fastq SRR1177756_2.fastq 40 | ``` 41 | 42 | To view the reads we used Samtools: 43 | 44 | ``` 45 | samtools view accepted_hits.bam | head -1000 | less 46 | ``` 47 | 48 | For demonstration purposes (you wouldn't necessarily repeat these lines in a typical workflow), we merged the mapped and unmapped reads into a single sorted file. For this we used the following calls: 49 | 50 | ``` 51 | samtools sort -n accepted_hits.bam accepted_hits_name_sorted 52 | samtools sort -n unmapped.bam unmapped_name_sorted 53 | samtools merge -n all_reads.bam accepted_hits_name_sorted.bam unmapped_name_sorted.bam 54 | ``` 55 | 56 | -------------------------------------------------------------------------------- /biocintro_5x/bioc1_annoCheat.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Genomic annotation in Bioconductor: Cheat sheet" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | # Summarizing the key genome annotation resources in Bioconductor 16 | 17 | ## Executive summary 18 | 19 | ### Organism-oriented annotation 20 | 21 | For biological annotation, generally sequence or gene based, there 22 | are three key types of package 23 | 24 | * Reference sequence packages: BSgenome.[Organism].[Curator].[BuildID] 25 | * Gene model database packages: TxDb.[Organism].[Curator].[BuildID].[Catalog], 26 | and, EnsDb.[Organism].[version], for Ensembl-derived annotation 27 | * Annotation map package: org.[Organism2let].[Institution].db 28 | 
29 | wherever brackets are used, you must substitute an appropriate token. 30 | You can survey all annotation packages at [the annotation page](http://bioconductor.org/packages/release/BiocViews.html#___AnnotationData). 31 | 32 | Packages Homo.sapiens, Mus.musculus and Rattus.norvegicus are specialized 33 | integrative annotation resources with an evolving interface. 34 | 35 | ### Systems biology oriented annotation 36 | 37 | Packages GO.db, KEGG.db, KEGGREST, and reactome.db are primarily 38 | intended as organism-independent resources organizing genes into 39 | groups. However, there are organism-specific mappings between 40 | gene-oriented annotation and these resources, that involve specific 41 | abbreviations and symbol conventions. These are described 42 | when these packages are used. 43 | 44 | ## Names for organisms and their abbreviations 45 | 46 | The standard Linnaean taxonomy is used very generally. So you 47 | need to know that 48 | 49 | * Human = *Homo sapiens* 50 | * Mouse = *Mus musculus* 51 | * Rat = *Rattus norvegicus* 52 | * Yeast = *Saccharomyces cerevisiae* 53 | * Zebrafish = *Danio rerio* 54 | * Cow = *Bos taurus* 55 | 56 | and so on. We use two sorts of abbreviations. For 57 | Biostrings-based packages, the contraction of first 58 | and second names is used 59 | 60 | * Human = Hsapiens 61 | * Mouse = Mmusculus 62 | * Rat = Rnorvegicus 63 | * Yeast = Scerevisiae ... 64 | 65 | For NCBI-based annotation maps, we contract further 66 | 67 | * Human = Hs 68 | * Mouse = Mm 69 | * Rat = Rn 70 | * Yeast = Sc ... 
71 | 72 | ## Genomic sequence 73 | 74 | These packages have four-component names that specify the reference build used 75 | 76 | * Human = BSgenome.Hsapiens.UCSC.hg19 77 | * Mouse = BSgenome.Mmusculus.UCSC.mm10 78 | * Rat = BSgenome.Rnorvegicus.UCSC.rn5 79 | * Yeast = BSgenome.Scerevisiae.UCSC.sacCer3 80 | 81 | ## Gene models 82 | 83 | These packages have five-component names that specify the reference build used and 84 | the gene catalog 85 | 86 | * Human = TxDb.Hsapiens.UCSC.hg19.knownGene 87 | * Mouse = TxDb.Mmusculus.UCSC.mm10.knownGene 88 | * Rat = TxDb.Rnorvegicus.UCSC.rn5.knownGene 89 | * Yeast = TxDb.Scerevisiae.UCSC.sacCer3.sgdGene 90 | 91 | Additional packages that are relevant are 92 | 93 | * Human = TxDb.Hsapiens.UCSC.hg38.knownGene 94 | * Human = EnsDb.Hsapiens.v75 -- related to hg19/GRCh37 95 | 96 | ## Annotation maps 97 | 98 | These packages have four component names, with two components fixed. The 99 | variable components indicate organism and curating institution. 100 | 101 | * Human = org.Hs.eg.db 102 | * Mouse = org.Mm.eg.db 103 | * Rat = org.Rn.eg.db 104 | * Yeast = org.Sc.sgd.db 105 | 106 | ## Additional options 107 | 108 | There are often alternative curating institutions available such as 109 | Ensembl. 
110 | -------------------------------------------------------------------------------- /biocintro_5x/bioc1_grangeOps.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "GRanges operations related to gene model, TSS, and promoter region identification" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | layout: page 9 | toc: yes 10 | --- 11 | 12 | ```{r options, echo=FALSE} 13 | library(knitr) 14 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 15 | ``` 16 | 17 | 18 | 19 | ```{r setup,echo=FALSE,results="hide"} 20 | suppressPackageStartupMessages({ 21 | library(BSgenome.Hsapiens.UCSC.hg19) 22 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 23 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 24 | library(Biostrings) 25 | library(GenomicRanges) 26 | library(IRanges) 27 | library(ph525x) 28 | library(Homo.sapiens) 29 | library(Gviz) 30 | }) 31 | ``` 32 | # Overview 33 | 34 | In this document we work with a small set of ranges and 35 | illustrate basic inter-range operations reduce, disjoin, gaps. 36 | We then add strand and seqname information and show how 37 | resize and flank are useful for identifying TSS and promoter regions. 38 | 39 | ## A simple set of ranges 40 | 41 | ```{r newr} 42 | ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40), 43 | width = c(12, 6, 6, 15, 6, 2, 7)) 44 | ``` 45 | 46 | ```{r plotr,echo=FALSE} 47 | plotRanges <- function(x, xlim = x, main = deparse(substitute(x)), 48 | col = "black", sep = 0.5, ...) 49 | { # draw each range of x as a rectangle, one row per bin returned by disjointBins() 50 | height <- 1 51 | if (is(xlim, "Ranges")) # xlim supplied as ranges: use their overall extent 52 | xlim <- c(min(start(xlim)), max(end(xlim))) 53 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) # ends padded by 1, presumably so abutting ranges are placed in different rows 54 | plot.new() 55 | plot.window(xlim, c(0, max(bins)*(height + sep))) 56 | ybottom <- bins * (sep + height) - height 57 | rect(start(x)-0.5, ybottom, end(x)+0.5, ybottom + height, col = col, ...) 
58 | title(main) 59 | axis(1) 60 | } 61 | 62 | plotGRanges = function (x, xlim = x, col = "black", sep = 0.5, xlimits = c(0, 63 | 60), ...) 64 | { # same drawing as plotRanges for a GRanges; the first seqname becomes the x-axis label 65 | main = deparse(substitute(x)) # capture the caller's expression for the title before x is reassigned 66 | ch = as.character(seqnames(x)[1]) 67 | x = ranges(x) # from here on x holds only the ranges; strand is not consulted below 68 | height <- 1 69 | if (is(xlim, "Ranges")) 70 | xlim <- c(min(start(xlim)), max(end(xlim))) 71 | bins <- disjointBins(IRanges(start(x), end(x) + 1)) 72 | plot.new() 73 | plot.window(xlim = xlimits, c(0, max(bins) * (height + sep))) # NOTE(review): the x-axis extent comes from xlimits; the xlim computed above is never used 74 | ybottom <- bins * (sep + height) - height 75 | rect(start(x) - 0.5, ybottom, end(x) + 0.5, ybottom + height, 76 | col = col, ...) 77 | title(main, xlab = ch) 78 | axis(1) 79 | } 80 | ``` 81 | 82 | Let's visualize `ir` and several inter-range operations. 83 | ```{r lkir,fig=TRUE, out.height="800px"} 84 | par(mfrow=c(4,1), mar=c(4,2,2,2)) 85 | plotRanges(ir, xlim=c(0,60)) 86 | plotRanges(reduce(ir), xlim=c(0,60)) 87 | plotRanges(disjoin(ir), xlim=c(0,60)) 88 | plotRanges(gaps(ir), xlim=c(0,60)) 89 | ``` 90 | 91 | reduce(x) produces a set of 92 | nonoverlapping ranges that cover all positions covered by x. 93 | This can be used to reduce complexity of a gene model 94 | with many transcripts, where we may just want the addresses 95 | of intervals known to be transcribed, regardless of transcript 96 | of residence. 97 | 98 | disjoin(x) produces a set of ranges that cover all positions 99 | covered by x, such that none of the ranges in the 100 | disjoin output overlaps any end points of intervals in x. 101 | This gives us the largest possible collection of contiguous 102 | intervals that are separated wherever the original set 103 | of intervals had an endpoint. 104 | 105 | gaps(x) produces a set of ranges covering the positions 106 | in [min(start(x)), max(end(x))] that are not covered by any range in x. 107 | Given coding sequence addresses and exon intervals, this can 108 | be used to enumerate introns. 109 | 110 | # Extension to GRanges 111 | 112 | We add chromosome and strand information. 
113 | 114 | ```{r dogr} 115 | library(GenomicRanges) 116 | gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3))) 117 | ``` 118 | 119 | Let's assume the intervals represent genes. 120 | The following plots illustrate the identification of 121 | transcription start sites (green), upstream promoter 122 | regions (purple), downstream promoter regions (brown). 123 | 124 | ```{r dopr,fig=TRUE, out.height="800px", out.width="500px"} 125 | par(mfrow=c(4,1), mar=c(4,2,2,2)) 126 | plotGRanges(gir, xlim=c(0,60)) 127 | plotGRanges(resize(gir,1), xlim=c(0,60),col="green") 128 | plotGRanges(flank(gir,3), xlim=c(0,60), col="purple") 129 | plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown") 130 | ``` 131 | 132 | Note that we do not need to take special steps to 133 | deal with the differences in strand. 134 | -------------------------------------------------------------------------------- /biocintro_5x/bioc1_liftOver.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Translating addresses between genome builds" 3 | author: "Vince" 4 | date: "March 19, 2015" 5 | output: html_document 6 | layout: page 7 | toc: yes 8 | --- 9 | 10 | ```{r options, echo=FALSE} 11 | library(knitr) 12 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 13 | ``` 14 | 15 | 16 | ```{r setup,echo=FALSE,results="hide"} 17 | suppressWarnings({ 18 | suppressMessages({ 19 | suppressPackageStartupMessages({ 20 | library(BSgenome.Hsapiens.UCSC.hg19) 21 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 22 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 23 | library(Biostrings) 24 | library(GenomicRanges) 25 | library(IRanges) 26 | library(ph525x) 27 | library(Homo.sapiens) 28 | library(rtracklayer) 29 | }) 30 | }) 31 | }) 32 | ``` 33 | 34 | # Translating addresses between genome builds: liftOver 35 | 36 | The rtracklayer package includes an interface to the 37 | liftOver 
utilities developed for the UCSC genome browser. 38 | The idea is that a collection of local alignments 39 | can be defined and used to remap coordinates from 40 | one reference build to another. 41 | 42 | We can illustrate this with gene addresses created for hg38, 43 | the current reference build. We want to translate them 44 | for comparison to addresses asserted for hg19. 45 | 46 | ## Acquiring a chain file 47 | 48 | Address translation between reference builds can be specified 49 | using a [chain format file](https://genome.ucsc.edu/goldenpath/help/chain.html). Two ways of getting the chain file are: 50 | 51 | ### Direct manual acquisition 52 | 53 | You can 54 | get it from the following URL, and use gunzip on your 55 | system to uncompress in your home dir, if you would 56 | like to emulate the commands below. 57 | 58 | "ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz" 59 | 60 | ### Acquisition through AnnotationHub 61 | 62 | This is fully programmatic but may involve acquiring and caching 63 | a metadata database with the AnnotationHub package. 64 | 65 | ```{r doviaah} 66 | library(AnnotationHub) 67 | ah = AnnotationHub() 68 | q1 = query(ah, c("chain")) # list all resources with 'chain' in metadata 69 | q1 70 | q2 = query(ah, c("chain", "hg38ToHg19")) # the one we want 71 | ch = ah[[names(q2)]] 72 | ``` 73 | 74 | ```{r domyimport} 75 | library(rtracklayer) 76 | # following only if you do not use AnnotationHub 77 | # ch = import.chain("~/hg38ToHg19.over.chain") 78 | ch 79 | str(ch[[1]]) 80 | ``` 81 | 82 | Let's get the addresses for genes on chromosome 1 83 | in hg38. 84 | 85 | ```{r get38} 86 | library(TxDb.Hsapiens.UCSC.hg38.knownGene) 87 | tx38 = TxDb.Hsapiens.UCSC.hg38.knownGene 88 | seqlevels(tx38) = "chr1" 89 | g1_38 = genes(tx38) 90 | ``` 91 | 92 | Now execute the liftOver: 93 | 94 | ```{r doli} 95 | g1_19L = liftOver(g1_38, ch) 96 | ``` 97 | 98 | The result is a list of GRanges, one for 99 | each translation event. 
100 | 101 | ```{r lktx} 102 | g1_19L 103 | ``` 104 | 105 | Verification of accuracy of translation is covered in exercises. 106 | -------------------------------------------------------------------------------- /biocintro_5x/bioc1_t_mult.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Inference: t-tests, multiple comparisons" 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | 12 | # Introduction 13 | 14 | In the previous section, we focused on a pair of genes to 15 | illustrate two aspects of variation. One of the genes appeared to 16 | have high between-mouse variation that was hidden in the act 17 | of pooling samples within strain. When strains were compared on 18 | the basis of the pooled data, there was an appearance of a significant 19 | strain 20 | effect for this gene ($p < 10^{-6}$), but when individual-level data were used to 21 | perform the comparison, the strain effect was found to be very 22 | weak at best ($p = 0.089$). The lesson is to recognize that the 23 | most scientifically compelling questions concern biological variation, 24 | which can only be directly measured with good experimental design. Accurate 25 | interpretation of origin and size of biological variation requires 26 | appropriate statistical analysis. 27 | 28 | In this section we will cover inference in the context of genome-scale experiments. 
There are several serious conceptual problems: 29 | 30 | - there are many tests, often at least one test for each one of tens of thousands of features 31 | - each feature (typically a gene) exhibits its own technical and biological variability 32 | - there may be unmeasured or unreported sources of biological variation (such as time of day) 33 | - many features are inherently interrelated, so the tests are not independent 34 | 35 | We will apply some of the concepts we have covered in previous 36 | sections including t-tests and multiple comparisons; later we will 37 | compute standard deviation estimates from hierarchical models. 38 | 39 | We start by loading the pooling experiment data 40 | 41 | 42 | ```{r,message=FALSE} 43 | library(Biobase) 44 | library(maPooling) 45 | data(maPooling) 46 | pd=pData(maPooling) 47 | individuals=which(rowSums(pd)==1) 48 | ``` 49 | 50 | And extracting the individual mice as well as their strain 51 | 52 | ```{r} 53 | individuals=which(rowSums(pd)==1) 54 | individuals=individuals[-grep("tr",names(individuals))] 55 | y=exprs(maPooling)[,individuals] 56 | g=factor(as.numeric(grepl("b",names(individuals)))) 57 | ``` 58 | 59 | 60 | 61 | # T-tests 62 | 63 | We can now apply a t-test to each gene using the `rowttests` function in the `genefilter` package 64 | 65 | ```{r} 66 | library(genefilter) 67 | tt=rowttests(y,g) 68 | ``` 69 | 70 | 71 | Now which genes do we report as statistically significant? For somewhat arbitrary reasons, in science p-values of 0.01 and 0.05 are used as cutoffs. In this particular example we get 72 | 73 | ```{r} 74 | NsigAt01 = sum(tt$p.value<0.01) 75 | NsigAt01 76 | NsigAt05 = sum(tt$p.value<0.05) 77 | NsigAt05 78 | ``` 79 | 80 | 81 | 82 | # Multiple testing 83 | We described multiple testing in detail [in course 3](http://genomicsclass.github.io/book/pages/multiple_testing.html). Here we provide a quick summary. 84 | 85 | Do we report all the nominally significant 86 | genes identified above? 
Let's explore what happens if we split the first group into two, forcing the null hypothesis to be true 87 | 88 | ```{r} 89 | set.seed(0) 90 | shuffledIndex <- factor(sample(c(0,1),sum(g==0),replace=TRUE )) 91 | nulltt <- rowttests(y[,g==0],shuffledIndex) 92 | NfalselySigAt01 = sum(nulltt$p.value<0.01) 93 | NfalselySigAt01 94 | NfalselySigAt05 = sum(nulltt$p.value<0.05) 95 | NfalselySigAt05 96 | ``` 97 | 98 | 99 | 100 | If we use the 0.05 cutoff we will be reporting `r NfalselySigAt05` false positives. We have described several ways to adjust for this including the `qvalue` method available in the `r Biocpkg("qvalue")` package. After this adjustment we acquire 101 | a smaller list of genes. 102 | 103 | ```{r} 104 | library(qvalue) 105 | qvals = qvalue(tt$p.value)$qvalue 106 | sum(qvals<0.05) 107 | sum(qvals<0.01) 108 | ``` 109 | 110 | 111 | And now the null case generates no false positives: 112 | 113 | ```{r} 114 | library(qvalue) 115 | nullqvals = qvalue(nulltt$p.value)$qvalue 116 | sum(nullqvals<0.05) 117 | sum(nullqvals<0.01) 118 | ``` 119 | 120 | This addresses in a fairly general way the problem of inflating 121 | significance claims when performing many hypothesis tests at 122 | a fixed nominal level of significance. 123 | -------------------------------------------------------------------------------- /biocintro_5x/optalign.Rmd: -------------------------------------------------------------------------------- 1 | # Short video of mapping RNA-Seq reads 2 | 3 | Note that the commands used in this lab require that you have a lot of free disk space (the FASTQ files alone are 28 GB) and many cores available for running the alignment program. We do not expect students to replicate the commands in this video. We do not expect students to install the alignment software on their machines. Much of the software for processing NGS data is designed for Linux systems. 
Note that the case studies (in particular the variant discovery and genotyping case study) will go into more depth on using Linux for processing NGS data. 4 | 5 | The FASTQ file we are looking at in the beginning of this screencast was downloaded from: 6 | 7 | http://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1177756 8 | 9 | This is a human RNA-Seq sample from a [study](http://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP032775) of naturally acquired immunity to malaria. 10 | 11 | We discuss the following software in the screencast: 12 | 13 | * fastq-dump from the [SRA toolkit](http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?cmd=show&f=software&m=software&s=software) 14 | * [Bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) 15 | * [Tophat2](http://ccb.jhu.edu/software/tophat/index.shtml) 16 | * [Samtools](http://samtools.sourceforge.net/) 17 | 18 | To extract the FASTQ files from the SRA file, we used the following line. The `--split-files` argument is used to extract two files for the two paired-ends of the fragments which were sequenced. 19 | 20 | ``` 21 | fastq-dump --split-files SRR1177756.sra 22 | ``` 23 | 24 | The call for running Tophat2 was: 25 | 26 | ``` 27 | export BOWTIE2_INDEXES=/path/to/your/Bowtie2Index 28 | 29 | tophat2 -o SRR1177756_tophat_out -p 10 genome SRR1177756_1.fastq SRR1177756_2.fastq 30 | ``` 31 | 32 | To view the reads we used Samtools: 33 | 34 | ``` 35 | samtools view accepted_hits.bam | head -1000 | less 36 | ``` 37 | 38 | For demonstration purposes (you wouldn't necessarily repeat these lines in a typical workflow), we merged the mapped and unmapped reads into a single sorted file. 
For this we used the following calls: 39 | 40 | ``` 41 | samtools sort -n accepted_hits.bam accepted_hits_name_sorted 42 | samtools sort -n unmapped.bam unmapped_name_sorted 43 | samtools merge -n all_reads.bam accepted_hits_name_sorted.bam unmapped_name_sorted.bam 44 | ``` 45 | 46 | -------------------------------------------------------------------------------- /chipseq/ChIPseq_quiz.R: -------------------------------------------------------------------------------- 1 | library(DiffBind) 2 | setwd(system.file("extra", package="DiffBind")) 3 | read.csv("tamoxifen.csv") 4 | list.files("peaks") 5 | ta <- dba(sampleSheet="tamoxifen.csv") 6 | head(ta$peaks[[1]]) 7 | 8 | pks <- GRanges(ta$peaks[[1]]$V1, 9 | IRanges(ta$peaks[[1]]$V2, 10 | ta$peaks[[1]]$V3)) 11 | 12 | table(seqnames(pks)) 13 | 14 | # find the distances between peaks 15 | plot(start(pks)) 16 | dists <- start(pks)[-1] - start(pks)[-length(pks)] 17 | max(dists) 18 | which.max(dists) 19 | abline(h=start(pks)[which.max(dists)]) 20 | abline(h=start(pks)[which.max(dists) + 1]) 21 | 22 | 23 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 24 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 25 | g <- genes(txdb) 26 | 27 | # find the gene which peak 500 is within 28 | idx <- which(g %over% pks[500]) 29 | g[idx] 30 | pks[500] 31 | 32 | # find the nearest gene to peak 33 | idx <- nearest(pks[475], g) 34 | pks[475] 35 | g[idx] 36 | distance(pks[475], g[idx]) 37 | 38 | # same as 39 | distanceToNearest(pks[475], g) 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /chipseq/MACS.txt: -------------------------------------------------------------------------------- 1 | from GEO, download a ChIP-seq experiment plus an Input experiment in the same tissue 2 | 3 | "Differential oestrogen receptor binding is associated with clinical outcome in breast cancer" 4 | 5 | http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3272464/ 6 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE32222 7 
| 8 | Chromatin IP against ER MCF-7.3 9 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM798425 10 | ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX%2FSRX371%2FSRX371469/SRR1021789/SRR1021789.sra 11 | 12 | Input DNA MCF-7_Input 13 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM798440 14 | ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX%2FSRX371%2FSRX371484/SRR1021804/SRR1021804.sra 15 | 16 | 17 | Extracting FASTQ from SRA using sratoolkit.2.3.3-4: 18 | 19 | fastq-dump SRR1021789.sra 20 | fastq-dump SRR1021804.sra 21 | 22 | Aligning FASTQ using bowtie2-2.1.0: 23 | 24 | bowtie2 -p 10 -x /path/to/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/genome SRR1021789.fastq -S SRR1021789.sam 25 | bowtie2 -p 10 -x /path/to/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/genome SRR1021804.fastq -S SRR1021804.sam 26 | 27 | Running MACS using MACS-2.0.10.20130306: 28 | 29 | macs2 callpeak -t SRR1021789.sam -c SRR1021804.sam -f SAM -g hs -n estrogen 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /example.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "This is the title" 3 | layout: page 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | # First section 12 | 13 | An introduction 14 | 15 | 16 | 17 | 18 | 19 | 20 | # Random normals 21 | 22 | Here are 10 random normals: 23 | 24 | ```{r} 25 | rnorm(10) 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /footnotes.R: -------------------------------------------------------------------------------- 1 | out <- c("---", "layout: page", 2 | "title: Footnotes for Data Analysis for Genomics", 3 | "---","") 4 | dirs <- list.files(".","course*") 5 | for (dir in dirs) { 6 | files <- 
list.files(dir, "*.Rmd") 7 | out <- c(out, paste0("# Course ", sub("course(.*)","\\1",dir)), "") 8 | for (file in files) { 9 | lines <- readLines(paste0(dir,"/",file)) 10 | if (sum(grepl("## Footnotes",lines)) == 0) next 11 | title <- sub("title: (.*)","\\1",grep("title:",lines,value=TRUE)[1]) 12 | cat("writing:",dir,"/",title,"\n") 13 | footidx <- grep("## Footnotes", lines) 14 | footnotes <- lines[(footidx+1):length(lines)] 15 | footnotes <- footnotes[footnotes != ""] 16 | footnotes.spaced <- character(2*length(footnotes)) 17 | footnotes.spaced[2 * seq_along(footnotes) - 1] <- footnotes 18 | out <- c(out, paste0("## ",title), footnotes.spaced,"","----","") 19 | } 20 | } 21 | writeLines(out, con="footnotes.md") 22 | -------------------------------------------------------------------------------- /highdim/images/handmade/Heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/Heatmap.png -------------------------------------------------------------------------------- /highdim/images/handmade/SVD1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/SVD1.png -------------------------------------------------------------------------------- /highdim/images/handmade/SVD2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/SVD2.png -------------------------------------------------------------------------------- /highdim/images/handmade/animals.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/genomicsclass/labs/970015761a00d84868a6fb3960fb0a7dd5975cb4/highdim/images/handmade/animals.png -------------------------------------------------------------------------------- /highdim/rotations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Rotations 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | 12 | ## Rotations 13 | 14 | One of the most useful applications of projections relates to coordinate rotations. In data analysis, simple rotations can result in easier to visualize and interpret data. We will describe the mathematics behind rotations and give some data analysis examples. 15 | 16 | In our previous section, we used the following example: 17 | 18 | $$ 19 | Y = \begin{pmatrix} 2 \\ 20 | 3 21 | \end{pmatrix} 22 | = 23 | 2 24 | \begin{pmatrix} 1\\ 25 | 0 26 | \end{pmatrix} + 27 | 3 28 | \begin{pmatrix} 0\\ 29 | 1 30 | \end{pmatrix} 31 | $$ 32 | 33 | and noted that $2$ and $3$ are the _coordinates_. 
34 | 35 | 36 | ```{r,fig.cap="Plot of (2,3) as coordinates along Dimension 1 (1,0) and Dimension 2 (0,1)."} 37 | library(rafalib) 38 | mypar() 39 | plot(c(-2,4),c(-2,4),xlab="Dimension 1",ylab="Dimension 2", 40 | type="n",xaxt="n",yaxt="n",bty="n") 41 | text(rep(0,6),c(c(-2,-1),c(1:4)),as.character(c(c(-2,-1),c(1:4))),pos=2) 42 | text(c(c(-2,-1),c(1:4)),rep(0,6),as.character(c(c(-2,-1),c(1:4))),pos=1) 43 | abline(v=0,h=0) 44 | arrows(0,0,2,3,lwd=3) 45 | segments(2,0,2,3,lty=2) 46 | segments(0,3,2,3,lty=2) 47 | text(2,3," Y",pos=4,cex=3) 48 | ``` 49 | 50 | However, mathematically we can represent the point $(2,3)$ with other linear combinations: 51 | 52 | $$ 53 | \begin{align*} 54 | Y &= \begin{pmatrix} 2 \\ 3\end{pmatrix} \\ 55 | &= 2.5 \begin{pmatrix} 1\\ 1\end{pmatrix} + -1 \begin{pmatrix} \phantom{-}0.5\\ -0.5\end{pmatrix} 56 | \end{align*}$$ 57 | 58 | The new coordinates are: 59 | 60 | $$Z = \begin{pmatrix} 2.5 \\ -1 \end{pmatrix}$$ 61 | 62 | Graphically, we can see that the coordinates are the projections to the spaces defined by the new basis: 63 | 64 | ```{r,fig.cap="Plot of (2,3) as a vector in a rotatated space, relative to the original dimensions."} 65 | library(rafalib) 66 | mypar() 67 | plot(c(-2,4),c(-2,4),xlab="Dimension 1",ylab="Dimension 2", 68 | type="n",xaxt="n",yaxt="n",bty="n") 69 | text(rep(0,6),c(c(-2,-1),c(1:4)),as.character(c(c(-2,-1),c(1:4))),pos=2) 70 | text(c(c(-2,-1),c(1:4)),rep(0,6),as.character(c(c(-2,-1),c(1:4))),pos=1) 71 | abline(v=0,h=0) 72 | abline(0,1,col="red") 73 | abline(0,-1,col="red") 74 | arrows(0,0,2,3,lwd=3) 75 | y=c(2,3) 76 | x1=c(1,1)##new basis 77 | x2=c(0.5,-0.5)##new basis 78 | c1 = crossprod(x1,y)/crossprod(x1) 79 | c2 = crossprod(x2,y)/crossprod(x2) 80 | segments(x1[1]*c1,x1[2]*c1,y[1],y[2],lty=2) 81 | segments(x2[1]*c2,x2[2]*c2,y[1],y[2],lty=2) 82 | text(2,3," Y",pos=4,cex=3) 83 | ``` 84 | 85 | We can go back and forth between these two representations of $(2,3)$ using matrix multiplication. 
86 | 87 | $$ 88 | Y = AZ\\ 89 | $$ 90 | 91 | $$ 92 | A^{-1} Y = Z\\ 93 | $$ 94 | 95 | $$ 96 | A= \begin{pmatrix} 1& \phantom{-}0.5\\ 1 & -0.5\end{pmatrix} \implies 97 | A^{-1}= \begin{pmatrix} 0.5& 0.5 \\ 1 &-1\end{pmatrix} 98 | $$ 99 | 100 | $Z$ and $Y$ carry the same information, but in a different _coordinate system_. 101 | 102 | #### Example: Twin heights 103 | 104 | Here are 100 two dimensional points $Y$ 105 | 106 | ```{r twin-heights,fig.cap="Twin 2 heights versus twin 1 heights.",echo=FALSE,message=FALSE} 107 | library(MASS) 108 | n = 100 109 | mypar() 110 | set.seed(1) 111 | y=t(mvrnorm(n,c(0,0),matrix(c(1,0.95,0.95,1),2,2))) 112 | plot(y[1,],y[2,],xlab="Twin 1 (standardized height)",ylab="Twin 2 (standardized height)",xlim=c(-3,3),ylim=c(-3,3)) 113 | ``` 114 | 115 | Here are the rotations: $Z = A^{-1} Y$ 116 | 117 | ```{r twin-heights-rotated,fig.cap="Rotation of twin 2 heights versus twin 1 heights.",echo=FALSE,message=FALSE} 118 | A = matrix(c(0.5,1,0.5,-1),2,2) 119 | z = A%*%y 120 | mypar() 121 | plot(z[1,],z[2,],xlab="Average",ylab="Difference",xlim=c(-3,3),ylim=c(-3,3)) 122 | ``` 123 | 124 | What we have done here is rotate the data so that the first coordinate of $Z$ is the average height, while the second is the difference between twin heights. 125 | 126 | We have used the singular value decomposition to find principal components. It is sometimes useful to think of the SVD as a rotation, for example $\mathbf{U}^\top \mathbf{Y}$, that gives us a new coordinate system $\mathbf{DV}^\top$ in which the dimensions are ordered by how much variance they explain. 
127 | 128 | -------------------------------------------------------------------------------- /inference/R_refresher.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: R refresher 4 | --- 5 | 6 | 7 | ## Data Summaries: Summary, str 8 | 9 | First we load an example data frame: 10 | 11 | ```{r} 12 | rats <- data.frame(id = paste0("rat",1:10), 13 | sex = factor(rep(c("female","male"),each=5)), 14 | weight = c(2,4,1,11,18,12,7,12,19,20), 15 | length = c(100,105,115,130,95,150,165,180,190,175)) 16 | rats 17 | ``` 18 | 19 | The `summary` and `str` functions are two helpful functions for getting a sense of data. `summary` works on vectors or matrix-like objects (including data.frames). `str` works on an arbitrary R object and will compactly display the structure. 20 | 21 | ```{r} 22 | summary(rats) 23 | summary(rats$weight) 24 | str(rats) 25 | ``` 26 | 27 | ## Aligning Two Objects: Match, Merge 28 | 29 | We load another example data frame, with the original ID and another secret ID. Suppose we want to sort the original data frame by the secret ID. 30 | 31 | ```{r} 32 | ratsTable <- data.frame(id = paste0("rat",c(6,9,7,3,5,1,10,4,8,2)), 33 | secretID = 1:10) 34 | ratsTable 35 | # wrong! 36 | cbind(rats, ratsTable) 37 | ``` 38 | 39 | `match` is a very useful function in R. It can give us this order, but it's also easy to get its arguments mixed up. Remember that `match` gives you, for each element in the first vector, the index of the first match in the second vector. So typically the data.frame or vector you are reordering would appear as the second argument to `match`. It's always a good idea to check that you got it right, which you can do by using `cbind` to line up both data frames. 
40 | 41 | ```{r} 42 | match(ratsTable$id, rats$id) 43 | rats[match(ratsTable$id, rats$id),] 44 | cbind(rats[match(ratsTable$id, rats$id),], ratsTable) 45 | ``` 46 | 47 | Or you can use the `merge` function which will handle everything for you. You can tell it the names of the columns to merge on, or it will look for columns with the same name. 48 | 49 | ```{r} 50 | ratsMerged <- merge(rats, ratsTable, by.x="id", by.y="id") 51 | ratsMerged[order(ratsMerged$secretID),] 52 | ``` 53 | 54 | ## Analysis Over Groups: split, tapply, and dplyr library 55 | 56 | Suppose we need to calculate the average rat weight for each sex. We could start by splitting the weight vector into a list of weight vectors divided by sex. `split` is a useful function for breaking up a vector into groups defined by a second vector, typically a factor. We can then use the `lapply` function to calculate the average of each element of the list, which are vectors of weights. 57 | 58 | ```{r} 59 | sp <- split(rats$weight, rats$sex) 60 | sp 61 | lapply(sp, mean) 62 | ``` 63 | 64 | A shortcut for this is to use `tapply` and give the function, which should run on each element of the list, as a third argument: 65 | 66 | ```{r} 67 | tapply(rats$weight, rats$sex, mean) 68 | ``` 69 | 70 | R is constantly being developed in the form of add-on packages, which can sometimes greatly simplify basic analysis tasks. A new library "dplyr" can accomplish the same task as above, and can be extended to many other, more complicated operations. The "d" in the name is for data.frame, and the "ply" is because the library attempts to simplify tasks typically performed with the set of functions: `sapply`, `lapply`, `tapply`, etc. 
Here is the same task as before done with the dplyr functions `group_by` and `summarise`: 71 | 72 | ```{r} 73 | library(dplyr) 74 | sexes <- group_by(rats, sex) 75 | summarise(sexes, ave=mean(weight)) 76 | ``` 77 | 78 | With dplyr, you can chain operations using the `%.%` operator: 79 | 80 | ```{r} 81 | rats %.% group_by(sex) %.% summarise(ave=mean(weight)) 82 | ``` 83 | -------------------------------------------------------------------------------- /inference/permutation_tests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Permutation tests 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ```{r,include=FALSE} 12 | set.seed(1) 13 | ``` 14 | 15 | ## Permutation Tests 16 | 17 | Suppose we have a situation in which none of the standard mathematical statistical approximations apply. We have computed a summary statistic, such as the difference in means, but do not have a useful approximation, such as that provided by the CLT. In practice, we do not have access to all values in the population so we can't perform a simulation as done above. Permutation tests can be useful in these scenarios. 18 | 19 | We are back to the scenario where we only have 12 measurements for each group. 20 | 21 | ```{r,message=FALSE} 22 | dat=read.csv("femaleMiceWeights.csv") 23 | 24 | library(dplyr) 25 | 26 | control <- filter(dat,Diet=="chow") %>% select(Bodyweight) %>% unlist 27 | treatment <- filter(dat,Diet=="hf") %>% select(Bodyweight) %>% unlist 28 | obsdiff <- mean(treatment)-mean(control) 29 | ``` 30 | 31 | In previous sections, we showed parametric approaches that helped determine if the observed difference was significant. Permutation tests take advantage of the fact that if we randomly shuffle the case and control labels, then the null is true. 
So we shuffle the cases and control labels and assume that the ensuing distribution approximates the null distribution. Here is how we generate a null distribution by shuffling the data 1,000 times: 32 | 33 | ```{r diff_hist, fig.cap="Histogram of difference between averages from permutations. Vertical line shows the observed difference."} 34 | N <- 12 35 | avgdiff <- replicate(1000, { 36 | all <- sample(c(control,treatment)) 37 | newcontrols <- all[1:N] 38 | newtreatments <- all[(N+1):(2*N)] 39 | return(mean(newtreatments) - mean(newcontrols)) 40 | }) 41 | hist(avgdiff) 42 | abline(v=obsdiff, col="red", lwd=2) 43 | ``` 44 | 45 | How many of the null means are bigger than the observed value? That 46 | proportion would be the p-value for the null. We add a 1 to the 47 | numerator and denominator to account for misestimation of the p-value 48 | (for more details see 49 | [Phipson and Smyth, Permutation P-values should never be zero](http://www.ncbi.nlm.nih.gov/pubmed/21044043)). 50 | 51 | ```{r} 52 | #the proportion of permutations with larger difference 53 | (sum(abs(avgdiff) > abs(obsdiff)) + 1) / (length(avgdiff) + 1) 54 | ``` 55 | 56 | Now let's repeat this experiment for a smaller dataset. We create a smaller dataset by sampling: 57 | 58 | ```{r} 59 | N <- 5 60 | control <- sample(control,N) 61 | treatment <- sample(treatment,N) 62 | obsdiff <- mean(treatment)- mean(control) 63 | ``` 64 | and repeat the exercise: 65 | 66 | 67 | ```{r diff_hist_N50, fig.cap="Histogram of difference between averages from permutations for smaller sample size. Vertical line shows the observed difference."} 68 | avgdiff <- replicate(1000, { 69 | all <- sample(c(control,treatment)) 70 | newcontrols <- all[1:N] 71 | newtreatments <- all[(N+1):(2*N)] 72 | return(mean(newtreatments) - mean(newcontrols)) 73 | }) 74 | hist(avgdiff) 75 | abline(v=obsdiff, col="red", lwd=2) 76 | ``` 77 | 78 | Now the observed difference is not significant using this approach. 
Keep in mind that there is no theoretical guarantee that the null distribution estimated from permutations approximates the actual null distribution. For example, if there is a real difference between the populations, some of the permutations will be unbalanced and will contain some samples that explain this difference. This implies that the null distribution created with permutations will have larger tails than the actual null distribution. This is why permutations result in conservative p-values. For this reason, when we have few samples, we can't do permutations. 79 | 80 | Note also that permutation tests still have assumptions: samples are 81 | assumed to be independent and "exchangeable". If there is hidden 82 | structure in your data, then permutation tests can result in estimated 83 | null distributions that underestimate the size of tails because the 84 | permutations may destroy the existing structure in the original data. 85 | 86 | -------------------------------------------------------------------------------- /inference/populations_and_samples.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Population, Samples, and Estimates 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Populations, Samples and Estimates 12 | 13 | Now that we have introduced the idea of a random variable, a null distribution, and a p-value, we are ready to describe the mathematical theory that permits us to compute p-values in practice. We will also learn about confidence intervals and power calculations. 14 | 15 | #### Population parameters 16 | 17 | A first step in statistical inference is to understand what population 18 | you are interested in. 
In the mouse weight example, we have two 19 | populations: female mice on control diets and female mice on high fat 20 | diets, with weight being the outcome of interest. We consider this 21 | population to be fixed, and the randomness comes from the 22 | sampling. One reason we have been using this dataset as an example is 23 | because we happen to have the weights of all the mice of this 24 | type. We download [this](https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/mice_pheno.csv) file to our working directory and read in to R: 25 | 26 | ```{r,message=FALSE,echo=FALSE} 27 | library(downloader) 28 | dir <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/" 29 | filename <- "mice_pheno.csv" 30 | url <- paste0(dir, filename) 31 | if (!file.exists(filename)) download(url,destfile=filename) 32 | ``` 33 | 34 | ```{r} 35 | dat <- read.csv("mice_pheno.csv") 36 | ``` 37 | 38 | We can then access the population values and determine, for example, how many we have. Here we compute the size of the control population: 39 | 40 | ```{r,message=FALSE} 41 | library(dplyr) 42 | controlPopulation <- filter(dat,Sex == "F" & Diet == "chow") %>% 43 | select(Bodyweight) %>% unlist 44 | length(controlPopulation) 45 | ``` 46 | 47 | We usually denote these values as $x_1,\dots,x_m$. In this case, $m$ is the number computed above. We can do the same for the high fat diet population: 48 | 49 | ```{r} 50 | hfPopulation <- filter(dat,Sex == "F" & Diet == "hf") %>% 51 | select(Bodyweight) %>% unlist 52 | length(hfPopulation) 53 | ``` 54 | 55 | and denote with $y_1,\dots,y_n$. 56 | 57 | We can then define summaries of interest for these populations, such as the mean and variance. 
58 | 59 | The mean: 60 | 61 | $$\mu_X = \frac{1}{m}\sum_{i=1}^m x_i \mbox{ and } \mu_Y = \frac{1}{n} \sum_{i=1}^n y_i$$ 62 | 63 | The variance: 64 | 65 | $$\sigma_X^2 = \frac{1}{m}\sum_{i=1}^m (x_i-\mu_X)^2 \mbox{ and } \sigma_Y^2 = \frac{1}{n} \sum_{i=1}^n (y_i-\mu_Y)^2$$ 66 | 67 | with the standard deviation being the square root of the variance. We refer to such quantities that can be obtained from the population as _population parameters_. The question we started out asking can now be written mathematically: is $\mu_Y - \mu_X = 0$ ? 68 | 69 | Although in our illustration we have all the values and can check if this is true, in practice we do not. For example, in practice it would be prohibitively expensive to buy all the mice in a population. Here we learn how taking a _sample_ permits us to answer our questions. This is the essence of statistical inference. 70 | 71 | #### Sample estimates 72 | 73 | In the previous chapter, we obtained samples of 12 mice from each 74 | population. We represent data from samples with capital letters to 75 | indicate that they are random. This is common practice in statistics, 76 | although it is not always followed. So the samples are $X_1,\dots,X_M$ 77 | and $Y_1,\dots,Y_N$ and, in this case, $N=M=12$. In contrast and as we 78 | saw above, when we list out the values of the population, which are 79 | set and not random, we use lower-case letters. 80 | 81 | Since we want to know if $\mu_Y - \mu_X$ is 0, we consider the sample version: $\bar{Y}-\bar{X}$ with: 82 | 83 | $$ 84 | \bar{X}=\frac{1}{M} \sum_{i=1}^M X_i 85 | \mbox{ and }\bar{Y}=\frac{1}{N} \sum_{i=1}^N Y_i. 86 | $$ 87 | 88 | Note that this difference of averages is also a random 89 | variable. Previously, we learned about the behavior of random variables 90 | with an exercise that involved repeatedly sampling from the original 91 | distribution. Of course, this is not an exercise that we can execute 92 | in practice. 
In this particular case it would involve buying 24 mice 93 | over and over again. Here we describe the mathematical theory that 94 | relates $\bar{X}$ to $\mu_X$ and $\bar{Y}$ to $\mu_Y$, 95 | that will in turn help us understand the relationship between 96 | $\bar{Y}-\bar{X}$ and $\mu_Y - \mu_X$. Specifically, we will describe 97 | how the Central Limit Theorem permits us to use an approximation to 98 | answer this question, as well as motivate the widely used t-distribution. 99 | 100 | -------------------------------------------------------------------------------- /intro/dplyr_intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Brief Introduction to `dplyr` 4 | --- 5 | 6 | ## Brief Introduction to `dplyr` 7 | 8 | The learning curve for R syntax is slow. One of the more difficult aspects that requires some getting used to is subsetting data tables. The `dplyr` package brings these tasks closer to English and we are therefore going to introduce two simple functions: one is used to subset and the other to select columns. 9 | 10 | Take a look at the dataset we read in: 11 | ```{r} 12 | filename <- "femaleMiceWeights.csv" 13 | dat <- read.csv(filename) 14 | head(dat) #In R Studio use View(dat) 15 | ``` 16 | 17 | There are two types of diets, which are denoted in the first column. If we want just the weights, we only need the second column. So if we want the weights for mice on the `chow` diet, we subset and filter like this: 18 | 19 | ```{r,message=FALSE} 20 | library(dplyr) 21 | chow <- filter(dat, Diet=="chow") #keep only the ones with chow diet 22 | head(chow) 23 | ``` 24 | 25 | And now we can select only the column with the values: 26 | 27 | ```{r} 28 | chowVals <- select(chow,Bodyweight) 29 | head(chowVals) 30 | ``` 31 | 32 | A nice feature of the `dplyr` package is that you can perform consecutive tasks by using what is called a "pipe".
In `dplyr` we use `%>%` to denote a pipe. This symbol tells the program to first do one thing and then do something else to the result of the first. Hence, we can perform several data manipulations in one line. For example: 33 | 34 | ```{r} 35 | chowVals <- filter(dat, Diet=="chow") %>% select(Bodyweight) 36 | ``` 37 | 38 | In the second task, we no longer have to specify the object we are editing since it is whatever comes from the previous call. 39 | 40 | Also, note that if `dplyr` receives a `data.frame` it will return a `data.frame`. 41 | ```{r} 42 | class(dat) 43 | class(chowVals) 44 | ``` 45 | 46 | For pedagogical reasons, we will often want the final result to be a simple `numeric` vector. To obtain such a vector with `dplyr`, we can apply the `unlist` function which turns `lists`, such as `data.frames`, into `numeric` vectors: 47 | 48 | ```{r} 49 | chowVals <- filter(dat, Diet=="chow") %>% select(Bodyweight) %>% unlist 50 | class( chowVals ) 51 | ``` 52 | 53 | 54 | To do this in R without `dplyr` the code is the following: 55 | 56 | ```{r} 57 | chowVals <- dat[ dat$Diet=="chow", colnames(dat)=="Bodyweight"] 58 | ``` 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /intro/github.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Installing software from github.com 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | 12 | ```{r} 13 | #install.packages("devtools") 14 | library(devtools) 15 | #install_github("rafalib","ririzarr") 16 | library(rafalib) 17 | mypar 18 | shist(rnorm(100)) 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /intro/system_files.Rmd: 
-------------------------------------------------------------------------------- 1 | ## R system files 2 | Note that this file is also included in the 'dagdata' package. If you have the package installed then this file is already in your system and you can use the 'system.file' function to find it. 3 | 4 | ```{r} 5 | dir <- system.file(package="dagdata") 6 | list.files(dir) 7 | list.files(file.path(dir,"extdata")) 8 | filename <- file.path(dir,"extdata/mice_pheno.csv") 9 | dat <- read.csv(filename) 10 | ``` 11 | 12 | 13 | ## Using download 14 | 15 | url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/mice_pheno.csv" 16 | filename <- tempfile() 17 | if (!file.exists(filename)) download.file(url,destfile=filename,method="curl") 18 | dat <- read.csv(filename) 19 | -------------------------------------------------------------------------------- /linear/linear_models_in_practice.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Linear models in practice 3 | layout: page 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | #### The mouse diet example 12 | 13 | We will demonstrate how to analyze the high fat diet data using linear models instead of directly applying a t-test. We will demonstrate how ultimately these two approaches are equivalent. 
14 | 15 | We start by reading in the data and creating a quick stripchart: 16 | 17 | ```{r,echo=FALSE} 18 | url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/femaleMiceWeights.csv" 19 | filename <- "femaleMiceWeights.csv" 20 | library(downloader) 21 | if (!file.exists(filename)) download(url, filename) 22 | ``` 23 | 24 | ```{r,echo=FALSE} 25 | set.seed(1) #same jitter in stripchart 26 | ``` 27 | 28 | ```{r bodyweight_by_diet_stripchart, fig.cap="Mice bodyweights stratified by diet."} 29 | dat <- read.csv("femaleMiceWeights.csv") ##previously downloaded 30 | stripchart(dat$Bodyweight ~ dat$Diet, vertical=TRUE, method="jitter", 31 | main="Bodyweight over Diet") 32 | ``` 33 | 34 | We can see that the high fat diet group appears to have higher weights on average, although there is overlap between the two samples. 35 | 36 | For demonstration purposes, we will build the design matrix $\mathbf{X}$ using the formula `~ Diet`. The group with the 1's in the second column is determined by the level of `Diet` which comes second; that is, the non-reference level. 37 | 38 | ```{r} 39 | levels(dat$Diet) 40 | X <- model.matrix(~ Diet, data=dat) 41 | head(X) 42 | ``` 43 | 44 | ## The Mathematics Behind lm() 45 | 46 | Before we use our shortcut for running linear models, `lm`, we want to review what will happen internally. Inside of `lm`, we will form the design matrix $\mathbf{X}$ and calculate the $\boldsymbol{\beta}$, which minimizes the sum of squares using the previously described formula. The formula for this solution is: 47 | 48 | $$ \hat{\boldsymbol{\beta}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{Y} $$ 49 | 50 | We can calculate this in R using our matrix multiplication operator `%*%`, the inverse function `solve`, and the transpose function `t`. 
51 | 52 | 53 | ```{r} 54 | Y <- dat$Bodyweight 55 | X <- model.matrix(~ Diet, data=dat) 56 | solve(t(X) %*% X) %*% t(X) %*% Y 57 | ``` 58 | 59 | These coefficients are the average of the control group and the difference of the averages: 60 | 61 | 62 | ```{r} 63 | s <- split(dat$Bodyweight, dat$Diet) 64 | mean(s[["chow"]]) 65 | mean(s[["hf"]]) - mean(s[["chow"]]) 66 | ``` 67 | 68 | Finally, we use our shortcut, `lm`, to run the linear model: 69 | 70 | ```{r} 71 | fit <- lm(Bodyweight ~ Diet, data=dat) 72 | summary(fit) 73 | (coefs <- coef(fit)) 74 | ``` 75 | 76 | #### Examining the coefficients 77 | 78 | The following plot provides a visualization of the meaning of the coefficients with colored arrows (code not shown): 79 | 80 | ```{r parameter_estimate_illustration, fig.cap="Estimated linear model coefficients for bodyweight data illustrated with arrows.",echo=FALSE} 81 | stripchart(dat$Bodyweight ~ dat$Diet, vertical=TRUE, method="jitter", 82 | main="Bodyweight over Diet", ylim=c(0,40), xlim=c(0,3)) 83 | a <- -0.25 84 | lgth <- .1 85 | library(RColorBrewer) 86 | cols <- brewer.pal(3,"Dark2") 87 | abline(h=0) 88 | arrows(1+a,0,1+a,coefs[1],lwd=3,col=cols[1],length=lgth) 89 | abline(h=coefs[1],col=cols[1]) 90 | arrows(2+a,coefs[1],2+a,coefs[1]+coefs[2],lwd=3,col=cols[2],length=lgth) 91 | abline(h=coefs[1]+coefs[2],col=cols[2]) 92 | legend("right",names(coefs),fill=cols,cex=.75,bg="white") 93 | ``` 94 | 95 | To make a connection with material presented earlier, this simple linear model is actually giving us the same result (the t-statistic and p-value) for the difference as a specific kind of t-test. This is the t-test between two groups with the assumption that the population standard deviation is the same for both groups. This was encoded into our linear model when we assumed that the errors $\boldsymbol{\varepsilon}$ were all equally distributed. 
96 | 97 | Although in this case the linear model is equivalent to a t-test, we will soon explore more complicated designs, where the linear model is a useful extension. Below we demonstrate that one does in fact get the exact same results: 98 | 99 | Our `lm` estimates were: 100 | 101 | ```{r} 102 | summary(fit)$coefficients 103 | ``` 104 | 105 | And the t-statistic is the same: 106 | 107 | ```{r} 108 | ttest <- t.test(s[["hf"]], s[["chow"]], var.equal=TRUE) 109 | summary(fit)$coefficients[2,3] 110 | ttest$statistic 111 | ``` 112 | -------------------------------------------------------------------------------- /linear/linear_models_intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Introduction to Linear Models 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | # Linear Models 12 | 13 | Many of the models we use in data analysis can be presented using matrix algebra. We refer to these types of models as _linear models_. "Linear" here does not refer to lines, but rather to linear combinations. The representations we describe are convenient because we can write models more succinctly and we have the matrix algebra mathematical machinery to facilitate computation. In this chapter, we will describe in some detail how we use matrix algebra to represent and fit. 14 | 15 | In this book, we focus on linear models that represent dichotomous groups: treatment versus control, for example. The effect of diet on mice weights is an example of this type of linear model. Here we describe slightly more complicated models, but continue to focus on dichotomous variables. 16 | 17 | As we learn about linear models, we need to remember that we are still working with random variables. 
This means that the estimates we obtain using linear models are also random variables. Although the mathematics is more complex, the concepts we learned in previous chapters apply here. We begin with some exercises to review the concept of random variables in the context of linear models. 18 | 19 | 20 | -------------------------------------------------------------------------------- /list_libs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for dir in `ls -d */`; 4 | do echo $dir; 5 | echo ""; 6 | grep -h 'library(' $dir/*.Rmd | sort | uniq; 7 | echo ""; 8 | done 9 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | course1rmd := $(wildcard course1/*.Rmd) 2 | course2rmd := $(wildcard course2/*.Rmd) 3 | course3rmd := $(wildcard course3/*.Rmd) 4 | course4rmd := $(wildcard course4/*.Rmd) 5 | 6 | course1md := $(course1rmd:.Rmd=.md) 7 | course2md := $(course2rmd:.Rmd=.md) 8 | course3md := $(course3rmd:.Rmd=.md) 9 | course4md := $(course4rmd:.Rmd=.md) 10 | 11 | all: course1 course2 course3 course4 foot 12 | 13 | course1: $(course1md) 14 | course2: $(course2md) 15 | course3: $(course3md) 16 | course4: $(course4md) 17 | 18 | %.md: %.Rmd 19 | cd $(dir $^); Rscript -e 'knit("$(notdir $^)")' 20 | 21 | foot: 22 | Rscript footnotes.R 23 | 24 | -------------------------------------------------------------------------------- /methyl/inference_for_DNAmeth.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Inference for DNA methylation data 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ```{r} 12 | library(minfi) ##Bioc 13 | library(IlluminaHumanMethylation450kmanifest) ##Bioc 
14 | library(doParallel) ##CRAN 15 | library(pkgmaker) 16 | library(rafalib) 17 | ``` 18 | 19 | ```{r} 20 | path="/Users/ririzarr/myDocuments/teaching/HarvardX/tcgaMethylationSubset" # use your own path to downloaded data 21 | targets=read.delim(file.path (path,"targets.txt"),as.is=TRUE) 22 | table(targets$Tissue,targets$Status) 23 | ``` 24 | 25 | For illustration we will read in the normal colon and lung 26 | 27 | ```{r} 28 | index = which( targets$Status=="normal" & targets$Tissue%in%c("colon","lung") ) 29 | targets = targets[index,] 30 | ``` 31 | 32 | ```{r} 33 | dat = read.metharray.exp(base=path,targets = targets, verbose=TRUE) 34 | dat = preprocessIllumina(dat) 35 | dat = mapToGenome(dat) 36 | dat = ratioConvert(dat,type="Illumina") 37 | ``` 38 | 39 | ```{r} 40 | library(doParallel) 41 | detectCores() 42 | registerDoParallel(cores = 4) 43 | ``` 44 | 45 | ```{r} 46 | tissue =pData(dat)$Tissue 47 | X= model.matrix(~tissue) 48 | index = which(seqnames(dat)=="chr22") 49 | dat = dat[index,] ## for illustrative purposes 50 | res=bumphunter(dat,X,cutoff=0.1,B=1000) 51 | head(res$tab) 52 | ``` 53 | 54 | 55 | ```{r,message=FALSE} 56 | library(rafalib) 57 | library(AnnotationHub) 58 | cgi = AnnotationHub()[["AH5086"]] 59 | ``` 60 | 61 | ```{r} 62 | tab = res$tab[res$tab$fwer <= 0.05,] 63 | tab = makeGRangesFromDataFrame(tab,keep.extra.columns = TRUE) 64 | 65 | map=distanceToNearest(tab,cgi) 66 | d = mcols(map)$distance 67 | prop.table( table( cut(as.numeric(d),c(0,1,2000,5000,Inf),include.lowest=TRUE,right=FALSE) )) 68 | 69 | null = granges(dat) 70 | nulltab = makeGRangesFromDataFrame(null,keep.extra.columns = TRUE) 71 | 72 | nullmap=distanceToNearest(nulltab,cgi) 73 | nulld = mcols(nullmap)$distance 74 | prop.table( table( cut(nulld,c(0,1,2000,5000,Inf),include.lowest=TRUE,right=FALSE) )) 75 | ``` 76 | 77 | ```{r} 78 | beta = getBeta(dat) 79 | cols = as.factor(pData(dat)$Tissue) 80 | 81 | tab = tab[order(-mcols(tab)$area)] 82 | tab = tab+3000 ##add 3000 to each side 
83 | mypar(1,1) 84 | i=17 85 | dataIndex = which(granges(dat)%over%tab[i]) 86 | cgiIndex = which(cgi%over%tab[i]) 87 | thecgi = cgi[cgiIndex] 88 | 89 | pos = start(dat)[dataIndex] 90 | xlim=range(c(pos,start(thecgi),end(thecgi)) ) 91 | 92 | y = beta[dataIndex,] 93 | 94 | matplot(pos,y,col=as.numeric(cols) , xlim=xlim, ylim=c(0,1),ylab="Methylation") 95 | apply(cbind(start(thecgi),end(thecgi)),1,function(x) segments(x[1],0,x[2],0,lwd=4,col=3)) 96 | 97 | plot(pos,res$fitted[dataIndex],xlim=xlim,ylim=c(-0.4,0.4)) 98 | abline(h=0) 99 | apply(cbind(start(thecgi),end(thecgi)),1,function(x) segments(x[1],0,x[2],0,lwd=4,col=3)) 100 | 101 | ``` 102 | 103 | ```{r} 104 | table(getIslandStatus(dat)) 105 | ``` 106 | 107 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /methyl/minfi.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Reading 450K idat files with the minfi package 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | In this unit we will demonstrate how to read idat files from the Illumina 450K DNA methylation array. We make use of the Bioconductor minfi package [cite 24478339]. 12 | 13 | ```{r} 14 | # BiocManager::install(c("minfi","IlluminaHumanMethylation450kmanifest","IlluminaHumanMethylation450kanno.ilmn12.hg19")) 15 | library(minfi) 16 | ``` 17 | 18 | The first step is to determine the basename of the idat files. Note that for each sample we have two files: one for red and green channels respectively. These files are found here: 19 | 20 | ```{r} 21 | path <- "idats" 22 | list.files(path) 23 | ``` 24 | 25 | Let's start by reading in the csv file, which contains clinical information. This has one row for each sample and one of the columns includes the "basenames" for the files. 
26 | 27 | ```{r} 28 | targets<-read.csv("idats/targets.csv",as.is=TRUE) 29 | names(targets) 30 | targets$Basename 31 | ``` 32 | 33 | To make this script work in any working directory we can edit that column to contain the absolute paths. Then we are ready to read in the raw data with `read.metharray`: 34 | 35 | ```{r} 36 | targets$Basename <- file.path(path,targets$Basename) 37 | rgset <- read.metharray(targets$Basename,verbose=TRUE) 38 | pData(rgset)<-as(targets, "DataFrame") 39 | ``` 40 | 41 | We now have the raw data, red and green intensities which we have access to: 42 | ```{r} 43 | dim(getRed(rgset)) 44 | dim(getGreen(rgset)) 45 | ``` 46 | 47 | If you are not interested in developing preprocessing algorithms then you can use the built in preprocessing algorithm and go straight to an object that gives you access to methylation estimates: 48 | 49 | ```{r} 50 | mset <- preprocessIllumina(rgset) 51 | ``` 52 | 53 | This performs the default preprocessing algorithm developed by Illumina. However, for this to be useful, we want to have the locations of each CpG, and to do that we need to map the CpGs to the genome. minfi keeps this information modular so that when the genome annotation gets updated, one can easily change the mapping. 54 | ```{r} 55 | mset <- mapToGenome(mset) 56 | ``` 57 | 58 | Now we are ready to obtain the methylation values and CpG locations. 
59 | 60 | ```{r} 61 | dim(getBeta(mset,type="Illumina")) ##the argument type="Illumina" gives us default procedure 62 | head(granges(mset)) 63 | ``` 64 | 65 | We can also use functions such as `getSex` and `getQC` on the mset object: 66 | ```{r} 67 | colData(mset)<-getSex(mset) 68 | plotSex(mset) 69 | plot(as.matrix(getQC(mset))) 70 | ``` 71 | -------------------------------------------------------------------------------- /ml/conditional_expectation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Conditional probabilities and expectations 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Conditional Probabilities and Expectations 12 | 13 | Prediction problems can be divided into categorical and continuous outcomes. However, many of the algorithms can be applied to both due to the connection between _conditional probabilities_ and _conditional expectations_. 14 | 15 | For categorical data, for example binary outcomes, if we know the probability of $Y$ being any of the possible outcomes $k$ given a set of predictors $X=(X_1,\dots,X_p)^\top$, 16 | 17 | $$ 18 | f_k(x) = \mbox{Pr}(Y=k \mid X=x) 19 | $$ 20 | 21 | we can optimize our predictions. Specifically, for any $x$ we predict the $k$ that has the largest probability $f_k(x)$. 22 | 23 | To simplify the exposition below, we will consider the case of binary data. You can think of the probability $\mbox{Pr}(Y=1 \mid X=x)$ as the proportion of 1s in the stratum of the population for which $X=x$. Given that the expectation is the average of all $Y$ values, in this case the expectation is equivalent to the probability: $f(x) \equiv \mbox{E}(Y \mid X=x)=\mbox{Pr}(Y=1 \mid X=x)$. We therefore use only the expectation in the descriptions below as it is more general. 
24 | 25 | In general, the expected value has an attractive mathematical property, which is that it minimizes the expected distance between the predictor $\hat{Y}$ and $Y$: 26 | 27 | $$ 28 | \mbox{E}\{ (\hat{Y} - Y)^2 \mid X=x \} 29 | $$ 30 | 31 | 32 | #### Regression in the context of prediction 33 | 34 | 35 | 36 | We use the son and father height example to illustrate how regression can be interpreted as a machine learning technique. In our example, we are trying to predict the son's height $Y$ based on the father's $X$. Here we have only one predictor. Now if we were asked to predict the height of a randomly selected son, we would go with the average height: 37 | 38 | 39 | ```{r height_hist,message=FALSE,fig.cap="Histogram of son heights."} 40 | library(rafalib) 41 | mypar(1,1) 42 | data(father.son,package="UsingR") 43 | x=round(father.son$fheight) ##round to nearest inch 44 | y=round(father.son$sheight) 45 | hist(y,breaks=seq(min(y),max(y))) 46 | abline(v=mean(y),col="red",lwd=2) 47 | ``` 48 | 49 | In this case, we can also approximate the distribution of $Y$ as normal, which implies the mean maximizes the probability density. 50 | 51 | Let's imagine that we are given more information. We are told that the father of this randomly selected son has a height of 71 inches (1.25 SDs taller than the average). What is our prediction now? 52 | 53 | 54 | ```{r conditional_distribution, fig.cap="Son versus father height (left) with the red lines denoting the stratum defined by conditioning on fathers being 71 inches tall. 
Conditional distribution: son height distribution of stratum defined by 71 inch fathers.",fig.width=10.5,fig.height=5.25} 55 | mypar(1,2) 56 | plot(x,y,xlab="Father's height in inches",ylab="Son's height in inches", 57 | main=paste("correlation =",signif(cor(x,y),2))) 58 | abline(v=c(-0.35,0.35)+71,col="red") 59 | hist(y[x==71],xlab="Heights",nc=8,main="",xlim=range(y)) 60 | ``` 61 | 62 | 63 | The best guess is still the expectation, but our stratum has changed from all the data, to only the $Y$ with $X=71$. So we can stratify and take the average, which is the conditional expectation. Our prediction for any $x$ is therefore: 64 | 65 | $$ 66 | f(x) = \mbox{E}(Y \mid X=x) 67 | $$ 68 | 69 | It turns out that because this data is approximated by a bivariate normal distribution, using calculus, we can show that: 70 | 71 | $$ 72 | f(x) = \mu_Y + \rho \frac{\sigma_Y}{\sigma_X} (x-\mu_X) 73 | $$ 74 | 75 | and if we estimate these five parameters from the sample, we get the regression line: 76 | 77 | ```{r regression, fig.cap="Son versus father height showing predicted heights based on regression line (left). Conditional distribution with vertical line representing regression prediction.",fig.width=10.5,fig.height=5.25} 78 | mypar(1,2) 79 | plot(x,y,xlab="Father's height in inches",ylab="Son's height in inches", 80 | main=paste("correlation =",signif(cor(x,y),2))) 81 | abline(v=c(-0.35,0.35)+71,col="red") 82 | 83 | fit <- lm(y~x) 84 | abline(fit,col=1) 85 | 86 | hist(y[x==71],xlab="Heights",nc=8,main="",xlim=range(y)) 87 | abline(v = fit$coef[1] + fit$coef[2]*71, col=1) 88 | ``` 89 | 90 | In this particular case, the regression line provides an optimal prediction function for $Y$. But this is not generally true because, in the typical machine learning problems, the optimal $f(x)$ is rarely a simple line. 
91 | 92 | -------------------------------------------------------------------------------- /modeling/bayes-gif.R: -------------------------------------------------------------------------------- 1 | ##If you have ImageMagic installed on your computer, 2 | ##you can create an animated gif with code 3 | ##below. Note that the computation will make several 4 | ##gifs so it might take some time to compute. 5 | ##Make sure to pick a `filename` that does not already exist in the working directory. 6 | 7 | set.seed(3) 8 | prev <- 1/20 9 | acc <- 0.90 10 | ##For the animation we use 20 x 80 11 | N <- 20; M <- 80 12 | x<-rbinom(N*M,1,p=prev) 13 | cols <- c("grey","red") 14 | people <- expand.grid(1:M,N:1) 15 | people2 <- expand.grid(1:(M/2),N:1) 16 | 17 | cols1 <- cols[x+1] 18 | cols2 <- rep(NA,length(cols1));count2<-1 19 | cols3 <- rep(NA,length(cols1));count3<-1 20 | 21 | library(rafalib) 22 | library(animation) 23 | filename <- 'bayes.gif' 24 | saveGIF({ 25 | i=1 26 | while(count3 <= N*M/2 & count2 <= N*M/2){ 27 | test <- sample(100,1);min=round(100*acc) 28 | mypar() 29 | layout(matrix(c(1,2,1,3),2,2)) 30 | plot(people,col=cols1,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Population: ",round(mean(x)*100),"% are red")) 31 | if(test>min) axis(side=1,M/2,"X",col="red",tick=FALSE,cex.axis=3,line=1.5) else axis(side=1,M/2,"O",col="black",tick=FALSE,cex.axis=2,line=1.5) 32 | points(people[i,],pch=1,cex=1.5) 33 | if(all(is.na(cols2))) plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Positive") else plot(people2,col=cols2,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Tested Positive: ",round(mean(cols2=="red",na.rm=TRUE)*100),"% are red")) 34 | if(all(is.na(cols3))) plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Negative") else plot(people2,col=cols3,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Tested Negative: ",round(mean(cols3=="red",na.rm=TRUE)*100,1),"% are red")) 35 | outcome <- 
ifelse(x[i]==1, as.numeric(test<=min), as.numeric(test>min)) 36 | if(outcome==0) {cols3[count3]<-cols1[i];count3<-count3+1} else {cols2[count2]<-cols1[i];count2<-count2+1} 37 | i<-i+1 38 | }},filename, interval = .1, ani.width = 800, ani.height = 500) 39 | 40 | -------------------------------------------------------------------------------- /renaming_map.md: -------------------------------------------------------------------------------- 1 | # renaming map 2 | 3 | from | to 4 | --- | --- 5 | 1 | intro 6 | 1 | inference 7 | 1 | eda 8 | 1 | robust 9 | 2 | matrixalg 10 | 2 | linear 11 | 3 | advinference 12 | 3 | modeling 13 | 3 | highdim 14 | 3 | ml 15 | 3 | batch 16 | 4 | bioc 17 | 5 | rnaseq 18 | 6 | variants 19 | 7 | chipseq 20 | 8 | methyl 21 | -------------------------------------------------------------------------------- /rnaseq/airway_sample_table.csv: -------------------------------------------------------------------------------- 1 | "","SampleName","cell","dex","albut","Run","avgLength","Experiment","Sample","BioSample" 2 | "SRR1039508","GSM1275862","N61311","untrt","untrt","SRR1039508",126,"SRX384345","SRS508568","SAMN02422669" 3 | "SRR1039509","GSM1275863","N61311","trt","untrt","SRR1039509",126,"SRX384346","SRS508567","SAMN02422675" 4 | "SRR1039512","GSM1275866","N052611","untrt","untrt","SRR1039512",126,"SRX384349","SRS508571","SAMN02422678" 5 | "SRR1039513","GSM1275867","N052611","trt","untrt","SRR1039513",87,"SRX384350","SRS508572","SAMN02422670" 6 | "SRR1039516","GSM1275870","N080611","untrt","untrt","SRR1039516",120,"SRX384353","SRS508575","SAMN02422682" 7 | "SRR1039517","GSM1275871","N080611","trt","untrt","SRR1039517",126,"SRX384354","SRS508576","SAMN02422673" 8 | "SRR1039520","GSM1275874","N061011","untrt","untrt","SRR1039520",101,"SRX384357","SRS508579","SAMN02422683" 9 | "SRR1039521","GSM1275875","N061011","trt","untrt","SRR1039521",98,"SRX384358","SRS508580","SAMN02422677" 10 | 
-------------------------------------------------------------------------------- /rnaseq/fastq.md: -------------------------------------------------------------------------------- 1 | # Fastq files 2 | 3 | ## Links for this experiment 4 | 5 | Study information at the Sequence Read Archive: 6 | 7 | http://www.ncbi.nlm.nih.gov/Traces/sra/?study=SRP033351 8 | 9 | Himes et al paper at PubMed Central: 10 | 11 | http://www.ncbi.nlm.nih.gov/pubmed/24926665 12 | 13 | Example sample table stored in our course repo on github: 14 | 15 | https://github.com/genomicsclass/labs/blob/master/course5/airway_sample_table.csv 16 | 17 | Details on creating such a sample table from SRA and GEO: 18 | 19 | http://www.bioconductor.org/packages/release/data/experiment/vignettes/airway/inst/doc/airway.html 20 | 21 | The European Nucleotide Archive (EMBL-EBI): 22 | 23 | http://www.ebi.ac.uk/ena 24 | 25 | The Sequence Read Archive (NCBI): 26 | 27 | http://www.ncbi.nlm.nih.gov/sra/ 28 | 29 | ## Fastq file commands 30 | 31 | Downloading from the ENA: 32 | 33 | ``` 34 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR103/008/SRR1039508/SRR1039508_1.fastq.gz 35 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR103/008/SRR1039508/SRR1039508_2.fastq.gz 36 | ``` 37 | 38 | Alias for ls: 39 | 40 | ``` 41 | alias ll='ls -lGh' 42 | ``` 43 | 44 | Unzipping: 45 | 46 | ``` 47 | gunzip *.fastq.gz 48 | ``` 49 | 50 | Looking at the FASTQ files: 51 | 52 | ``` 53 | less SRR1039508_1.fastq 54 | wc -l SRR1039508_1.fastq 55 | ``` 56 | 57 | Quality control with [fastqc](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) 58 | 59 | ``` 60 | fastqc --noextract SRR1039508_1.fastq SRR1039508_2.fastq 61 | ``` 62 | 63 | -------------------------------------------------------------------------------- /rnaseq/genome_align_STAR.md: -------------------------------------------------------------------------------- 1 | # STAR commands 2 | 3 | The STAR homepage: 4 | 5 | https://github.com/alexdobin/STAR 6 | 7 | The STAR paper: 8 | 9 
| http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3530905/ 10 | 11 | The STAR manual: 12 | 13 | https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf 14 | 15 | Downloading genome FASTA and GTF files from ENSEMBL: 16 | 17 | http://ensembl.org 18 | 19 | http://ensembl.org/info/data/ftp/index.html 20 | 21 | Generating the genome: 22 | 23 | Note the `sjdbOverhang` is used for constructing the splice junction database. It should be set to (read length - 1), and according to the manual a general value of 100 will work as well. 24 | 25 | For this limited demonstration, I am only going to align to the genes on chromosome 1, so I subset the GTF file: 26 | 27 | ``` 28 | grep -P '^1\t' Homo_sapiens.GRCh38.79.gtf > Homo_sapiens.GRCh38.79.chrom1.gtf 29 | ``` 30 | 31 | We then moved files in subdirectories, and created one for the STAR genome index: 32 | 33 | ``` 34 | mkdir gtf 35 | mkdir genome 36 | mv *.gtf gtf 37 | mv *.fa genome 38 | mkdir GRCh38.79.chrom1 39 | ``` 40 | 41 | The STAR command to generate the genome index: 42 | 43 | ``` 44 | STAR --runMode genomeGenerate \ 45 | --genomeDir GRCh38.79.chrom1 \ 46 | --genomeFastaFiles genome/Homo_sapiens.GRCh38.dna.chromosome.1.fa \ 47 | --sjdbGTFfile gtf/Homo_sapiens.GRCh38.79.chrom1.gtf \ 48 | --sjdbOverhang 62 49 | ``` 50 | 51 | Mapping the reads: 52 | 53 | ``` 54 | STAR --runThreadN 12 \ 55 | --genomeDir GRCh38.79.chrom1 \ 56 | --readFilesIn fastq/SRR1039508_1.fastq fastq/SRR1039508_2.fastq 57 | ``` 58 | -------------------------------------------------------------------------------- /rnaseq/r_bioc_links.md: -------------------------------------------------------------------------------- 1 | # R and Bioconductor links 2 | 3 | * [Central R Archive Network (CRAN)](http://cran.rstudio.com/) 4 | * [RStudio](http://www.rstudio.com/) 5 | * [Bioconductor](http://bioconductor.org/install) 6 | 7 | Once you have installed R and the `BiocManager` package, running the following lines in your console will install Bioconductor: 8 | 
9 | ``` 10 | BiocManager::install() 11 | ``` 12 | 13 | Make sure to hit `[a]` to update all packages. This is important so that your answers will match the answers accepted by the grading bot. 14 | 15 | To install specific packages from Bioconductor use, for example: 16 | 17 | ``` 18 | BiocManager::install(c("pasilla", "DEXSeq")) 19 | ``` 20 | 21 | We will provide a list of all packages we will use [here](rnaseq_pkgs.R). 22 | 23 | If you want to see what version of Bioconductor you are using and whether your packages are up to date: 24 | 25 | ``` 26 | BiocManager::version() 27 | BiocManager::valid() 28 | ``` 29 | -------------------------------------------------------------------------------- /rnaseq/rnaseq_exon_usage.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: RNA-seq differential exon usage 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | The [DEXSeq](http://bioconductor.org/packages/release/bioc/html/DEXSeq.html) package offers differential testing of exon usage within each gene. Here we will explore the R code used in a *DEXSeq* analysis. We omit the python calls for preparing the annotation and count tables, but these can be found in the vignette at the above link. The python calls are generally along the lines of: 12 | 13 | ``` 14 | python dexseq_prepare_annotation.py gtffile.gtf dexseq.gff 15 | python dexseq_count.py dexseq.gff sample1.sam sample1.txt 16 | ``` 17 | 18 | Once we have repeated the `dexseq_count` script for each sample, we can read the data into R using the code chunks below. As we are working with pre-prepared data, we first point to these files which live within the *pasilla* package. 
19 | 20 | The *pasilla* package contains counts from an experiment by [Brooks et al](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3032923/) 21 | 22 | We will run DEXSeq on a subset of the genes, for demonstration purposes. 23 | 24 | ```{r} 25 | library("pasilla") 26 | inDir = system.file("extdata", package="pasilla", mustWork=TRUE) 27 | countFiles = list.files(inDir, pattern="fb.txt$", full.names=TRUE) 28 | flattenedFile = list.files(inDir, pattern="gff$", full.names=TRUE) 29 | genesForSubset = read.table(file.path(inDir, "geneIDsinsubset.txt"), 30 | stringsAsFactors=FALSE)[[1]] 31 | ``` 32 | 33 | As in *DESeq2* we use a `sampleTable` to define the samples: 34 | 35 | ```{r} 36 | sampleTable = data.frame( 37 | row.names = c( "treated1", "treated2", "treated3", 38 | "untreated1", "untreated2", "untreated3", "untreated4" ), 39 | condition = c("knockdown", "knockdown", "knockdown", 40 | "control", "control", "control", "control" ), 41 | libType = c( "single-end", "paired-end", "paired-end", 42 | "single-end", "single-end", "paired-end", "paired-end" ) ) 43 | sampleTable 44 | ``` 45 | 46 | We now read the data into a `DEXSeqDataSet` object: 47 | 48 | ```{r message=FALSE} 49 | library("DEXSeq") 50 | dxd = DEXSeqDataSetFromHTSeq( 51 | countFiles, 52 | sampleData=sampleTable, 53 | design= ~ sample + exon + condition:exon, 54 | flattenedfile=flattenedFile ) 55 | ``` 56 | 57 | Subset the genes, for demonstration purposes: 58 | 59 | ```{r} 60 | dxd = dxd[geneIDs( dxd ) %in% genesForSubset,] 61 | ``` 62 | 63 | Now we run the estimation and testing functions: 64 | 65 | ```{r} 66 | dxd = estimateSizeFactors( dxd ) 67 | dxd = estimateDispersions( dxd ) 68 | dxd = testForDEU( dxd ) 69 | dxd = estimateExonFoldChanges( dxd, fitExpToVar="condition") 70 | ``` 71 | 72 | The following code extracts a results table, makes an MA-plot, and draws the expression levels over the exons to highlight differential exon usage: 73 | 74 | ```{r} 75 | dxr = DEXSeqResults( dxd ) 76 | plotMA( dxr, 
cex=0.8 ) 77 | plotDEXSeq( dxr, "FBgn0010909", legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ) 78 | ``` 79 | 80 | Again, drawing the expression levels, now showing the annotated transcripts below: 81 | 82 | ```{r} 83 | plotDEXSeq( dxr, "FBgn0010909", displayTranscripts=TRUE, legend=TRUE, 84 | cex.axis=1.2, cex=1.3, lwd=2 ) 85 | ``` 86 | 87 | For more details on the *DEXSeq* software, see the vignette and the paper, which is linked from the vignette page: 88 | 89 | ```{r eval=FALSE} 90 | browseVignettes("DEXSeq") 91 | ``` 92 | 93 | We conclude by adding the session information: 94 | 95 | ```{r} 96 | sessionInfo() 97 | ``` 98 | 99 | -------------------------------------------------------------------------------- /rnaseq/rnaseq_isoform_cummerbund.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Exploring Cufflinks output with cummeRbund 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | 12 | Here we show the exploratory plots offered by the [cummeRbund](http://www.bioconductor.org/packages/release/bioc/html/cummeRbund.html) package. These plots require loading a directory that contains the results of a [Cufflinks](http://cole-trapnell-lab.github.io/cufflinks/) analysis. Follow the vignette in the above link in order to perform a Cufflinks gene- and isoform-level analysis. From the vignette: 13 | 14 | > CummeRbund begins by re-organizing output files of a cuffdiff analysis, and storing these data in a local SQLite database. CummeRbund indexes the data to speed up access to specific feature data (genes, isoforms, TSS, CDS, etc.), and preserves the various relationships between these features. 
15 | 16 | ```{r message=FALSE} 17 | library(cummeRbund) 18 | myDir <- system.file("extdata", package="cummeRbund") 19 | gtfFile <- system.file("extdata/chr1_snippet.gtf",package="cummeRbund") 20 | ``` 21 | 22 | Read in the prepared Cufflinks files from the directory: 23 | 24 | ```{r warning=FALSE} 25 | cuff <- readCufflinks(dir=myDir,gtfFile=gtfFile,genome="hg19",rebuild=TRUE) 26 | ``` 27 | 28 | Boxplots of expression (FPKM) at the gene and isoform level: 29 | 30 | ```{r} 31 | csBoxplot(genes(cuff)) 32 | csBoxplot(genes(cuff),replicates=TRUE) 33 | csBoxplot(isoforms(cuff),replicates=TRUE) 34 | ``` 35 | 36 | Scatterplot matrix of gene and isoform level expression: 37 | 38 | ```{r} 39 | csScatterMatrix(genes(cuff)) 40 | csScatterMatrix(isoforms(cuff)) 41 | ``` 42 | 43 | Sample dendrograms using Jensen-Shannon distances: 44 | 45 | ```{r} 46 | csDendro(genes(cuff),replicates=TRUE) 47 | csDendro(isoforms(cuff),replicates=TRUE) 48 | ``` 49 | 50 | MA-plot comparing two conditions: 51 | 52 | ```{r} 53 | MAplot(genes(cuff),"hESC","Fibroblasts") 54 | MAplot(isoforms(cuff),"hESC","Fibroblasts") 55 | ``` 56 | 57 | A "volcano plot" matrix. Each volcano plot is the -log10(p-value) over the log fold change. 58 | 59 | ```{r} 60 | csVolcanoMatrix(genes(cuff)) 61 | csVolcanoMatrix(isoforms(cuff)) 62 | ``` 63 | 64 | For all of these functions, see the help pages in the *cummeRbund* package for more details, and check the vignette for a sample workflow. The [Cufflinks homepage](http://cole-trapnell-lab.github.io/cufflinks/) has details about running the pipeline upstream of producing these figures. 
65 | 66 | ```{r eval=FALSE} 67 | browseVignettes("cummeRbund") 68 | ``` 69 | -------------------------------------------------------------------------------- /rnaseq/rnaseq_pkgs.R: -------------------------------------------------------------------------------- 1 | # 2 CRAN packages 2 | cranpkgs <- c("ggplot2","pheatmap") 3 | install.packages(cranpkgs) 4 | 5 | # rafalib from github (not strictly necessary, but useful for plots) 6 | install.packages("devtools") 7 | library(devtools) 8 | install_github("ririzarr/rafalib") 9 | 10 | # the rest are Bioconductor packages 11 | biocpkgs <- c("Rsamtools", 12 | "GenomicFeatures", 13 | "GenomicAlignments", 14 | "Rsubread", 15 | "airway", 16 | "pasilla", 17 | "DESeq2", 18 | "DEXSeq", 19 | "vsn", 20 | "sva", 21 | "org.Hs.eg.db", 22 | "cummeRbund", 23 | "pasillaBamSubset", 24 | "TxDb.Dmelanogaster.UCSC.dm3.ensGene") 25 | BiocManager::install(biocpkgs) 26 | # note that Rsubread does not have a binary for Windows. This package is not required for class. 
27 | -------------------------------------------------------------------------------- /rnaseq/storage/RNAseq_quiz.R: -------------------------------------------------------------------------------- 1 | link <- "http://bowtie-bio.sourceforge.net/recount/ExpressionSets/wang_eset.RData" 2 | if (!file.exists("wang_eset.RData")) download.file(link, "wang_eset.RData") 3 | load("wang_eset.RData") 4 | 5 | library(Biobase) 6 | library(GenomicRanges) 7 | # the SimpleList part is only necessary for Bioc <= 2.13 8 | se <- SummarizedExperiment(SimpleList(counts = exprs(wang.eset))) 9 | colData(se) <- DataFrame(pData(wang.eset)) 10 | 11 | table(colData(se)$cell.type) 12 | 13 | tissues <- c("cerebellum","breast","colon","heart","liver","skeletal.muscle") 14 | se <- se[,colData(se)$cell.type %in% tissues] 15 | 16 | table(colData(se)$cell.type) 17 | 18 | test <- colData(se)$cell.type == "cerebellum" 19 | lvls <- c("not","cerebellum") 20 | condition <- factor(ifelse(test,"cerebellum","not"), levels=lvls) 21 | colData(se)$condition <- condition 22 | 23 | library(DESeq2) 24 | dds <- DESeqDataSet( se, design = ~ condition ) 25 | dds <- DESeq( dds ) 26 | res <- results( dds ) 27 | 28 | res[order(res$pvalue)[1:10],] 29 | 30 | # Bioc 2.13 baseMean log2FoldChange lfcSE stat pvalue 31 | # ENSG00000143858 1727.5839 7.937566 0.2794747 28.40173 1.924967e-177 32 | # ENSG00000176749 733.8875 6.352004 0.2298648 27.63366 4.387200e-168 33 | # ENSG00000187730 2766.0323 10.587430 0.3875275 27.32046 2.423703e-164 34 | # ENSG00000161509 1229.6361 8.146916 0.3313924 24.58390 1.878103e-133 35 | # ENSG00000170616 1288.3151 9.776143 0.4349104 22.47852 6.734571e-112 36 | 37 | # Bioc 2.14 baseMean log2FoldChange lfcSE stat pvalue 38 | # ENSG00000143858 1727.5839 8.003262 0.3027984 26.43099 6.035903e-154 39 | # ENSG00000176749 733.8875 6.391632 0.2502764 25.53830 7.407303e-144 40 | # ENSG00000187730 2766.0323 10.767926 0.4252847 25.31933 1.956629e-141 41 | # ENSG00000161509 1229.6361 8.244928 0.3546406 
23.24868 1.466793e-119 42 | # ENSG00000170616 1288.3151 9.988089 0.4793187 20.83809 1.954636e-96 43 | 44 | 45 | top <- rownames(res)[order(res$pvalue)[1:10]] 46 | top[1] 47 | 48 | stripchart(log10(counts(dds,normalized=TRUE)[top[1],] + 1) ~ se$condition, 49 | vertical=TRUE, method="jitter") 50 | 51 | # averages for each group 52 | tapply(counts(dds,normalized=TRUE)[top[2],], 53 | colData(dds)$condition, 54 | mean) 55 | 56 | library(org.Hs.eg.db) 57 | keytypes(org.Hs.eg.db) 58 | columns(org.Hs.eg.db) 59 | 60 | # gene names 61 | map <- select(org.Hs.eg.db, keys=top, 62 | columns=c("SYMBOL", "GENENAME"), keytype="ENSEMBL") 63 | map 64 | 65 | map <- select(org.Hs.eg.db, keys=top[3], 66 | columns=c("GO"), keytype="ENSEMBL") 67 | 68 | # The following gives the GO terms: 69 | map$GO 70 | 71 | # this gives the meaning of these: 72 | library(GO.db) 73 | as.list(GOTERM[map$GO]) 74 | -------------------------------------------------------------------------------- /rnaseq/storage/cufflinks.txt: -------------------------------------------------------------------------------- 1 | We will work with the Hammer et al dataset, as prepared by the ReCount 2 | website. 
3 | 4 | http://bowtie-bio.sourceforge.net/recount/ 5 | 6 | The Hammer et al paper: 7 | 8 | http://www.ncbi.nlm.nih.gov/pubmed?term=20452967 9 | 10 | The GEO page: 11 | 12 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE20895 13 | 14 | The sample I will align: 15 | 16 | http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM539553 17 | 18 | which points to the SRA: 19 | 20 | ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX%2FSRX020%2FSRX020088/SRR042499/ 21 | 22 | fastq-dump SRR042499.sra 23 | 24 | The genome was downloaded from Illumina iGenomes 25 | 26 | http://cufflinks.cbcb.umd.edu/igenomes.html 27 | 28 | The tophat call to align the reads: 29 | 30 | tophat2 -o tophat_out -p 10 /path/to/Rattus_norvegicus/Ensembl/RGSC3.4/Sequence/Bowtie2Index/genome SRR042499.fastq 31 | 32 | cufflinks -o cufflinks -p 10 --GTF-guide /path/to/Rattus_norvegicus/Ensembl/RGSC3.4/Annotation/Genes/genes.gtf \ 33 | tophat_out/accepted_hits.bam 34 | 35 | grep -v 'FPKM "0.0000000000"' transcripts.gtf | less 36 | 37 | For visualizing: 38 | 39 | ftp://ftp.ensembl.org/pub/release-69/fasta/rattus_norvegicus/dna/Rattus_norvegicus.RGSC3.4.69.dna.chromosome.1.fa.gz 40 | ftp://ftp.ensembl.org/pub/release-69/gtf/rattus_norvegicus/Rattus_norvegicus.RGSC3.4.69.gtf.gz 41 | -------------------------------------------------------------------------------- /rnaseq/trancsript_align_RSEM.md: -------------------------------------------------------------------------------- 1 | # RSEM transcript alignment 2 | 3 | RSEM homepage: 4 | 5 | http://deweylab.biostat.wisc.edu/rsem/ 6 | 7 | RSEM paper: 8 | 9 | http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3163565/ 10 | 11 | prepare-reference help: 12 | 13 | http://deweylab.biostat.wisc.edu/rsem/rsem-prepare-reference.html 14 | 15 | calculate-expression help: 16 | 17 | http://deweylab.biostat.wisc.edu/rsem/rsem-calculate-expression.html 18 | 19 | RSEM expects a GTF file with only exons, which are each assigned to a `transcript_id`. 
20 | 21 | Note that we only align to chromosome 1 for demonstration purposes. 22 | 23 | ``` 24 | awk '$3 == "exon"' gtf/Homo_sapiens.GRCh38.79.chrom1.gtf > gtf/Homo_sapiens.GRCh38.79.chrom1.exons.gtf 25 | ``` 26 | 27 | RSEM will then prepare a reference transcriptome against which to align reads. 28 | 29 | ``` 30 | mkdir rsemGenome 31 | rsem-prepare-reference --gtf gtf/Homo_sapiens.GRCh38.79.chrom1.exons.gtf genome/Homo_sapiens.GRCh38.dna.chromosome.1.fa rsemGenome/GRCh38.79.chrom1 32 | ``` 33 | 34 | ``` 35 | rsem-calculate-expression -p 12 --paired-end fastq/SRR1039508_1.fastq fastq/SRR1039508_2.fastq rsemGenome/GRCh38.79.chrom1 SRR1039508 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /robust/ranktest.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Rank tests 4 | --- 5 | 6 | ```{r options, echo=FALSE} 7 | library(knitr) 8 | opts_chunk$set(fig.path=paste0("figure/", sub("(.*).Rmd","\\1",basename(knitr:::knit_concord$get('infile'))), "-")) 9 | ``` 10 | 11 | ## Wilcoxon Rank Sum Test 12 | 13 | We learned how the sample mean and SD are susceptible to outliers. The 14 | t-test is based on these measures and is susceptible as well. The 15 | Wilcoxon rank test (equivalent to the Mann-Whitney test) provides an 16 | alternative. In the code below, we perform a t-test on data for which 17 | the null is true. However, we change two observations by mistake 18 | in the first sample, and the values incorrectly entered are different. 
Here 19 | we see that the t-test results in a small p-value, while the Wilcoxon 20 | test does not: 21 | 22 | ```{r} 23 | set.seed(779) ##779 picked for illustration purposes 24 | N=25 25 | x<- rnorm(N,0,1) 26 | y<- rnorm(N,0,1) 27 | ``` 28 | 29 | Create outliers: 30 | 31 | ```{r} 32 | x[1] <- 5 33 | x[2] <- 7 34 | cat("t-test pval:",t.test(x,y)$p.value) 35 | cat("Wilcox test pval:",wilcox.test(x,y)$p.value) 36 | ``` 37 | 38 | The basic idea is to 1) combine all the data, 2) turn the values into ranks, 3) separate them back into their groups, and 4) compute the sum or average rank and perform a test. 39 | 40 | ```{r rank-test-illustration, fig.cap="Data from two populations with two outliers. The left plot shows the original data and the right plot shows their ranks. The numbers are the w values ",fig.width=10.5,fig.height=5.25} 41 | library(rafalib) 42 | mypar(1,2) 43 | 44 | stripchart(list(x,y),vertical=TRUE,ylim=c(-7,7),ylab="Observations",pch=21,bg=1) 45 | abline(h=0) 46 | 47 | xrank<-rank(c(x,y))[seq(along=x)] 48 | yrank<-rank(c(x,y))[-seq(along=x)] 49 | 50 | stripchart(list(xrank,yrank),vertical=TRUE,ylab="Ranks",pch=21,bg=1,cex=1.25) 51 | 52 | ws <- sapply(x,function(z) rank(c(z,y))[1]-1) 53 | text( rep(1.05,length(ws)), xrank, ws, cex=0.8) 54 | W <-sum(ws) 55 | ``` 56 | 57 | `W` is the sum of the ranks for the first group relative to the second 58 | group. We can compute an exact p-value for $W$ based on 59 | combinatorics. We can also use the CLT since 60 | statistical theory tells us that this `W` is approximated by the 61 | normal distribution. We can construct a z-score as follows: 62 | 63 | ```{r} 64 | n1<-length(x);n2<-length(y) 65 | Z <- (mean(ws)-n2/2)/ sqrt(n2*(n1+n2+1)/12/n1) 66 | print(Z) 67 | ``` 68 | 69 | Here the `Z` is not large enough to give us a p-value less 70 | than 0.05. These are part of the calculations performed by the R function 71 | `wilcox.test`. 
72 | 73 | -------------------------------------------------------------------------------- /variants/SNP_quiz.R: -------------------------------------------------------------------------------- 1 | library(VariantTools) 2 | library(LungCancerLines) 3 | library(BSgenome.Hsapiens.UCSC.hg19) 4 | genome <- gmapR::TP53Genome() 5 | bams <- LungCancerLines::LungCancerBamFiles() 6 | bam <- bams$H1993 7 | 8 | tally.param <- TallyVariantsParam(genome, 9 | high_base_quality = 23L) 10 | call23 <- callVariants(bam, tally.param) 11 | length(call23) 12 | mean(mcols(call23)$raw.count) 13 | 14 | # what's the average raw count with higher filter on quality 15 | tally.param <- TallyVariantsParam(genome, 16 | high_base_quality = 32L) 17 | call32 <- callVariants(bam, tally.param) 18 | length(call32) 19 | mean(mcols(call32)$raw.count) 20 | 21 | 22 | 23 | 24 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 25 | txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 26 | library(VariantAnnotation) 27 | fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") 28 | vcf <- readVcf(fl, genome="hg19") 29 | seqlevels(vcf) <- paste0("chr", seqlevels(vcf)) 30 | 31 | 32 | # what is the gene whose promoter overlaps the first variant in vcf? 33 | loc <- locateVariants(vcf, txdb, PromoterVariants()) 34 | loc 35 | 36 | # ?promoters tells us that the promoter is 2000 bp upstream from the gene 37 | # start and 200 bp downstream. note that the gene start is the TSS 38 | 39 | # how far is this variant from this gene? 40 | rowData(vcf[197]) 41 | g <- genes(txdb) 42 | idx <- as.character(mcols(g)$gene_id) == "79174" 43 | g[idx] 44 | 45 | # this gives us the distance to the gene start 46 | distance(rowData(vcf[197]), flank(g[idx], 0)) 47 | 48 | # it gives the same as this... 49 | distance(rowData(vcf[197]), g[idx]) 50 | 51 | # however, theoretically if the variant was less 52 | # than 200 bp downstream from the TSS, and the 53 | # gene was very short, the variant could end up 54 | # closer to the gene end. 
so the first line of code 55 | # is safer. --------------------------------------------------------------------------------