├── imgs
├── PCA-plot.png
├── GWAS_qqplot.png
├── LDheatmap.png
├── regionalplot.png
├── GWAS_Manhattan.png
└── HDL-transformation.png
├── R
├── map2gene.R
├── globals.R
├── GWAS_ManhattanFunction.R
└── GWAA.R
├── README.md
├── Introduction.md
├── Data-generation.md
├── GWAS-analysis.md
├── Post-analytic-visualization-and-genomic-interrogation.md
└── Data-pre-processing.md
/imgs/PCA-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AAlhendi1707/GWAS/HEAD/imgs/PCA-plot.png
--------------------------------------------------------------------------------
/imgs/GWAS_qqplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AAlhendi1707/GWAS/HEAD/imgs/GWAS_qqplot.png
--------------------------------------------------------------------------------
/imgs/LDheatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AAlhendi1707/GWAS/HEAD/imgs/LDheatmap.png
--------------------------------------------------------------------------------
/imgs/regionalplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AAlhendi1707/GWAS/HEAD/imgs/regionalplot.png
--------------------------------------------------------------------------------
/imgs/GWAS_Manhattan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AAlhendi1707/GWAS/HEAD/imgs/GWAS_Manhattan.png
--------------------------------------------------------------------------------
/imgs/HDL-transformation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AAlhendi1707/GWAS/HEAD/imgs/HDL-transformation.png
--------------------------------------------------------------------------------
/R/map2gene.R:
--------------------------------------------------------------------------------
1 | # ---- map2gene ----
2 | # Returns the subset of SNPs that are within extend.boundary of gene
3 | # using the coords table of gene locations.
4 | map2gene <- function(gene, coords, SNPs, extend.boundary = 5000) {
5 | coordsSub <- coords[coords$gene == gene,] #Subset coordinate file for spcified gene
6 |
7 | coordsSub$start <- coordsSub$start - extend.boundary # Extend gene boundaries
8 | coordsSub$stop <- coordsSub$stop + extend.boundary
9 |
10 | SNPsub <- SNPs[SNPs$position >= coordsSub$start & SNPs$position <= coordsSub$stop &
11 | SNPs$chr == coordsSub$chr,] #Subset for SNPs in gene
12 |
13 | return(data.frame(SNPsub, gene = gene, stringsAsFactors = FALSE))
14 | }
15 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GWAS
2 | Genome-wide Association Study Tutorial
3 |
4 |
5 | ### Contents:
6 |
7 | - [Introduction](https://github.com/AAlhendi1707/GWAS/blob/master/Introduction.md)
8 | - [Data pre-processing - steps 1-4](https://github.com/AAlhendi1707/GWAS/blob/master/Data-pre-processing.md)
9 | - [Data generation - steps 5-6](https://github.com/AAlhendi1707/GWAS/blob/master/Data-generation.md)
10 | - [GWAS analysis - steps 7-8](https://github.com/AAlhendi1707/GWAS/blob/master/GWAS-analysis.md)
11 | - [Post-analytic visualization and-genomic interrogation - steps 9-10](https://github.com/AAlhendi1707/GWAS/blob/master/Post-analytic-visualization-and-genomic-interrogation.md)
12 |
13 |
14 | Materials from [Reed et al. (2015)](https://doi.org/10.1002/sim.6605).
15 |
16 |
17 |
--------------------------------------------------------------------------------
/R/globals.R:
--------------------------------------------------------------------------------
1 | # ---- globals ----
2 | # Customize as needed for file locations
3 |
4 | # Modify data.dir to indicate the location of the GWAStutorial files
5 | # Intermediate data files will also be stored in this same location unless you set out.dir
6 |
7 | # Intermediate data files will also be stored in this same location unless you set out.dir
8 | data.dir <- '/scratch/spectre/a/asna4/GWAS'
9 | out.dir <- '/scratch/spectre/a/asna4/GWAS/out'
10 |
11 | gwas.fn <- lapply( c (bed='bed', bim='bim', fam='fam' ,gds='gds'), function (n) sprintf ("%s/GWAStutorial.%s", data.dir, n))
12 | clinical.fn <- sprintf("%s/GWAStutorial_clinical.csv", data.dir)
13 | onethou.fn = lapply(c(info='info' ,ped='ped'), function(n) sprintf("%s/chr16_1000g_CEU.%s", data.dir, n))
14 | protein.coding.coords.fname <- sprintf ("%s/ProCodgene_coords.csv", data.dir)
15 |
16 | # Output files
17 | gwaa.fname <- sprintf ("%s/GWAStutorialout.txt ", out.dir)
18 | gwaa.unadj.fname <- sprintf ("%s/GWAStutorialoutUnadj.txt", out.dir)
19 | impute.out.fname <- sprintf ("%s/GWAStutorial_imputationOut.csv", out.dir)
20 | CETP.fname <- sprintf("%s/CETP_GWASout.csv" , out.dir)
21 |
22 | #saving configs
23 | working.data.fname <- function(num) { sprintf("%s/working.%s.Rdata", out.dir, num) }
24 |
--------------------------------------------------------------------------------
/Introduction.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 |
4 | This web tutorial is derived from 'A guide to genome-wide association analysis and post-analytic interrogation' (Statistics in Medicine, in review). The tutorial presents fundamental concepts and specific software tools for implementing a complete genome wide association (GWA) analysis, as well as post-analytic visualization and interrogation of potentially novel findings. In this tutorial we use complete GWA data on 1401 individuals from [the PennCATH study of coronary artery disease (CAD).](http://www.ncbi.nlm.nih.gov/pubmed/21239051)
5 |
6 | In the steps to follow we begin by demonstrating a method for downloading necessary R packages and setting global parameters as a means for saving progress while working through a GWA analysis. Next, we include quality control steps for both SNP and sample level filtering. The third section is split into principal component calculation for population stratification in statistical modeling, as well as imputation of non-typed SNPs using 1000 Genomes reference genotype data. We then demonstrate strategies to carry out the GWA analysis on the typed data using basic linear modeling functionality in R, as well as imputed data using functionality contained within the `snpStats` package. Finally, we demonstrate means for post-analytic interrogation, including methods for evaluating the performance of statistical models, as well as visualization of the global and subsetted GWAS output.
7 |
8 | ## Installing necessary packages
9 |
10 | ```r
11 | # Run this once interactively to download and install BioConductor packages and other packages.
12 |
13 | source ("http://bioconductor.org/biocLite.R ")
14 | list.of.packages <- c("snpStats", "SNPRelate","rtracklayer", "biomaRt")
15 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
16 | if(length(new.packages)) biocLite(new.packages)
17 |
18 |
19 | list.of.packages <- c('plyr', 'LDheatmap', 'doParallel', 'ggplot2', 'coin' ,'igraph', 'devtools', 'downloader')
20 | new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
21 | if(length(new.packages)) install.packages(new.packages)
22 |
23 | # GenABEL has moved to CRAN archive. The below command for local installation from CRAN archive.
24 | install.packages("https://cran.r-project.org/src/contrib/Archive/GenABEL/GenABEL_1.7-6.tar.gz",
25 | type = "source", repos = NULL)
26 |
27 |
28 | library(devtools)
29 | install_url("http://cran.r-project.org/src/contrib/Archive/postgwas/postgwas_1.11.tar.gz")
30 |
31 | ```
32 |
33 |
34 | ## Configuring global parameters
35 |
36 | Customize and Run [globals.R](R/globals.R)
37 |
38 | ```r
39 | source("globals.R")
40 |
41 | # Downloading support files
42 | # Download and unzip data needed for this tutorial
43 |
44 | library(downloader)
45 |
46 | download(urlSupport, zipSupport.fn)
47 | unzip(zipSupport.fn, exdir = data.dir)
48 |
49 | ```
50 |
51 |
--------------------------------------------------------------------------------
/R/GWAS_ManhattanFunction.R:
--------------------------------------------------------------------------------
1 | # ---- manhattan ----
2 | # Receives a data.frame of SNPs with Neg_logP, chr, position, and type.
3 | # Plots Manhattan plot with significant SNPs highlighted.
4 | GWAS_Manhattan <- function(GWAS, col.snps=c("black","gray"),
5 | col.detected=c("black"), col.imputed=c("blue"), col.text="black",
6 | title="GWAS Tutorial Manhattan Plot", display.text=TRUE,
7 | bonferroni.alpha=0.05, bonferroni.adjustment=1000000,
8 | Lstringent.adjustment=10000) {
9 |
10 | bonferroni.thresh <- -log10(bonferroni.alpha / bonferroni.adjustment)
11 | Lstringent.thresh <- -log10(bonferroni.alpha / Lstringent.adjustment)
12 | xscale <- 1000000
13 |
14 | manhat <- GWAS[!grepl("[A-z]",GWAS$chr),]
15 |
16 | #sort the data by chromosome and then location
17 | manhat.ord <- manhat[order(as.numeric(manhat$chr),manhat$position),]
18 | manhat.ord <- manhat.ord[!is.na(manhat.ord$position),]
19 |
20 | ##Finding the maximum position for each chromosome
21 | max.pos <- sapply(1:21, function(i) { max(manhat.ord$position[manhat.ord$chr==i],0) })
22 | max.pos2 <- c(0, cumsum(max.pos))
23 |
24 | #Add spacing between chromosomes
25 | max.pos2 <- max.pos2 + c(0:21) * xscale * 10
26 |
27 | #defining the positions of each snp in the plot
28 | manhat.ord$pos <- manhat.ord$position + max.pos2[as.numeric(manhat.ord$chr)]
29 |
30 | # alternate coloring of chromosomes
31 | manhat.ord$col <- col.snps[1 + as.numeric(manhat.ord$chr) %% 2]
32 |
33 | # draw the chromosome label roughly in the middle of each chromosome band
34 | text.pos <- sapply(c(1:22), function(i) { mean(manhat.ord$pos[manhat.ord$chr==i]) })
35 |
36 | # Plot the data
37 | plot(manhat.ord$pos[manhat.ord$type=="typed"]/xscale, manhat.ord$Neg_logP[manhat.ord$type=="typed"],
38 | pch=20, cex=.3, col= manhat.ord$col[manhat.ord$type=="typed"], xlab=NA,
39 | ylab="Negative Log P-value", axes=F, ylim=c(0,max(manhat$Neg_logP)+1))
40 | #Add x-label so that it is close to axis
41 | mtext(side = 1, "Chromosome", line = 1.25)
42 |
43 | points(manhat.ord$pos[manhat.ord$type=="imputed"]/xscale, manhat.ord$Neg_logP[manhat.ord$type=="imputed"],
44 | pch=20, cex=.4, col = col.imputed)
45 |
46 | points(manhat.ord$pos[manhat.ord$type=="typed"]/xscale, manhat.ord$Neg_logP[manhat.ord$type=="typed"],
47 | pch=20, cex=.3, col = manhat.ord$col[manhat.ord$type=="typed"])
48 |
49 | axis(2)
50 | abline(h=0)
51 |
52 | SigNifSNPs <- as.character(GWAS[GWAS$Neg_logP > Lstringent.thresh & GWAS$type=="typed", "SNP"])
53 |
54 | #Add legend
55 | legend("topright",c("Bonferroni corrected threshold (p = 5E-8)", "Candidate threshold (p = 5E-6)"),
56 | border="black", col=c("gray60", "gray60"), pch=c(0, 0), lwd=c(1,1),
57 | lty=c(1,2), pt.cex=c(0,0), bty="o", cex=0.6)
58 |
59 | #Add chromosome number
60 | text(text.pos/xscale, -.3, seq(1,22,by=1), xpd=TRUE, cex=.8)
61 |
62 | #Add bonferroni line
63 | abline(h=bonferroni.thresh, untf = FALSE, col = "gray60")
64 |
65 | #Add "less stringent" line
66 | abline(h=Lstringent.thresh, untf = FALSE, col = "gray60", lty = 2 )
67 |
68 | #Plotting detected genes
69 | #Were any genes detected?
70 | if (length(SigNifSNPs)>0){
71 |
72 | sig.snps <- manhat.ord[,'SNP'] %in% SigNifSNPs
73 |
74 | points(manhat.ord$pos[sig.snps]/xscale,
75 | manhat.ord$Neg_logP[sig.snps],
76 | pch=20,col=col.detected, bg=col.detected,cex=0.5)
77 |
78 | text(manhat.ord$pos[sig.snps]/xscale,
79 | manhat.ord$Neg_logP[sig.snps],
80 | as.character(manhat.ord[sig.snps,1]), col=col.text, offset=1, adj=-.1, cex=.5)
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/R/GWAA.R:
--------------------------------------------------------------------------------
1 | # ---- gwaa ----
2 |
3 | # Genome-wide Association Analysis
4 | # Parallel implementation of linear model fitting on each SNP
5 |
6 | GWAA <- function(genodata=genotypes, phenodata=phenotypes, family = gaussian, filename=NULL,
7 | append=FALSE, workers=getOption("mc.cores",2L), flip=TRUE,
8 | select.snps=NULL, hosts=NULL, nSplits=10)
9 | {
10 | if (!require(doParallel)) { stop("Missing doParallel package") }
11 |
12 | #Check that a filename was specified
13 | if(is.null(filename)) stop("Must specify a filename for output.")
14 |
15 | #Check that the genotype data is of class 'SnpMatrix'
16 | if( class(genodata)!="SnpMatrix") stop("Genotype data must of class 'SnpMatrix'.")
17 |
18 | #Check that there is a variable named 'phenotype' in phenodata table
19 | if( !"phenotype" %in% colnames(phenodata)) stop("Phenotype data must have column named 'phenotype'")
20 |
21 | #Check that there is a variable named 'id' in phenodata table
22 | if( !"id" %in% colnames(phenodata)) stop("Phenotype data must have column named 'id'.")
23 |
24 | #If a vector of SNPs is given, subset genotype data for these SNPs
25 | if(!is.null(select.snps)) genodata<-genodata[,which(colnames(genodata)%in%select.snps)]
26 |
27 | #Check that there are still SNPs in 'SnpMatrix' object
28 | if(ncol(genodata)==0) stop("There are no SNPs in the 'SnpMatrix' object.")
29 |
30 | #Print the number of SNPs to be checked
31 | cat(paste(ncol(genodata), " SNPs included in analysis.\n"))
32 |
33 | #If append=FALSE than we will overwrite file with column names
34 | if(!isTRUE(append)) {
35 | columns<-c("SNP", "Estimate", "Std.Error", "t-value", "p-value")
36 | write.table(t(columns), filename, row.names=FALSE, col.names=FALSE, quote=FALSE)
37 | }
38 |
39 | # Check sample counts
40 | if (nrow(phenodata) != nrow(genodata)) {
41 | warning("Number of samples mismatch. Using subset found in phenodata.")
42 | }
43 |
44 | # Order genodata rows to be the same as phenodata
45 | genodata <- genodata[phenodata$id,]
46 |
47 | cat(nrow(genodata), "samples included in analysis.\n")
48 |
49 | # Change which allele is counted (major or minor)
50 | flip.matrix<-function(x) {
51 | zero2 <- which(x==0)
52 | two0 <- which(x==2)
53 | x[zero2] <- 2
54 | x[two0] <- 0
55 | return(x)
56 | }
57 |
58 | nSNPs <- ncol(genodata)
59 | genosplit <- ceiling(nSNPs/nSplits) # number of SNPs in each subset
60 |
61 | snp.start <- seq(1, nSNPs, genosplit) # index of first SNP in group
62 | snp.stop <- pmin(snp.start+genosplit-1, nSNPs) # index of last SNP in group
63 |
64 | if (is.null(hosts)) {
65 | # On Unix this will use fork and mclapply. On Windows it
66 | # will create multiple processes on localhost.
67 | cl <- makeCluster(workers)
68 | } else {
69 | # The listed hosts must be accessible by the current user using
70 | # password-less ssh with R installed on all hosts, all
71 | # packages installed, and "rscript" is in the default PATH.
72 | # See docs for makeCluster() for more information.
73 | cl <- makeCluster(hosts, "PSOCK")
74 | }
75 | show(cl) # report number of workers and type of parallel implementation
76 | registerDoParallel(cl)
77 |
78 | foreach (part=1:nSplits) %do% {
79 | # Returns a standar matrix of the alleles encoded as 0, 1 or 2
80 | genoNum <- as(genodata[,snp.start[part]:snp.stop[part]], "numeric")
81 |
82 | # Flip the numeric values of genotypes to count minor allele
83 | if (isTRUE(flip)) genoNum <- flip.matrix(genoNum)
84 |
85 | # For each SNP, concatenate the genotype column to the
86 | # phenodata and fit a generalized linear model
87 | rsVec <- colnames(genoNum)
88 | res <- foreach(snp.name=rsVec, .combine='rbind') %dopar% {
89 | a <- summary(glm(phenotype~ . - id, family=family, data=cbind(phenodata, snp=genoNum[,snp.name])))
90 | a$coefficients['snp',]
91 | }
92 |
93 | # write results so far to a file
94 | write.table(cbind(rsVec,res), filename, append=TRUE, quote=FALSE, col.names=FALSE, row.names=FALSE)
95 |
96 | cat(sprintf("GWAS SNPs %s-%s (%s%% finished)\n", snp.start[part], snp.stop[part], 100*part/nSplits))
97 | }
98 |
99 | stopCluster(cl)
100 |
101 | return(print("Done."))
102 | }
103 |
104 |
--------------------------------------------------------------------------------
/Data-generation.md:
--------------------------------------------------------------------------------
1 | # Data generation
2 |
3 | ## Re-compute PCA for modeling - Step 5
4 |
5 | Now that we have performed SNP and sample level quality control on our genotype data, we will calculate principal components to be included as covariates in the GWA models. These serve to adjust for any remaining substructure that may confound SNP level association. As with Ancestry filtering we will calculate PCs using the `snpgdsPCA` function from SNPRelate, after performing LD pruning once again on the filtered genotype data set. In this example, we will include the first 10 principal components in GWA models.
6 |
7 | ```r
8 | #Set LD threshold to 0.2
9 | ld.thresh <- 0.2
10 |
11 | set.seed(1000)
12 | geno.sample.ids <- rownames(genotype)
13 | snpSUB <- snpgdsLDpruning(genofile, ld.threshold = ld.thresh,
14 | sample.id = geno.sample.ids, # Only analyze the filtered samples
15 | snp.id = colnames(genotype)) # Only analyze the filtered SNPs
16 | ```
17 | ```
18 | ## SNP pruning based on LD:
19 | ## Excluding 204583 SNPs on non-autosomes
20 | ## Excluding 0 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
21 | ## Working space: 1401 samples, 656890 SNPs
22 | ## Using 1 (CPU) core
23 | ## Sliding window: 500000 basepairs, Inf SNPs
24 | ## |LD| threshold: 0.2
25 | ## Chromosome 1: 8.23%, 5845/71038
26 | ## Chromosome 3: 8.08%, 4893/60565
27 | ## Chromosome 6: 8.03%, 4352/54176
28 | ## Chromosome 12: 8.56%, 3606/42124
29 | ## Chromosome 21: 9.41%, 1173/12463
30 | ## Chromosome 2: 7.66%, 5647/73717
31 | ## Chromosome 4: 8.20%, 4567/55675
32 | ## Chromosome 7: 8.49%, 3939/46391
33 | ## Chromosome 11: 7.89%, 3489/44213
34 | ## Chromosome 10: 7.96%, 3814/47930
35 | ## Chromosome 8: 7.65%, 3694/48299
36 | ## Chromosome 5: 8.04%, 4514/56178
37 | ## Chromosome 14: 8.77%, 2460/28054
38 | ## Chromosome 9: 8.21%, 3374/41110
39 | ## Chromosome 17: 11.14%, 2222/19939
40 | ## Chromosome 13: 8.30%, 2843/34262
41 | ## Chromosome 20: 9.39%, 2137/22753
42 | ## Chromosome 15: 9.23%, 2390/25900
43 | ## Chromosome 16: 9.27%, 2558/27591
44 | ## Chromosome 18: 8.87%, 2327/26231
45 | ## Chromosome 19: 12.99%, 1491/11482
46 | ## Chromosome 22: 10.92%, 1243/11382
47 | ## 72578 SNPs are selected in total.
48 | ```
49 | ```r
50 | snpset.pca <- unlist(snpSUB, use.names=FALSE)
51 | cat(length(snpset.pca),"\n") #72578 SNPs will be used in PCA analysis
52 |
53 | pca <- snpgdsPCA(genofile, sample.id = geno.sample.ids, snp.id = snpset.pca, num.thread=1)
54 | ```
55 | ```
56 | Principal Component Analysis (PCA) on genotypes:
57 | Excluding 788,895 SNPs (non-autosomes or non-selection)
58 | Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
59 | Working space: 1,401 samples, 72,578 SNPs
60 | using 1 (CPU) core
61 | PCA: the sum of all selected genotypes (0,1,2) = 32714193
62 | CPU capabilities: Double-Precision SSE2
63 | Thu Apr 11 20:22:03 2019 (internal increment: 3248)
64 | [==================================================] 100%, completed in 37s
65 | Thu Apr 11 20:22:40 2019 Begin (eigenvalues and eigenvectors)
66 | Thu Apr 11 20:22:41 2019 Done.
67 | ```
68 | ```r
69 | # Find and record first 10 principal components
70 | # pcs will be a N:10 matrix. Each column is a principal component.
71 | pcs <- data.frame(FamID = pca$sample.id, pca$eigenvect[,1 : 10],
72 | stringsAsFactors = FALSE)
73 | colnames(pcs)[2:11]<-paste("pc", 1:10, sep = "")
74 |
75 | print(head(pcs))
76 | ```
77 | ```
78 | ## FamID pc1 pc2 pc3 pc4
79 | ## 1 10002 0.007764870 0.014480384 -0.0006315881 0.0028664643
80 | ## 2 10004 -0.012045108 -0.007231015 -0.0030012896 -0.0107972693
81 | ## 3 10005 -0.016702930 -0.005347697 0.0144498361 -0.0006151058
82 | ## 4 10007 -0.009537235 0.004556977 0.0026835662 0.0166255657
83 | ## 5 10008 -0.015392106 -0.002446933 0.0205087909 -0.0057241772
84 | ## 6 10009 -0.015123858 -0.002353917 0.0213604518 0.0069156529
85 | ## pc5 pc6 pc7 pc8 pc9
86 | ## 1 -0.0188391406 0.009680646 0.0276468057 -0.006645818 -0.023429747
87 | ## 2 -0.0077705400 -0.004645751 0.0018061075 -0.003087891 -0.001833242
88 | ## 3 0.0345170160 0.038708551 0.0205790788 -0.012265508 0.003592690
89 | ## 4 -0.0002363142 0.005514627 0.0159588869 0.027975455 0.029777180
90 | ## 5 -0.0039696226 0.005354244 -0.0007269312 0.027014714 0.010672162
91 | ## 6 0.0400677558 0.023222478 0.0152485234 0.013296852 0.022746352
92 | ## pc10
93 | ## 1 0.010492314
94 | ## 2 -0.004538746
95 | ## 3 -0.002287043
96 | ## 4 -0.007461255
97 | ## 5 -0.003352997
98 | ## 6 0.013143889
99 | ```
100 | ```r
101 | # Close GDS file
102 | closefn.gds(genofile)
103 | ```
104 | ## Genotype imputation - Step 6
105 | In addition to the genotyped SNPs from our study, it is useful to extend the analysis to other known SNPs, that were not typed or were removed by SNP level filtering. In this example, we impute SNPs on chromosome 16.
106 |
107 | Performance of genotype imputation requires reference data, which has typed genotypes at the SNPs of interest from similar homogeneous sample. Sources for this data include HapMap and 1000 Genomes.
108 |
109 | For this example, we will use 1000 Genomes data, read in from .ped and.info using the `read.pedfile` in from snpStats. Note, that the .info file is similar to the .map file. To specify the column in the .info file with the SNP IDs, we use the `which` argument.
110 |
111 | We derive imputation “rules” for the additional SNPs that were not typed in our study using `snp.imputation` based on the genotypes from the 1000 Genomes data. Each rule represents a predictive model for genotypes of untyped SNPs associated with near-by typed SNPs. Using these rules, we can calculate the expected posterior value of the non-typed SNPs using the `impute` function from SNPRelate.
112 |
113 | In the last step we remove un-typed SNPs in which we fail to derive imputation “rules”. We also filter out SNPs that have low estimated minor allele frequency, and low imputation accuracy. The latter is based on the R2 value of the model estimated by the `snp.imputation` function.
114 |
115 | ```r
116 | # Read in 1000g data for given chromosome 16
117 | thougeno <- read.pedfile(onethou.fn$ped, snps = onethou.fn$info, which=1)
118 |
119 | # Obtain genotype data for given chromosome
120 | genoMatrix <- thougeno$genotypes
121 |
122 | # Obtain the chromosome position for each SNP
123 | support <- thougeno$map
124 | colnames(support)<-c("SNP", "position", "A1", "A2")
125 | head(support)
126 | ```
127 | ```
128 | ## SNP position A1 A2
129 | ## 1 rs140769322 60180 3 2
130 | ## 2 rs188810967 60288 2 1
131 | ## 3 rs76368850 60291 2 4
132 | ## 4 rs185537431 60778 3 1
133 | ## 5 rs542544747 60842 2 1
134 | ## 6 rs4021615 61349 1 3
135 | ```
136 | ```r
137 | # Imputation of non-typed 1000g SNPs
138 | presSnps <- colnames(genotype)
139 |
140 | # Subset for SNPs on given chromosome
141 | presSnps <- colnames(genotype)
142 | presDatChr <- genoBim[genoBim$SNP %in% presSnps & genoBim$chr==16, ]
143 | targetSnps <- presDatChr$SNP
144 |
145 | # Subset 1000g data for our SNPs
146 | # "missing" and "present" are snpMatrix objects needed for imputation rules
147 | is.present <- colnames(genoMatrix) %in% targetSnps
148 |
149 | missing <- genoMatrix[,!is.present]
150 | print(missing) # Almost 400,000 SNPs
151 | ```
152 | ```
153 | ## A SnpMatrix with 99 rows and 377819 columns
154 | ## Row names: CEU_1 ... CEU_99
155 | ## Col names: rs140769322 ... rs111706106
156 | ```
157 | ```r
158 | present <- genoMatrix[,is.present]
159 | print(present) # Our typed SNPs
160 | ```
161 | ```
162 | ## A SnpMatrix with 99 rows and 20632 columns
163 | ## Row names: CEU_1 ... CEU_99
164 | ## Col names: rs41340949 ... rs4785775
165 | ```
166 | ```r
167 | # Obtain positions of SNPs to be used for imputation rules
168 | pos.pres <- support$position[is.present]
169 | pos.miss <- support$position[!is.present]
170 |
171 | # Calculate and store imputation rules using snp.imputation()
172 | rules <- snp.imputation(present, missing, pos.pres, pos.miss)
173 | ```
174 | ```
175 | ## SNPs tagged by a single SNP: 82119
176 | ## SNPs tagged by multiple tag haplotypes (saturated model): 115769
177 | ```
178 | ```r
179 | # Remove failed imputations
180 | rules <- rules[can.impute(rules)]
181 | cat("Imputation rules for", length(rules), "SNPs were estimated\n")
182 | # Imputation rules for 197888 SNPs were estimated
183 | ```
184 | ```
185 | ## Imputation rules for 197888 SNPs were estimated
186 | ```
187 |
188 | ```r
189 | # Quality control for imputation certainty and MAF
190 | # Set thresholds
191 | r2threshold <- 0.7
192 | minor <- 0.01
193 |
194 | # Filter on imputation certainty and MAF
195 | rules <- rules[imputation.r2(rules) >= r2threshold]
196 |
197 | cat(length(rules),"imputation rules remain after imputations with low certainty were removed\n")
198 | # 162565 imputation rules remain after imputations with low certainty were removed
199 | ```
200 | ```r
201 | rules <- rules[imputation.maf(rules) >= minor]
202 | cat(length(rules),"imputation rules remain after MAF filtering\n")
203 | # 162565 imputation rules remain after MAF filtering
204 | ```
205 | ```r
206 | # Obtain posterior expectation of genotypes of imputed snps
207 | target <- genotype[,targetSnps]
208 | imputed <- impute.snps(rules, target, as.numeric=FALSE)
209 | print(imputed) # 162565 SNPs were imputed
210 | ```
211 | ```
212 | ## A SnpMatrix with 1401 rows and 162565 columns
213 | ## Row names: 10002 ... 11596
214 | ## Col names: rs560777354;rs80001234 ... rs62053708
215 | ```
216 | ```r
217 | # Free some memory in your R session
218 | rm(genoMatrix)
219 | rm(missing)
220 | rm(present)
221 |
222 | # Add new imputed, target and rules data to saved results
223 | save(genotype, genoBim, clinical, pcs, imputed, target, rules, support, file=working.data.fname(6))
224 |
225 | ```
226 |
227 |
--------------------------------------------------------------------------------
/GWAS-analysis.md:
--------------------------------------------------------------------------------
1 | # Genome-wide association analysis
2 | Now that our data is loaded, filtered, and additional SNP genotypes imputed we are ready to perform genome-wide association analysis. This involves regressing each SNP separately on a given trait, adjusted for sample level clinical, environmental, and demographic factors. Due to the large number of SNPs and the generally uncharacterized relationships to the outcome, a simple single additive model will be employed.
3 |
4 | The `GWAA` function requires two arguments. The `genodata` argument should specify the entire genotype data object in `SnpMatrix` format. The phenodata argument should be a data frame with a column of sample IDs, corresponding to the row names of genodata, and a columns for the continuous outcome variable. These columns must be named “id” and “phenotype”, respectively. In order to fit the model, genotype data is converted to numeric format using the as function from snpStats. The genotypes of each SNP are then coded as continuous, thereby taking on the value of 0, 1, and 2. For this example, we wish for the value of the genotype to reflect the number of minor alleles. However, following conversion our values will reflect the opposite. To fix this a flip.matrix procedure is included in our `GWAA` function, which can be turned on or off using the `flip` argument.
5 |
6 | Due to the large number of models that require fitting, the GWA analysis can be deployed in parallel across multiple processors or machines to reduce the running time. Here we demonstrate two basic methods for performing parallel processing using the doParallel package. This will be carried out differently depending on whether or not the analysis is run on a UNIX based system, though the arguments are the same. The user can specify the number of processes using the `worker` argument (set to 2 by default). Additional arguments include `select.snps` and `nSplits`. The former allows the user to subset the analysis via a vector of SNP IDs. The latter specifies a number of SNP-wise splits that are made to the genotype data. The function runs the GWA analysis on these smaller subsets of the genotype data one at a time. After each subset has finished running the function will print a progress update onto the R console. By default this is set to 10.
7 |
8 | ## Association analysis of typed SNPs - Step 7
9 | First we create a data frame of phenotype features that is the concatenation of clinical features and the first ten principal components. The HDL feature is normalized using a rank-based inverse normal transform. We then remove variables that we are not including in the analysis, i.e. HDL(non-normalized), LDL, TG, and CAD. Finally, we remove samples with missing normalized HDL data.
10 |
11 | ```r
12 | ## restore the data generated from steps 1-6
13 | source("globals.R")
14 |
15 | # load data created in previous snippets
16 | load(working.data.fname(6))
17 |
18 | ## Require GenABEL and GWAA function
19 | library(GenABEL)
20 | source("https://github.com/AAlhendi1707/GWAS/blob/master/R/GWAA.R?raw=true")
21 |
22 | # Merge clincal data and principal components to create phenotype table
23 | phenoSub <- merge(clinical,pcs) # data.frame => [ FamID CAD sex age hdl pc1 pc2 ... pc10 ]
24 |
25 | # We will do a rank-based inverse normal transformation of hdl
26 | phenoSub$phenotype <- rntransform(phenoSub$hdl, family="gaussian")
27 |
28 | # Show that the assumptions of normality met after transformation
29 | par(mfrow=c(1,2))
30 | hist(phenoSub$hdl, main="Histogram of HDL", xlab="HDL")
31 | hist(phenoSub$phenotype, main="Histogram of Tranformed HDL", xlab="Transformed HDL")
32 | ```
33 |
34 |
35 | ```r
36 | # Remove unnecessary columns from table
37 | phenoSub$hdl <- NULL
38 | phenoSub$ldl <- NULL
39 | phenoSub$tg <- NULL
40 | phenoSub$CAD <- NULL
41 |
42 | # Rename columns to match names necessary for GWAS() function
43 | colnames(phenoSub)[1]<- "id"
44 |
45 | # Include only subjects with hdl data
46 | phenoSub<-phenoSub[!is.na(phenoSub$phenotype),]
47 | # 1309 subjects included with phenotype data
48 |
49 | print(head(phenoSub))
50 | ```
51 | ```
52 | ## id sex age pc1 pc2 pc3 pc4
53 | ## 2 10004 2 50 -0.012045108 -0.007231015 -0.003001290 -0.0107972693
54 | ## 3 10005 1 55 -0.016702930 -0.005347697 0.014449836 -0.0006151058
55 | ## 4 10007 1 52 -0.009537235 0.004556977 0.002683566 0.0166255657
56 | ## 5 10008 1 58 -0.015392106 -0.002446933 0.020508791 -0.0057241772
57 | ## 6 10009 1 59 -0.015123858 -0.002353917 0.021360452 0.0069156529
58 | ## 7 10010 1 54 -0.012816157 0.005126124 0.014654847 -0.0147082270
59 | ## pc5 pc6 pc7 pc8 pc9
60 | ## 2 -0.0077705400 -0.0046457510 0.0018061075 -0.003087891 -0.001833242
61 | ## 3 0.0345170160 0.0387085513 0.0205790788 -0.012265508 0.003592690
62 | ## 4 -0.0002363142 0.0055146271 0.0159588869 0.027975455 0.029777180
63 | ## 5 -0.0039696226 0.0053542437 -0.0007269312 0.027014714 0.010672162
64 | ## 6 0.0400677558 0.0232224781 0.0152485234 0.013296852 0.022746352
65 | ## 7 -0.0008190769 -0.0003831342 -0.0131606658 -0.013647709 -0.008912913
66 | ## pc10 phenotype
67 | ## 2 -0.004538746 -2.2877117
68 | ## 3 -0.002287043 -0.4749316
69 | ## 4 -0.007461255 0.8855512
70 | ## 5 -0.003352997 -0.1644639
71 | ## 6 0.013143889 0.3938940
72 | ## 7 -0.056187339 1.7109552
73 | ```
74 | ### Parallel model fitting
75 | Using this phenotype data, we perform model fitting on each of the typed SNPs in the `genotype` object and write the results to a *.txt* file.
76 |
77 | ```r
78 | # Run GWAS analysis
79 | # Note: This function writes a file, but does not produce an R object
80 | start <- Sys.time()
81 | GWAA(genodata=genotype, phenodata=phenoSub, filename=gwaa.fname)
82 | end <- Sys.time()
83 | print(end-start)
84 | ```
85 | ```
86 | ## Loading required package: doParallel
87 | ## Loading required package: foreach
88 | ## Loading required package: iterators
89 | ## Loading required package: parallel
90 |
91 | ## 656890 SNPs included in analysis.
92 | ## 1309 samples included in analysis.
93 | ## socket cluster with 2 nodes on host 'localhost'
94 | ## GWAS SNPs 1-65689 (10% finished)
95 | ## GWAS SNPs 65690-131378 (20% finished)
96 | ## GWAS SNPs 131379-197067 (30% finished)
97 | ## GWAS SNPs 197068-262756 (40% finished)
98 | ## GWAS SNPs 262757-328445 (50% finished)
99 | ## GWAS SNPs 328446-394134 (60% finished)
100 | ## GWAS SNPs 394135-459823 (70% finished)
101 | ## GWAS SNPs 459824-525512 (80% finished)
102 | ## GWAS SNPs 525513-591201 (90% finished)
103 | ## GWAS SNPs 591202-656890 (100% finished)
104 | ## [1] "Done."
105 | ```
106 | ```
107 | ## Time difference of 2.259378 hours
108 | ```
109 | ```r
110 | # Add phenosub to saved results
111 | save(genotype, genoBim, clinical, pcs, imputed, target, rules, phenoSub, support, file=working.data.fname(7))
112 | ```
113 |
114 | ## Association analysis of imputed SNPs - Step 8
115 | ### Model fitting of non-typed SNPs
116 | We also perform association testing on additional SNPs from genotype imputation. Here we use thesnp.rhs.tests function from `snpStats` to perform the analysis based on the imputation “rules” we calculated previously. We need to specify the variables from the `phenoSub` data frame that we are including in the model with row names corresponding to the sample IDs.
117 |
118 | The resulting SNPs are combined with the chromosome position information to create a table of SNPs, location and p-value. Finally, we take *−log10* of the p-value for plotting.
119 |
120 | ```r
121 | # Carry out association testing for imputed SNPs using snp.rhs.tests()
122 | rownames(phenoSub) <- phenoSub$id
123 |
124 | imp <- snp.rhs.tests(phenotype ~ sex + age + pc1 + pc2 + pc3 + pc4 + pc5 + pc6 + pc7 + pc8 + pc9 + pc10,
125 | family = "Gaussian", data = phenoSub, snp.data = target, rules = rules)
126 |
127 | # Obtain p values for imputed SNPs by calling methods on the returned GlmTests object.
128 | results <- data.frame(SNP = imp@snp.names, p.value = p.value(imp), stringsAsFactors = FALSE)
129 | results <- results[!is.na(results$p.value),]
130 |
131 | #Write a file containing the results
132 | write.csv(results, impute.out.fname, row.names=FALSE)
133 |
134 | # Merge imputation testing results with support to obtain coordinates
135 | imputeOut<-merge(results, support[, c("SNP", "position")])
136 | imputeOut$chr <- 16
137 |
138 | imputeOut$type <- "imputed"
139 |
140 | # Find the -log_10 of the p-values
141 | imputeOut$Neg_logP <- -log10(imputeOut$p.value)
142 |
143 | # Order by p-value
144 | imputeOut <- arrange(imputeOut, p.value)
145 | print(head(imputeOut))
146 | ```
147 | ```
148 | ## SNP p.value position chr type Neg_logP
149 | ## 1 rs1532624 9.805683e-08 57005479 16 imputed 7.008522
150 | ## 2 rs7205804 9.805683e-08 57004889 16 imputed 7.008522
151 | ## 3 rs12446515 1.430239e-07 56987015 16 imputed 6.844591
152 | ## 4 rs17231506 1.430239e-07 56994528 16 imputed 6.844591
153 | ## 5 rs173539 1.430239e-07 56988044 16 imputed 6.844591
154 | ## 6 rs183130 1.430239e-07 56991363 16 imputed 6.844591
155 | ```
156 | ### Mapping associated SNPs to genes
157 |
158 | Using a separate data file containing the chromosome and coordinate locations of each protein coding gene, we can locate coincident genes and SNPs.
159 | The SNP with the lowest p-value in both the typed and imputed SNP analysis lies within the boundaries of the cholesteryl ester transfer protein gene, CETP. We can call the `map2gene` function for “CETP” to filter the imputed genotypes and extract only those SNPs that are near CETP. This will be used for post-analytic interrogation to follow.
160 |
161 | ```r
162 | source("https://github.com/AAlhendi1707/GWAS/blob/master/R/map2gene.R?raw=true")
163 | # Read in file containing protein coding genes coords
164 | genes <- read.csv(protein.coding.coords.fname, stringsAsFactors = FALSE)
165 |
166 | # Subset for CETP SNPs
167 | impCETP <- map2gene("CETP", coords = genes, SNPs = imputeOut)
168 |
169 | # Filter only the imputed CETP SNP genotypes
170 | impCETPgeno <- imputed[, impCETP$SNP ]
171 | ```
172 |
173 | ```r
174 | ## Save GWAS analysis restuls
175 | save(genotype, genoBim, clinical, pcs, imputed, target, rules,
176 | phenoSub, support, genes, impCETP, impCETPgeno, imputeOut, file = working.data.fname(8))
177 | ```
178 |
--------------------------------------------------------------------------------
/Post-analytic-visualization-and-genomic-interrogation.md:
--------------------------------------------------------------------------------
1 | # Post-analytic visualization and genomic interrogation
2 | We now have generated and fit both typed and imputed genotypes. The next step is to combine the results, and isolate just those SNPs in our region of interest. Following similar steps as for imputed SNPs, the typed SNPs are loaded from a file generated by the `GWAA` function. We follow similar steps to attach chromosome and position to each SNP, order by significance, and take *−log10* of the p-value.
3 |
4 | ## Data Integration
5 | ```r
6 | # Read in GWAS output that was produced by GWAA function
7 | GWASout <- read.table(gwaa.fname, header=TRUE, colClasses=c("character", rep("numeric",4)))
8 |
9 | # Find the -log_10 of the p-values
10 | GWASout$Neg_logP <- -log10(GWASout$p.value)
11 |
12 | # Merge output with genoBim by SNP name to add position and chromosome number
13 | GWASout <- merge(GWASout, genoBim[,c("SNP", "chr", "position")])
14 | rm(genoBim)
15 |
16 | # Order SNPs by significance
17 | GWASout <- arrange(GWASout, -Neg_logP)
18 | print(head(GWASout))
19 | ```
20 | ```
21 | ## SNP Estimate Std.Error t.value p.value Neg_logP chr
22 | ## 1 rs1532625 0.2024060 0.03756207 5.388575 8.452365e-08 7.073022 16
23 | ## 2 rs247617 0.2119357 0.03985979 5.317030 1.243480e-07 6.905361 16
24 | ## 3 rs10945761 0.1856564 0.04093602 4.535282 6.285358e-06 5.201670 6
25 | ## 4 rs3803768 -0.3060086 0.06755628 -4.529685 6.451945e-06 5.190309 17
26 | ## 5 rs4821708 -0.1816673 0.04020915 -4.518058 6.825085e-06 5.165892 22
27 | ## 6 rs9647610 0.1830434 0.04072772 4.494320 7.607161e-06 5.118777 6
28 | ## position
29 | ## 1 57005301
30 | ## 2 56990716
31 | ## 3 162065367
32 | ## 4 80872028
33 | ## 5 38164106
34 | ## 6 162066421
35 | ```
36 | ### Combine typed and imputed
37 | Isolate CETP-specific SNPs
38 | The two tables of typed and imputed genotypes are combined into a single table. In addition, we also concatenate just the SNPs near CETP and display them all here.
39 |
40 | ```r
41 | GWASout$type <- "typed"
42 |
43 | GWAScomb<-rbind.fill(GWASout, imputeOut)
44 | head(GWAScomb)
45 | ```
46 | ```
47 | ## SNP Estimate Std.Error t.value p.value Neg_logP chr
48 | ## 1 rs1532625 0.2024060 0.03756207 5.388575 8.452365e-08 7.073022 16
49 | ## 2 rs247617 0.2119357 0.03985979 5.317030 1.243480e-07 6.905361 16
50 | ## 3 rs10945761 0.1856564 0.04093602 4.535282 6.285358e-06 5.201670 6
51 | ## 4 rs3803768 -0.3060086 0.06755628 -4.529685 6.451945e-06 5.190309 17
52 | ## 5 rs4821708 -0.1816673 0.04020915 -4.518058 6.825085e-06 5.165892 22
53 | ## 6 rs9647610 0.1830434 0.04072772 4.494320 7.607161e-06 5.118777 6
54 | ## position type
55 | ## 1 57005301 typed
56 | ## 2 56990716 typed
57 | ## 3 162065367 typed
58 | ## 4 80872028 typed
59 | ## 5 38164106 typed
60 | ## 6 162066421 typed
61 | ```
62 | ```r
63 | tail(GWAScomb)
64 | ```
65 | ```
66 | ## SNP Estimate Std.Error t.value p.value Neg_logP chr
67 | ## 818521 rs62048372 NA NA NA 0.9999838 7.048600e-06 16
68 | ## 818522 rs8056666 NA NA NA 0.9999838 7.048600e-06 16
69 | ## 818523 rs8057313 NA NA NA 0.9999838 7.048600e-06 16
70 | ## 818524 rs8061812 NA NA NA 0.9999838 7.048600e-06 16
71 | ## 818525 rs9940700 NA NA NA 0.9999838 7.048600e-06 16
72 | ## 818526 rs13334556 NA NA NA 0.9999843 6.825503e-06 16
73 | ## position type
74 | ## 818521 53775940 imputed
75 | ## 818522 53794830 imputed
76 | ## 818523 53794855 imputed
77 | ## 818524 53794856 imputed
78 | ## 818525 53795409 imputed
79 | ## 818526 5463800 imputed
80 | ```
81 | ```r
82 | # Subset for CETP SNPs
83 | typCETP <- map2gene("CETP", coords = genes, SNPs = GWASout)
84 |
85 | # Combine CETP SNPs from imputed and typed analysis
86 | CETP <- rbind.fill(typCETP, impCETP)[,c("SNP","p.value","Neg_logP","chr","position","type","gene")]
87 | print(CETP)
88 | ```
89 | ```
90 | ## SNP p.value Neg_logP chr position type gene
91 | ## 1 rs1532625 8.452365e-08 7.07302173 16 57005301 typed CETP
92 | ## 2 rs289742 3.788738e-04 3.42150548 16 57017762 typed CETP
93 | ## 3 rs289715 4.299934e-03 2.36653823 16 57008508 typed CETP
94 | ## 4 rs6499863 1.382602e-02 1.85930275 16 56992017 typed CETP
95 | ## 5 rs1800777 8.833782e-02 1.05385333 16 57017319 typed CETP
96 | ## 6 rs4783962 1.039467e-01 0.98318933 16 56995038 typed CETP
97 | ## 7 rs12708980 6.375740e-01 0.19546941 16 57012379 typed CETP
98 | ## 8 rs1532624 9.805683e-08 7.00852215 16 57005479 imputed CETP
99 | ## 9 rs7205804 9.805683e-08 7.00852215 16 57004889 imputed CETP
100 | ## 10 rs17231506 1.430239e-07 6.84459142 16 56994528 imputed CETP
101 | ## 11 rs183130 1.430239e-07 6.84459142 16 56991363 imputed CETP
102 | ## 12 rs3764261 1.430239e-07 6.84459142 16 56993324 imputed CETP
103 | ## 13 rs821840 1.430239e-07 6.84459142 16 56993886 imputed CETP
104 | ## 14 rs11508026 1.151771e-06 5.93863373 16 56999328 imputed CETP
105 | ## 15 rs12444012 1.151771e-06 5.93863373 16 57001438 imputed CETP
106 | ## 16 rs12720926 1.151771e-06 5.93863373 16 56998918 imputed CETP
107 | ## 17 rs4784741 1.151771e-06 5.93863373 16 57001216 imputed CETP
108 | ## 18 rs34620476 1.155266e-06 5.93731819 16 56996649 imputed CETP
109 | ## 19 rs708272 1.155266e-06 5.93731819 16 56996288 imputed CETP
110 | ## 20 rs711752 1.155266e-06 5.93731819 16 56996211 imputed CETP
111 | ## 21 rs12720922 3.238664e-06 5.48963411 16 57000885 imputed CETP
112 | ## 22 rs8045855 3.238664e-06 5.48963411 16 57000696 imputed CETP
113 | ## 23 rs12149545 3.245934e-06 5.48866029 16 56993161 imputed CETP
114 | ## 24 rs11076175 1.400697e-05 4.85365587 16 57006378 imputed CETP
115 | ## 25 rs7499892 1.400697e-05 4.85365587 16 57006590 imputed CETP
116 | ## 26 rs1800775 1.747444e-05 4.75759678 16 56995236 imputed CETP
117 | ## 27 rs3816117 1.747444e-05 4.75759678 16 56996158 imputed CETP
118 | ## 28 rs11076176 1.089765e-04 3.96266723 16 57007446 imputed CETP
119 | ## 29 rs289714 1.121002e-04 3.95039374 16 57007451 imputed CETP
120 | ## 30 rs158478 2.513994e-04 3.59963575 16 57007734 imputed CETP
121 | ## 31 rs9939224 2.868544e-04 3.54233851 16 57002732 imputed CETP
122 | ## 32 rs12447620 3.868267e-04 3.41248361 16 57014319 imputed CETP
123 | ## 33 rs158480 3.868267e-04 3.41248361 16 57008227 imputed CETP
124 | ## 34 rs158617 3.868267e-04 3.41248361 16 57008287 imputed CETP
125 | ## 35 rs112039804 4.305196e-03 2.36600705 16 57018856 imputed CETP
126 | ## 36 rs12708985 4.305196e-03 2.36600705 16 57014610 imputed CETP
127 | ## 37 rs736274 4.305196e-03 2.36600705 16 57009769 imputed CETP
128 | ## 38 rs11076174 4.439341e-03 2.35268153 16 57003146 imputed CETP
129 | ## 39 rs158479 1.358926e-02 1.86680426 16 57008048 imputed CETP
130 | ## 40 rs201825234 1.392675e-02 1.85615030 16 56991948 imputed CETP
131 | ## 41 rs2115429 1.392675e-02 1.85615030 16 56992842 imputed CETP
132 | ## 42 rs6499861 1.392675e-02 1.85615030 16 56991495 imputed CETP
133 | ## 43 rs6499862 1.392675e-02 1.85615030 16 56991524 imputed CETP
134 | ## 44 rs289713 1.902194e-02 1.72074521 16 57006829 imputed CETP
135 | ## 45 rs12720918 2.238286e-02 1.65008448 16 56994212 imputed CETP
136 | ## 46 rs12920974 2.238286e-02 1.65008448 16 56993025 imputed CETP
137 | ## 47 rs36229787 3.026885e-02 1.51900413 16 56993897 imputed CETP
138 | ## 48 rs820299 4.470355e-02 1.34965802 16 57000284 imputed CETP
139 | ## 49 rs289712 4.529779e-02 1.34392301 16 57006305 imputed CETP
140 | ## 50 rs34946873 5.624406e-02 1.24992336 16 56991143 imputed CETP
141 | ## 51 rs12597002 6.153983e-02 1.21084368 16 57002404 imputed CETP
142 | ## 52 rs60545348 6.153983e-02 1.21084368 16 57001985 imputed CETP
143 | ## 53 rs708273 6.153983e-02 1.21084368 16 56999949 imputed CETP
144 | ## 54 rs4369653 6.333149e-02 1.19838029 16 56997551 imputed CETP
145 | ## 55 rs5880 7.129792e-02 1.14692314 16 57015091 imputed CETP
146 | ## 56 rs4587963 8.354674e-02 1.07807049 16 56997369 imputed CETP
147 | ## 57 rs1800776 9.239564e-02 1.03434852 16 56995234 imputed CETP
148 | ## 58 rs289746 9.693910e-02 1.01350102 16 57020205 imputed CETP
149 | ## 59 rs12447839 1.042017e-01 0.98212538 16 56993935 imputed CETP
150 | ## 60 rs12447924 1.042017e-01 0.98212538 16 56994192 imputed CETP
151 | ## 61 rs158477 1.519849e-01 0.81819960 16 57007610 imputed CETP
152 | ## 62 rs12720889 2.755963e-01 0.55972661 16 57012563 imputed CETP
153 | ## 63 rs12708983 2.772136e-01 0.55718551 16 57014411 imputed CETP
154 | ## 64 rs66495554 2.790835e-01 0.55426586 16 57018636 imputed CETP
155 | ## 65 rs12934552 3.156022e-01 0.50085994 16 57021433 imputed CETP
156 | ## 66 rs12708968 3.597273e-01 0.44402664 16 56994819 imputed CETP
157 | ## 67 rs17245715 3.597273e-01 0.44402664 16 56994990 imputed CETP
158 | ## 68 rs4783961 4.335221e-01 0.36298880 16 56994894 imputed CETP
159 | ## 69 rs12598522 5.138788e-01 0.28913932 16 57022352 imputed CETP
160 | ## 70 rs56315364 5.138788e-01 0.28913932 16 57021524 imputed CETP
161 | ## 71 rs117427818 5.582634e-01 0.25316088 16 57010486 imputed CETP
162 | ## 72 rs36229786 5.721591e-01 0.24248319 16 56993901 imputed CETP
163 | ## 73 rs11860407 6.108898e-01 0.21403710 16 57010828 imputed CETP
164 | ## 74 rs2033254 6.108898e-01 0.21403710 16 57009985 imputed CETP
165 | ## 75 rs1800774 6.293251e-01 0.20112492 16 57015545 imputed CETP
166 | ## 76 rs7405284 6.519531e-01 0.18578366 16 57001275 imputed CETP
167 | ## 77 rs12708974 9.096021e-01 0.04114853 16 57005550 imputed CETP
168 | ```
169 | ```
170 | write.csv(CETP, CETP.fname, row.names=FALSE) # save for future use
171 | ```
172 |
173 | ## Visualization and QC - Step 10
174 |
175 | Several plots allow us both to visualize the GWA analysis findings while performing quality control checks. Specifically, we are interested in identifying data inconsistencies and potential systemic biases.
176 |
177 | ### Manhattan plot
178 | Manhattan plots are used to visual GWA significant results by chromosome location. We will call the `GWAS_Manhattan` function to plot *−log10* of the p-value against SNP position across the entire set of typed and imputed SNPs. The plot will show two horizontal lines. The higher of the two is the commonly used “Bonferroni” adjusted significance cut-off of *−log10*(5×10−8), while the lower is less stringent (“Candidate”) cut-off of *−log10*(5×10−6). Typed and imputed SNPs will be represented by black and blue, respectively. We label the typed SNPs with signals that have surpassed the less stringent cutoff.
179 |
180 |
181 | ```r
182 | source("https://github.com/AAlhendi1707/GWAS/blob/master/R/GWAS_ManhattanFunction.R?raw=true")
183 | # Plots Manhattan plot with significant SNPs highlighted.
184 |
185 | # Create Manhattan Plot
186 | GWAS_Manhattan(GWAScomb)
187 | ```
188 |
189 |
190 | ### Q–Q plots and the 𝜆-statistic
191 | Q-Q plots are used to visualize the relationship between the expected and observed distributions of SNP level test statistics. Here we compare these statistics for the unadjusted model (left) compared with the model adjusted for confounders by incorporating the first ten principal components along with clinical covariates.
192 |
193 | A new set of models is generated with only the phenotype (HDL) and no additional factors. The results are plotted using the GenABEL package's `estlambda` function.
194 |
195 | ```r
196 | # Rerun the GWAS using unadjusted model
197 | phenoSub2 <- phenoSub[,c("id","phenotype")] # remove all extra factors, leave only phenotype
198 |
199 | GWAA(genodata=genotype, phenodata=phenoSub2, filename=gwaa.unadj.fname)
200 | ```
201 | ```
202 | ## 656890 SNPs included in analysis.
203 | ## 1309 samples included in analysis.
204 | ## socket cluster with 2 nodes on host 'localhost'
205 | ## GWAS SNPs 1-65689 (10% finished)
206 | ## GWAS SNPs 65690-131378 (20% finished)
207 | ## GWAS SNPs 131379-197067 (30% finished)
208 | ## GWAS SNPs 197068-262756 (40% finished)
209 | ## GWAS SNPs 262757-328445 (50% finished)
210 | ## GWAS SNPs 328446-394134 (60% finished)
211 | ## GWAS SNPs 394135-459823 (70% finished)
212 | ## GWAS SNPs 459824-525512 (80% finished)
213 | ## GWAS SNPs 525513-591201 (90% finished)
214 | ## GWAS SNPs 591202-656890 (100% finished)
215 | ## [1] "Done."
216 | ```
217 | ```r
218 | GWASoutUnadj <- read.table(gwaa.unadj.fname, header=TRUE, colClasses=c("character", rep("numeric",4)))
219 |
220 | # Create QQ plots for adjusted and unadjusted model outputs
221 | par(mfrow=c(1,2))
222 | lambdaAdj <- estlambda(GWASout$t.value^2,plot=TRUE,method="median")
223 | lambdaUnadj <- estlambda(GWASoutUnadj$t.value^2,plot=TRUE,method="median")
224 | ```
225 |
226 |
227 | ```r
228 | cat(sprintf("Unadjusted lambda: %s\nAdjusted lambda: %s\n", lambdaUnadj$estimate, lambdaAdj$estimate))
229 | ```
230 | ```
231 | ## Unadjusted lambda: 1.01417377078806
232 | ## Adjusted lambda: 1.00214021515846
233 | ```
234 | ```r
235 | # Calculate standardized lambda
236 | lambdaAdj_1000<-1+(lambdaAdj$estimate-1)/nrow(phenoSub)*1000
237 | lambdaUnadj_1000<-1+(lambdaUnadj$estimate-1)/nrow(phenoSub)*1000
238 | cat(sprintf("Standardized unadjusted lambda: %s\nStandardized adjusted lambda: %s\n",
239 | lambdaUnadj_1000, lambdaAdj_1000))
240 | ```
241 | ```
242 | ## Standardized unadjusted lambda: 1.0108279379588
243 | ## Standardized adjusted lambda: 1.00163500012105
244 | ```
245 | We see here that the tail of the distribution is brought closer to the *y=x* line after accounting for confounding by race/ethnicity in the modeling framework. If the data in this figure were shifted up or down from the *y=x* line, then we would want to investigate some form of systemic bias. The degree of deviation from this line is measured formally by the λ-statistic, where a value close to 1 suggests appropriate adjustment for the potential admixture. A slight deviation in the upper right tail from the *y=x* line suggests crudely that some form of association is present in the data. There is only a slight improvement in λ between the unadjusted model and the model with PCs indicating that the population is relatively homogenous.
246 |
247 | ### Heatmap
248 | Heatmaps are typically used in the context of GWA analysis to visualize the linkage disequilibrium pattern between significant SNPs other SNPs in nearby regions. Here we include our most significant SNP from our analysis and other SNPs near CETP. The darker shading indicates higher LD. The plot also includes *−log10*(p) values to illustrate their connection with physical location and LD.
249 |
250 | ```r
251 | library(LDheatmap)
252 | library(rtracklayer)
253 |
254 | # Add "rs247617" to CETP
255 | CETP <- rbind.fill(GWASout[GWASout$SNP == "rs247617",], CETP)
256 |
257 | # Combine genotypes and imputed genotypes for CETP region
258 | subgen <- cbind(genotype[,colnames(genotype) %in% CETP$SNP], impCETPgeno) # CETP subsets from typed and imputed SNPs
259 |
260 | # Subset SNPs for only certain genotypes
261 | certain <- apply(as(subgen, 'numeric'), 2, function(x) { all(x %in% c(0,1,2,NA)) })
262 | subgen <- subgen[,certain]
263 |
264 | # Subset and order CETP SNPs by position
265 | CETP <- CETP[CETP$SNP %in% colnames(subgen),]
266 | CETP <- arrange(CETP, position)
267 | subgen <- subgen[, order(match(colnames(subgen),CETP$SNP)) ]
268 |
269 | # Create LDheatmap
270 | ld <- ld(subgen, subgen, stats="R.squared") # Find LD map of CETP SNPs
271 |
272 | ll <- LDheatmap(ld, CETP$position, flip=TRUE, name="myLDgrob", title=NULL)
273 |
274 | # Add genes, recombination
275 | llplusgenes <- LDheatmap.addGenes(ll, chr = "chr16", genome = "hg19", genesLocation = 0.01)
276 |
277 | # Add plot of -log(p)
278 | library(ggplot2)
279 |
280 | plot.new()
281 | llQplot2<-LDheatmap.addGrob(llplusgenes, rectGrob(gp = gpar(col = "white")),height = .34)
282 | pushViewport(viewport(x = 0.483, y= 0.76, width = .91 ,height = .4))
283 |
284 | grid.draw(ggplotGrob({
285 | qplot(position, Neg_logP, data = CETP, xlab="", ylab = "Negative Log P-value", xlim = range(CETP$position),
286 | asp = 1/10, color = factor(type), colour=c("#000000", "#D55E00")) +
287 | theme(axis.text.x = element_blank(),
288 | axis.title.y = element_text(size = rel(0.75)), legend.position = "none",
289 | panel.background = element_blank(),
290 | axis.line = element_line(colour = "black")) +
291 | scale_color_manual(values = c("red", "black"))
292 | }))
293 | ```
294 |
295 |
296 | ### Regional Association
297 | Similar to the LD heatmap above, a regional association plot allows for visualization of SNP-wise signal accross a segment of a particular chromsome with the pairwise correlation between SNPs. However regional assoication plots typically show a larger window of the genome. Therefore, for plot legibility, LD calculations to be displayed can be selected based on pairwise SNP proximity and minimum LD. In this example we demonstrate a regional plot create by the `regionplot` function from postgwas. This function can use HapMap data downloaded from [Ensembl](http://www.ensembl.org/index.html), for LD calculations. By default it will use the most recent [Genome Reference Consortium](http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/) human genome build. Therefore, if we wish to use build GRCh37 (hg19) we will have to create a custom `biomartConfigs` object to retrieve the appropriate data.
298 |
299 | ```r
300 | # Create regional association plot
301 | # Create data.frame of most significant SNP only
302 | library(postgwas)
303 |
304 | snps<-data.frame(SNP=c("rs1532625"))
305 |
306 | # Change column names necessary to run regionalplot function
307 | GWAScomb <- rename(GWAScomb, c(p.value="P", chr="CHR", position="BP"))
308 |
309 |
310 | # Edit biomartConfigs so regionalplot function
311 | # pulls from human genome build 37/hg19
312 |
313 | myconfig <- biomartConfigs$hsapiens
314 | myconfig$hsapiens$gene$host <- "grch37.ensembl.org"
315 | myconfig$hsapiens$gene$mart <- "ENSEMBL_MART_ENSEMBL"
316 | myconfig$hsapiens$snp$host <- "grch37.ensembl.org"
317 | myconfig$hsapiens$snp$mart <- "ENSEMBL_MART_SNP"
318 |
319 |
320 | # Run regionalplot using HAPMAP data (pop = CEU)
321 | regionalplot(snps, GWAScomb, biomart.config = myconfig, window.size = 400000, draw.snpname = data.frame(
322 | snps = c("rs1532625", "rs247617"),
323 | text = c("rs1532625", "rs247617"),
324 | angle = c(20, 160),
325 | length = c(1, 1),
326 | cex = c(0.8)
327 | ),
328 | ld.options = list(
329 | gts.source = 2,
330 | max.snps.per.window = 2000,
331 | rsquare.min = 0.8,
332 | show.rsquare.text = FALSE
333 | ),
334 | out.format = list(file = NULL, panels.per.page = 2))
335 | ```
336 |
337 |
338 |
--------------------------------------------------------------------------------
/Data-pre-processing.md:
--------------------------------------------------------------------------------
1 | # Data pre-processing
2 |
3 | For this tutorial we use genotype data files formatted for use with [PLINK](http://pngu.mgh.harvard.edu/~purcell/plink/") software. We utilize the function, `read.plink` from snpStats, which allows the reading in of data formatted as .bed, .bim, and .fam files. The .bed file contains the genotype information, coded in binary. The .bim file contains information for each SNP with a respective column for each of the following information: chromosome number, SNP name (typically an rs #), genetic distance (not necessary for this tutorial), chromosomal position, identity of allele 1, and identity of allele 2. The assignment of allele 1 and allele 2, is related to the effect allele, or the allele that is being counted when we assign a numeric value to a genotype. This is typically assigned based on allele frequency, though not always. In this tutorial, allele 1 pertains to the minor, or less common allele. Lastly, the .fam file contains information for each samples with a respective column for each of the following information: family ID (this will be used to identify each sample when read into R), individual ID, paternal ID, maternal ID, sex (coded as 1 = male, 2 = female), and phenotype. In this tutorial we utilize a supplemental clinical file for outcome variables and additional covariates.
4 |
5 | Alternatively, similar genotype information can also be formatted for PLINK software as .ped and .map files. The information of the .ped file can be thought of as a combination of the .bed and .fam files. It is a large table with the first six columns identical to a .fam file, followed by a columns containing the genotype data for each SNP. The .map file contains the first four columns of the .bim file, without the allele assignments. These files can be read in using the function, `read.pedfile`, from snpStats. More information about the formatting of these files can be found on the PLINK website.
6 |
7 | ## Read in PLINK files - Step1
8 | ```r
9 | library(snpStats)
10 |
11 | # Read in PLINK files
12 | geno <- read.plink(gwas.fn$bed, gwas.fn$bim, gwas.fn$fam, na.strings = ("-9"))
13 |
14 | # Obtain the SnpMatrix object (genotypes) table from geno list
15 | # Note: Phenotypes and covariates will be read from the clinical data file, below
16 | genotype <- geno$genotype
17 | print(genotype) # 861473 SNPs read in for 1401 subjects
18 | ```
19 | ```
20 | ## A SnpMatrix with 1401 rows and 861473 columns
21 | ## Row names: 10002 ... 11596
22 | ## Col names: rs10458597 ... rs5970564
23 | ```
24 |
25 | ```r
26 | #Obtain the SNP information from geno list
27 | genoBim <- geno$map
28 | colnames(genoBim) <- c("chr", "SNP", "gen.dist", "position", "A1", "A2")
29 | print(head(genoBim))
30 | ```
31 |
32 | ```
33 | ## chr SNP gen.dist position A1 A2
34 | ## rs10458597 1 rs10458597 0 564621 C
35 | ## rs12565286 1 rs12565286 0 721290 G C
36 | ## rs12082473 1 rs12082473 0 740857 T C
37 | ## rs3094315 1 rs3094315 0 752566 C T
38 | ## rs2286139 1 rs2286139 0 761732 C T
39 | ## rs11240776 1 rs11240776 0 765269 G A
40 | ```
41 | ```r
42 | # Remove raw file to free up memory
43 | rm(geno)
44 | ```
45 | Supplemental clinical data is found in a corresponding CSV file for each sample. It contains a column for the sample ID (Family ID in the .fam file) and a respective column for each of the following variables: coronary artery disease status (coded as 0 = control and 1 = affected), sex (coded as 1 = male, 2 = female), age (years), triglyceride level (mg/dL), high-density lipoprotein level (mg/dL), low-density lipoprotein level (mg/dL).
46 |
47 | ```r
48 | # Read in clinical file
49 | clinical <- read.csv(clinical.fn,
50 | colClasses=c("character", "factor", "factor", rep("numeric", 4)))
51 | rownames(clinical) <- clinical$FamID
52 | print(head(clinical))
53 | ```
54 | ```
55 | ## FamID CAD sex age tg hdl ldl
56 | ## 10002 10002 1 1 60 NA NA NA
57 | ## 10004 10004 1 2 50 55 23 75
58 | ## 10005 10005 1 1 55 105 37 69
59 | ## 10007 10007 1 1 52 314 54 108
60 | ## 10008 10008 1 1 58 161 40 94
61 | ## 10009 10009 1 1 59 171 46 92
62 | ```
63 | We filter the genotype data to only include samples with corresponding clinical data by indexing the genotype object using only row names that match the sample IDs.
64 |
65 | ```r
66 | # Subset genotype for subject data
67 | genotype <- genotype[clinical$FamID, ]
68 | print(genotype) # Tutorial: All 1401 subjects contain both clinical and genotype data
69 | ```
70 | ```
71 | ## A SnpMatrix with 1401 rows and 861473 columns
72 | ## Row names: 10002 ... 11596
73 | ## Col names: rs10458597 ... rs5970564
74 | ```
75 |
76 | ## SNP level filtering - Step 2
77 | Once the data is loaded, we are ready to remove SNPs that fail to meet minimum criteria due to missing data, low variability or genotyping errors. snpStats provides functions, col.summary and row.summary, that return statistics on SNPs and samples, respectively.
78 |
79 | ```r
80 | # Create SNP summary statistics (MAF, call rate, etc.)
81 | snpsum.col <- col.summary(genotype)
82 | print(head(snpsum.col))
83 | ```
84 | ```
85 | ## Calls Call.rate Certain.calls RAF MAF P.AA
86 | ## rs10458597 1398 0.9978587 1 1.0000000 0.000000000 0.00000000
87 | ## rs12565286 1384 0.9878658 1 0.9483382 0.051661850 0.00433526
88 | ## rs12082473 1369 0.9771592 1 0.9985391 0.001460920 0.00000000
89 | ## rs3094315 1386 0.9892934 1 0.8217893 0.178210678 0.04761905
90 | ## rs2286139 1364 0.9735903 1 0.8621701 0.137829912 0.02199413
91 | ## rs11240776 1269 0.9057816 1 0.9988180 0.001182033 0.00000000
92 | ## P.AB P.BB z.HWE
93 | ## rs10458597 0.000000000 1.0000000 NA
94 | ## rs12565286 0.094653179 0.9010116 -1.26529432
95 | ## rs12082473 0.002921841 0.9970782 0.05413314
96 | ## rs3094315 0.261183261 0.6911977 -4.03172248
97 | ## rs2286139 0.231671554 0.7463343 -0.93146122
98 | ## rs11240776 0.002364066 0.9976359 0.04215743
99 | ```
100 | Using these summary statistics, we keep the subset of SNPs that meet our criteria for minimum call rate and minor allele frequency.
101 |
102 | ```r
103 | # Setting thresholds
104 | call <- 0.95
105 | minor <- 0.01
106 |
107 | # Filter on MAF and call rate
108 | use <- with(snpsum.col, (!is.na(MAF) & MAF > minor) & Call.rate >= call)
109 | use[is.na(use)] <- FALSE # Remove NA's as well
110 |
111 | cat(ncol(genotype)-sum(use),
112 | "SNPs will be removed due to low MAF or call rate.\n") #203287 SNPs will be removed
113 | ```
114 | ```
115 | ## 203287 SNPs will be removed due to low MAF or call rate.
116 | ```
117 | ```r
118 | # Subset genotype and SNP summary data for SNPs that pass call rate and MAF criteria
119 | genotype <- genotype[,use]
120 | snpsum.col <- snpsum.col[use,]
121 |
122 | print(genotype) # 658186 SNPs remain
123 | ```
124 | ```
125 | ## A SnpMatrix with 1401 rows and 658186 columns
126 | ## Row names: 10002 ... 11596
127 | ## Col names: rs12565286 ... rs5970564
128 | ```
129 | ```r
130 | # Write subsetted genotype data and derived results for future use
131 | save(genotype, snpsum.col, genoBim, clinical, file=working.data.fname(2))
132 | ```
133 |
134 | ## Sample level filtering - Step 3
135 | The second stage of data pre-processing involves filtering samples, i.e. removing individuals due to missing data, sample contamination, correlation (for population-based investigations), and racial/ethnic or gender ambiguity or discordance. In our study, we address these issues by filtering on call rate, heterozygosity, cryptic relatedness and duplicates using identity-by-descent, and we visually assess ancestry.
136 |
137 | ### Basic sample filtering
138 | Sample level quality control for missing data and heterozygosity is achieved using the row.summary function from snpStats. An additional heterozygosity F statistic calculation is carried out with the form, |F|=(1−O/E), where O is observed proportion of heterozygous genotypes for a given sample and E is the expected proportion of heterozygous genotypes for a given sample based on the minor allele frequency across all non-missing SNPs for a given sample.
139 |
140 | ```r
141 | # Sample level filtering
142 | source("https://github.com/AAlhendi1707/GWAS/blob/master/R/globals.R?raw=true")
143 |
144 | # load data created in previous snippets
145 | load(working.data.fname(2))
146 |
147 | library(snpStats)
148 | library(SNPRelate) # LD pruning, relatedness, PCA
149 | library(plyr)
150 |
151 | # Create sample statistics (Call rate, Heterozygosity)
152 | snpsum.row <- row.summary(genotype)
153 |
154 | # Add the F stat (inbreeding coefficient) to snpsum.row
155 | MAF <- snpsum.col$MAF
156 | callmatrix <- !is.na(genotype)
157 | hetExp <- callmatrix %*% (2*MAF*(1-MAF))
158 | hetObs <- with(snpsum.row, Heterozygosity*(ncol(genotype))*Call.rate)
159 | snpsum.row$hetF <- 1-(hetObs/hetExp)
160 |
161 | head(snpsum.row)
162 | ```
163 | ```
164 | ## Call.rate Certain.calls Heterozygosity hetF
165 | ## 10002 0.9826554 1 0.3289825 -0.0247708291
166 | ## 10004 0.9891581 1 0.3242931 -0.0103236529
167 | ## 10005 0.9918427 1 0.3231825 -0.0062550972
168 | ## 10007 0.9861027 1 0.3241469 -0.0098475016
169 | ## 10008 0.9823333 1 0.3228218 -0.0075941985
170 | ## 10009 0.9913034 1 0.3213658 -0.0002633189
171 | ```
172 |
173 | We apply filtering on call rate and heterozygosity, selecting only those samples that meet our criteria.
174 |
175 | ```r
176 | # Setting thresholds
177 | sampcall <- 0.95 # Sample call rate cut-off
178 | hetcutoff <- 0.1 # Inbreeding coefficient cut-off
179 |
180 | sampleuse <- with(snpsum.row, !is.na(Call.rate) & Call.rate > sampcall & abs(hetF) <= hetcutoff)
181 | sampleuse[is.na(sampleuse)] <- FALSE # remove NA's as well
182 | cat(nrow(genotype)-sum(sampleuse),
183 | "subjects will be removed due to low sample call rate or inbreeding coefficient.\n") #0 subjects removed
184 | ```
185 | ```
186 | ## 0 subjects will be removed due to low sample call rate or inbreeding coefficient.
187 | ```
188 | ```r
189 | # Subset genotype and clinical data for subjects who pass call rate and heterozygosity crtieria
190 | genotype <- genotype[sampleuse,]
191 | clinical<- clinical[ rownames(genotype), ]
192 | ```
193 | ### IBD analysis
194 | In addition to these summary statistics, we also want to filter on relatedness criteria. We use the SNPRelate package to perform identity-by-descent (IBD) analysis. This package requires that the data be transformed into a GDS format file. IBD analysis is performed on only a subset of SNPs that are in linkage equilibrium by iteratively removing adjacent SNPs that exceed an LD threshold in a sliding window using the `snpgdsLDpruning` function.
195 |
196 | ```r
197 | # Checking for Relatedness
198 |
199 | ld.thresh <- 0.2 # LD cut-off
200 | kin.thresh <- 0.1 # Kinship cut-off
201 |
202 | # Create gds file, required for SNPRelate functions
203 | snpgdsBED2GDS(gwas.fn$bed, gwas.fn$fam, gwas.fn$bim, gwas.fn$gds)
204 | ```
205 | ```
206 | Start snpgdsBED2GDS ...
207 | BED file: "/scratch/spectre/a/asna4/GWAS/GWAStutorial.bed" in the SNP-major mode (Sample X SNP)
208 | FAM file: "/scratch/spectre/a/asna4/GWAS/GWAStutorial.fam", DONE.
209 | BIM file: "/scratch/spectre/a/asna4/GWAS/GWAStutorial.bim", DONE.
210 | Thu Apr 11 19:57:17 2019 store sample id, snp id, position, and chromosome.
211 | start writing: 1401 samples, 861473 SNPs ...
212 | Thu Apr 11 19:57:17 2019 0%
213 | Thu Apr 11 19:57:29 2019 100%
214 | Thu Apr 11 19:57:29 2019 Done.
215 | Optimize the access efficiency ...
216 | Clean up the fragments of GDS file:
217 | open the file '/scratch/spectre/a/asna4/GWAS/GWAStutorial.gds' (292.4M)
218 | # of fragments: 39
219 | save to '/scratch/spectre/a/asna4/GWAS/GWAStutorial.gds.tmp'
220 | rename '/scratch/spectre/a/asna4/GWAS/GWAStutorial.gds.tmp' (292.4M, reduced: 252B)
221 | # of fragments: 18
222 | ```
223 | ```r
224 | genofile <- openfn.gds(gwas.fn$gds, readonly = FALSE)
225 |
226 | # Automatically added "-1" sample suffixes are removed
227 | gds.ids <- read.gdsn(index.gdsn(genofile, "sample.id"))
228 | gds.ids <- sub("-1", "", gds.ids)
229 | add.gdsn(genofile, "sample.id", gds.ids, replace = TRUE)
230 |
231 | #Prune SNPs for IBD analysis
232 | set.seed(1000)
233 | geno.sample.ids <- rownames(genotype)
234 | snpSUB <- snpgdsLDpruning(genofile, ld.threshold = ld.thresh,
235 | sample.id = geno.sample.ids, # Only analyze the filtered samples
236 | snp.id = colnames(genotype)) # Only analyze the filtered SNPs
237 |
238 | ```
239 | ```
240 | ## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
241 | ```
242 | ```
243 | ## SNP pruning based on LD:
244 | ## Excluding 203287 SNPs on non-autosomes
245 | ## Excluding 0 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
246 | ## Working space: 1401 samples, 658186 SNPs
247 | ## Using 1 (CPU) core
248 | ## Sliding window: 500000 basepairs, Inf SNPs
249 | ## |LD| threshold: 0.2
250 | ## Chromosome 1: 8.25%, 5863/71038
251 | ## Chromosome 3: 8.10%, 4906/60565
252 | ## Chromosome 6: 8.06%, 4364/54176
253 | ## Chromosome 12: 8.59%, 3619/42124
254 | ## Chromosome 21: 9.40%, 1171/12463
255 | ## Chromosome 2: 7.67%, 5655/73717
256 | ## Chromosome 4: 8.23%, 4582/55675
257 | ## Chromosome 7: 8.51%, 3947/46391
258 | ## Chromosome 11: 7.90%, 3495/44213
259 | ## Chromosome 10: 8.01%, 3837/47930
260 | ## Chromosome 8: 7.68%, 3709/48299
261 | ## Chromosome 5: 8.08%, 4537/56178
262 | ## Chromosome 14: 8.79%, 2467/28054
263 | ## Chromosome 9: 8.25%, 3392/41110
264 | ## Chromosome 17: 11.17%, 2227/19939
265 | ## Chromosome 13: 8.36%, 2863/34262
266 | ## Chromosome 20: 9.40%, 2139/22753
267 | ## Chromosome 15: 9.25%, 2396/25900
268 | ## Chromosome 16: 9.30%, 2566/27591
269 | ## Chromosome 18: 8.90%, 2335/26231
270 | ## Chromosome 19: 13.01%, 1494/11482
271 | ## Chromosome 22: 10.96%, 1248/11382
272 | ## 72812 SNPs are selected in total.
273 | ```
274 | ```r
275 | snpset.ibd <- unlist(snpSUB, use.names=FALSE)
276 | cat(length(snpset.ibd),"will be used in IBD analysis\n") # Tutorial: expect 72812 SNPs
277 | ```
278 | ```
279 | ## 72812 will be used in IBD analysis
280 | ```
281 | The `snpgdsIBDMoM` function computes the IBD coefficients using method of moments. The result is a table indicating kinship among pairs of samples.
282 |
283 | ```r
284 | # Find IBD coefficients using Method of Moments procedure. Include pairwise kinship.
285 | ibd <- snpgdsIBDMoM(genofile, kinship=TRUE,
286 | sample.id = geno.sample.ids,
287 | snp.id = snpset.ibd,
288 | num.thread = 1)
289 |
290 | ```
291 | ```
292 | ## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
293 | ```
294 | ```
295 | IBD analysis (PLINK method of moment) on genotypes:
296 | Excluding 788,661 SNPs (non-autosomes or non-selection)
297 | Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
298 | Working space: 1,401 samples, 72,812 SNPs
299 | using 1 (CPU) core
300 | PLINK IBD: the sum of all selected genotypes (0,1,2) = 32757268
301 | Thu Apr 11 20:01:28 2019 (internal increment: 65536)
302 | [==================================================] 100%, completed in 13s
303 | Thu Apr 11 20:01:41 2019 Done.
304 | ```
305 | ```r
306 | ibdcoeff <- snpgdsIBDSelection(ibd) # Pairwise sample comparison
307 | head(ibdcoeff)
308 | ```
309 | ```
310 | ## ID1 ID2 k0 k1 kinship
311 | ## 1 10002 10004 0.9201072 0.07989281 0.01997320
312 | ## 2 10002 10005 0.9478000 0.05220002 0.01305001
313 | ## 3 10002 10007 0.9209875 0.07901253 0.01975313
314 | ## 4 10002 10008 0.9312527 0.06874726 0.01718682
315 | ## 5 10002 10009 0.9386937 0.06130626 0.01532656
316 | ## 6 10002 10010 0.9146065 0.08539354 0.02134839
317 | ```
318 |
319 | Using the IBD pairwise sample relatedness measure, we iteratively remove samples that are too similar using a greedy strategy in which the sample with the largest number of related samples is removed. The process is repeated until there are no more pairs of samples with kinship coefficients above our cut-off.
320 |
321 | ```r
322 | # Check if there are any candidates for relatedness
323 | ibdcoeff <- ibdcoeff[ ibdcoeff$kinship >= kin.thresh, ]
324 |
325 | # iteratively remove samples with high kinship starting with the sample with the most pairings
326 | related.samples <- NULL
327 | while ( nrow(ibdcoeff) > 0 ) {
328 |
329 | # count the number of occurrences of each and take the top one
330 | sample.counts <- arrange(count(c(ibdcoeff$ID1, ibdcoeff$ID2)), -freq)
331 | rm.sample <- sample.counts[1, 'x']
332 | cat("Removing sample", as.character(rm.sample), 'too closely related to',
333 | sample.counts[1, 'freq'],'other samples.\n')
334 |
335 | # remove from ibdcoeff and add to list
336 | ibdcoeff <- ibdcoeff[ibdcoeff$ID1 != rm.sample & ibdcoeff$ID2 != rm.sample,]
337 | related.samples <- c(as.character(rm.sample), related.samples)
338 | }
339 |
340 | # filter genotype and clinical to include only unrelated samples
341 | genotype <- genotype[ !(rownames(genotype) %in% related.samples), ]
342 | clinical <- clinical[ !(clinical$FamID %in% related.samples), ]
343 |
344 | geno.sample.ids <- rownames(genotype)
345 |
346 | cat(length(related.samples),
347 | "similar samples removed due to correlation coefficient >=", kin.thresh,"\n")
348 | ```
349 | ```
350 | ## 0 similar samples removed due to correlation coefficient >= 0.1
351 | ```
352 | ```r
353 | print(genotype) # Tutorial: expect all 1401 subjects remain
354 | ```
355 | ```
356 | ## A SnpMatrix with 1401 rows and 658186 columns
357 | ## Row names: 10002 ... 11596
358 | ## Col names: rs12565286 ... rs5970564
359 | ```
360 |
361 | ### Ancestry
362 | To better understand ancestry, we plot the first two principal components of the genotype data. Principal component calculation is achieved via the `snpgdsPCA` function from SNPRelate. It is important to note that in this example we are reasonably confident that our samples are homogeneous, coming from european ancestry. Therefore, given that there are no clear outliers, we fail to remove any samples.
363 |
364 | ```r
365 | # Checking for ancestry
366 |
367 | # Find PCA matrix
368 | pca <- snpgdsPCA(genofile, sample.id = geno.sample.ids, snp.id = snpset.ibd, num.thread=1)
369 | ```
370 | ```
371 | ## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
372 | ```
373 |
374 | ```
375 | Principal Component Analysis (PCA) on genotypes:
376 | Excluding 788,661 SNPs (non-autosomes or non-selection)
377 | Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
378 | Working space: 1,401 samples, 72,812 SNPs
379 | using 1 (CPU) core
380 | PCA: the sum of all selected genotypes (0,1,2) = 32757268
381 | CPU capabilities: Double-Precision SSE2
382 | Thu Apr 11 20:05:42 2019 (internal increment: 3248)
383 | [==================================================] 100%, completed in 42s
384 | Thu Apr 11 20:06:24 2019 Begin (eigenvalues and eigenvectors)
385 | Thu Apr 11 20:06:25 2019 Done.
386 | ```
387 | ```r
388 | # Create data frame of first two principal comonents
389 | pctab <- data.frame(sample.id = pca$sample.id,
390 | PC1 = pca$eigenvect[,1], # the first eigenvector
391 | PC2 = pca$eigenvect[,2], # the second eigenvector
392 | stringsAsFactors = FALSE)
393 |
394 | # Plot the first two principal comonents
395 | plot(pctab$PC2, pctab$PC1, xlab="Principal Component 2", ylab="Principal Component 1",
396 | main = "Ancestry Plot")
397 | ```
398 |
399 |
400 | ```r
401 | # Close GDS file
402 | closefn.gds(genofile)
403 |
404 | # Overwrite old genotype with new filtered version
405 | save(genotype, genoBim, clinical, file=working.data.fname(3))
406 | ```
407 |
408 | ## SNP Filtering - HWE filtering on control samples - Step 4
409 | Finally, once samples are filtered, we return to SNP level filtering and apply a check of Hardy-Weinberg equilibrium. Rejection of Hardy-Weinberg equilibrium can be an indication of population substructure or genotyping errors. Given that we are performing a statistical test at every SNP, it is common to use a relatively lenient cut-off. In this example we only remove SNPs with p-values, corresponding to the HWE test statistic on CAD controls, of less than 1×10−6. We only test HWE on CAD controls due to possible violation of HWE caused by disease association.
410 |
411 | ```r
412 | # Hardy-Weinberg SNP filtering on CAD controls
413 |
414 | hardy <- 10^-6 # HWE cut-off
415 |
416 | CADcontrols <- clinical[ clinical$CAD==0, 'FamID' ]
417 | snpsum.colCont <- col.summary( genotype[CADcontrols,] )
418 | HWEuse <- with(snpsum.colCont, !is.na(z.HWE) & ( abs(z.HWE) < abs( qnorm(hardy/2) ) ) )
419 | rm(snpsum.colCont)
420 |
421 | HWEuse[is.na(HWEuse)] <- FALSE # Remove NA's as well
422 | cat(ncol(genotype)-sum(HWEuse),"SNPs will be removed due to high HWE.\n") # 1296 SNPs removed
423 | ```
424 | ```
425 | ## 1296 SNPs will be removed due to high HWE.
426 | ```
427 | ```r
428 | # Subset genotype and SNP summary data for SNPs that pass HWE criteria
429 | genotype <- genotype[,HWEuse]
430 |
431 | print(genotype) # 656890 SNPs remain
432 | ```
433 | ```
434 | ## A SnpMatrix with 1401 rows and 656890 columns
435 | ## Row names: 10002 ... 11596
436 | ## Col names: rs12565286 ... rs28729663
437 | ```
438 | ```r
439 | # Save genotype and SNVs filtered data to use in later analyses
440 | save(genotype, genoBim, clinical, file=working.data.fname(4))
441 |
--------------------------------------------------------------------------------