├── .Rbuildignore ├── DESCRIPTION ├── NAMESPACE ├── R └── manhattan.heatmap.v1.R ├── README.md ├── images ├── cells.png ├── genesbub.png ├── labels.png ├── reportbub.png ├── rsidbub.png └── zoom.png ├── inst └── extdata │ ├── 56cad.add.160614.variants.txt │ ├── cad.add.160614_manhformat.txt.gz │ └── config.txt ├── man ├── manhplot-package.Rd └── manhplusplot.Rd ├── perl ├── gen_snpfile.pl ├── glist-hg19 ├── glist-hg38 └── readme └── tests ├── testthat.R └── testthat ├── 5cad.add.160614.variants_chr1.txt ├── cad.add.160614_manhformat_chr1.txt.gz ├── config.txt └── testmanhplusplot.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | archive 4 | exampledata 5 | default-plot.pdf 6 | memory 7 | thinning.R 8 | README.md 9 | release 10 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: manhplot 2 | Type: Package 3 | Title: The Manhattan++ Plot 4 | Depends: R (>= 3.4.0) 5 | Version: 1.1 6 | Date: 2019-05-14 7 | Author: Chris Grace 8 | Maintainer: Chris Grace 9 | Description: This plot integrates annotation into a manhattan plot. The plot is implemented as a heatmap, which is binned using -log10(p-value) and chromosome position. Annotation currently supported is minor allele frequency and gene function high impact variants. 
10 | License: GPL (>= 2) 11 | RoxygenNote: 6.1.1 12 | Imports: 13 | reshape2, 14 | ggplot2, 15 | ggrepel, 16 | gridExtra 17 | Suggests: 18 | R.utils, 19 | testthat 20 | URL: https://github.com/cgrace1978/manhplot/ 21 | BugReports: https://github.com/cgrace1978/manhplot/issues 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(manhplusplot) 4 | import(ggplot2) 5 | import(ggrepel) 6 | import(grDevices) 7 | import(gridExtra) 8 | import(reshape2) 9 | import(utils) 10 | -------------------------------------------------------------------------------- /R/manhattan.heatmap.v1.R: -------------------------------------------------------------------------------- 1 | #' Generate the manhattan++ plot 2 | #' 3 | #' @param infile Input GWAS summary statistics 4 | #' @param outfile Output file prefix for the manhattan++ plot 5 | #' @param configfile Configuration file 6 | #' @param snpfile Table of SNPs to visualize 7 | #' @param drawastiff If TRUE draw a Tiff file, if FALSE draw a PDF file 8 | #' @param GWS Genome wise significance pvalue threshold (5E-8 by default) 9 | #' @param FDR False discovery Rate pvalue threshold (1E-3 by default) 10 | #' @param MAF Minor Allele Frequency threshold 11 | #' @param chrname Column name for chromosome in GWAS infile 12 | #' @param posname Column name for position in GWAS infile 13 | #' @param pvalname Column name for pvalue in GWAS infile 14 | #' @param frqname column name for allele frequency in GWAS infile 15 | #' @param conseqname column name for variant annotation consequence in GWAS infile 16 | #' @param showgenes If T shows known genes as bubbles on main manhattan plot, if F show positions of interest as bubbles 17 | #' @param showrsids If showgenes is T, then show the rsids, rather than genes 18 | #' @param pos.split The bin lengths for positions 
#' @param pval.split The bin lengths for pvalues
#' @param max.pval The maximum pvalue to display
#' @return Called for its side effect of writing \code{<outfile>.pdf} (or
#'   \code{<outfile>.tif} when \code{drawastiff} is TRUE). Invisibly returns the
#'   value of \code{dev.off()}; returns \code{NULL} when a required column is
#'   missing from the GWAS or SNP file.
#' @details
#' For file formats see github page \url{https://github.com/cgrace1978/manhplot}
#' @examples
#'
#'\donttest{
#' library(manhplot)
#' infile<-system.file("extdata","cad.add.160614_manhformat.txt.gz",package = "manhplot")
#' configfile<-system.file("extdata","config.txt", package = "manhplot")
#' snpfile<-system.file("extdata","56cad.add.160614.variants.txt", package = "manhplot")
#'
#' manhplusplot(infile = infile,outfile = file.path(tempdir(), "default-plot"),
#'              configfile = configfile, snpfile = snpfile)
#' }
#'
#' @author Chris Grace
#' @import utils ggplot2 reshape2 ggrepel gridExtra grDevices
#' @export
manhplusplot <- function(infile, outfile, configfile, snpfile,
                         drawastiff = FALSE,
                         GWS = 5E-8, FDR = 1E-3, MAF = 0.05,
                         chrname = "chr", posname = "pos", pvalname = "pvalue",
                         frqname = "maf", conseqname = "conseq",
                         showgenes = FALSE, showrsids = FALSE,
                         pos.split = 3E6, pval.split = 0.125, max.pval = 20) {

  ## the no visible binding for global variable issue with check.
  ## (these names are only ever used as column names inside aes())
  pvalidx <- pos <- pval <- val <- posidx <- marker <- pos.idx <- NULL

  ## parameters for drawing the manhattan heatmap for internal use.
  pval.units <- 5 ## units to display on the y axis
  textsize <- 2   ## size of text used on labels

  ## name of log file to generate if debugflag is set.
  debugfile <- paste("manh_", format(Sys.time(), "%d-%m-%y.%H-%M-%S"), ".log", sep = "")
  debugflag <- FALSE ## Turn logging on / off

  ## ---------------------------------------------------------------------
  ## Internal helpers
  ## ---------------------------------------------------------------------

  ### y coordinate on the heatmap that maps to a -log10(p-value).
  ## Cells are centred on integer coordinates, hence the 0.5 offset.
  ## log - -log10 pval to convert (vectorised)
  log10.index <- function(log) {
    (log / pval.split) + 0.5
  }

  ### y coordinate on the heatmap that maps to a p-value (vectorised).
  ## p.val - pval to convert to index
  p.val.index <- function(p.val) {
    log10.index(-1 * log10(p.val))
  }

  ### id of the heatmap cell that contains a single -log10(p-value).
  ## log - -log10(pval)
  ## pval.chunks - table of cell bounds (id, LP, UP); defaults to the table
  ##               built below, which is looked up lazily at call time.
  log10.cell <- function(log, pval.chunks = pvals.cells.index) {
    pval.chunks$id[log >= pval.chunks$LP & log < pval.chunks$UP]
  }

  ### id of the heatmap cell that contains a single p-value.
  ## p.val - p value
  ## pval.chunks - see log10.cell
  p.val.cell <- function(p.val, pval.chunks = pvals.cells.index) {
    log10.cell(-1 * log10(p.val), pval.chunks)
  }

  ### Find the position cell (heatmap column) that holds chr:position.
  ## chr - chromosome
  ## position - base position
  ## chr.chunks - table of position cells (posid, chr, s, e)
  ## Bin bounds are matched inclusively and the LAST match wins, which agrees
  ## with the [s, e) binning of the variants (a position sitting on a boundary
  ## belongs to the cell that starts there, and the chromosome maximum belongs
  ## to the final cell). Returns NA when the position is outside the GWAS data,
  ## instead of a zero-length vector that would break the caller's assignment.
  chrpos.cell <- function(chr, position, chr.chunks) {
    hit <- which(chr.chunks$chr == chr &
                   chr.chunks$s <= position & chr.chunks$e >= position)
    if (length(hit) == 0) {
      return(NA_real_)
    }
    chr.chunks$posid[hit[length(hit)]]
  }

  ### Report the first required column that is absent from a table.
  ## Returns TRUE (after emitting the message) when a column is missing.
  missing.column <- function(required, tab, label) {
    absent <- required[!required %in% names(tab)]
    if (length(absent) > 0) {
      message(paste0(absent[1], " not found in ", label, "!\n"))
      return(TRUE)
    }
    FALSE
  }

  ### Count variants per heatmap cell for one chromosome.
  ## pval.bin / pos.bin - row and column cell index of every variant
  ## n.pos.bins - number of position cells of the chromosome
  ## Variants whose bin is NA or 0 (p-value > 1, or not a number) are dropped.
  count.cells <- function(pval.bin, pos.bin, n.pos.bins) {
    counts <- table(factor(pval.bin, levels = seq_along(pvals)),
                    factor(pos.bin, levels = seq_len(n.pos.bins)))
    matrix(as.vector(counts), nrow = length(pvals), ncol = n.pos.bins)
  }

  ## ---------------------------------------------------------------------
  ## Read and validate the inputs
  ## ---------------------------------------------------------------------
  message("Rebuilding matrix")
  ## read the gwas results
  message("Reading the GWAS results...")
  d <- read.table(infile, header = TRUE)

  ## map columns to those specified by the user.
  user.cols <- c(chr = chrname, pos = posname, Pvalue = pvalname,
                 FRQ = frqname, conseq = conseqname)
  supplied <- names(d)
  mapped <- supplied
  for (internal in names(user.cols)) {
    mapped[supplied == user.cols[[internal]]] <- internal
  }
  names(d) <- mapped

  ## check data file has the correct headers
  if (missing.column(c("chr", "pos", "Pvalue", "FRQ", "conseq"), d, "data file")) {
    return(NULL)
  }

  ## variants without a chromosome, position or p-value cannot be placed on
  ## the heatmap (and NA p-values used to be counted in every cell).
  d <- d[!is.na(d$chr) & !is.na(d$pos) & !is.na(d$Pvalue), ]
  if (nrow(d) == 0) {
    stop("no variants with chromosome, position and p-value found in ", infile,
         call. = FALSE)
  }

  ## align the frequencies to the minor allele (which() skips NA frequencies)
  flip <- which(d$FRQ > 0.5)
  d$FRQ[flip] <- 1 - d$FRQ[flip]

  ## check that the chromosome column is in correct format.
  if (!is.numeric(d$chr)) {
    stop("chr column in gwas data should be numeric, please check if encoded X, or with `chr` prefix",
         call. = FALSE)
  }
  ## Support for the X chromosome?
  lastchr <- max(d$chr)

  ## read the snp info
  snp.info <- read.table(snpfile, header = TRUE, sep = "\t")

  ## check snp file has the correct headers
  if (missing.column(c("markername", "gene", "chr", "pos", "eaf", "OR", "Pvalue", "novel"),
                     snp.info, "SNP file")) {
    return(NULL)
  }

  snp.info <- snp.info[order(snp.info$chr, snp.info$pos, decreasing = FALSE), ]

  ## read the config data (the first 10 lines of the file are a fixed header)
  config <- read.table(configfile, sep = "\t", header = TRUE,
                       stringsAsFactors = FALSE, skip = 10)

  ## ---------------------------------------------------------------------
  ## Build the heatmap matrix
  ## ---------------------------------------------------------------------

  ## generate the pvalue bins (using the pval.split parameter)
  pvals <- seq(from = 0, to = max.pval, by = pval.split)
  ## the last cell is open ended: it collects everything above its lower bound
  pvals.cells.index <- data.frame(id = seq_along(pvals), LP = pvals,
                                  UP = c(pvals[-1], Inf))

  chr.matrix.len <- data.frame(length = rep(NA_real_, lastchr),
                               cumm = NA_real_, mid = NA_real_)

  ## number of cells assigned to each config idx (+1 slot for "remaining")
  idx.count <- vector(mode = "numeric", length = length(config$idx) + 1)
  max.cellcount <- 0

  ## values that do not change from cell to cell
  fdr.log10 <- -log10(FDR)
  oddchr.idx <- config$idx[config$type == "oddchr"]
  evenchr.idx <- config$idx[config$type == "evenchr"]
  remaining.idx <- max(config$idx) + 1
  val.rows <- which(config$type == "val")

  if (debugflag) { ## Generate the log file if the flag is set.
    logfile <- file(debugfile, open = "a")
    on.exit(close(logfile), add = TRUE)
    ## columns for the log output table
    cat("chr", "st", "en", "log10p", "idx", "\n", file = logfile, append = TRUE, sep = "\t")
  }

  chr.mats <- vector("list", lastchr)   ## one cell matrix per chromosome
  chunk.tabs <- vector("list", lastchr) ## position cells per chromosome
  interest <- list()                    ## cells to be labelled on the plot
  cell.offset <- 0                      ## position cells used by previous chromosomes

  for (chr in seq_len(lastchr)) {
    message(paste("Processing chromosome ", chr, "\r", sep = ""), appendLF = FALSE)

    ## extract chromosome specific data from GWAS input
    chr.slice <- d[d$chr == chr, ]
    if (nrow(chr.slice) == 0) {
      stop("no variants found for chromosome ", chr,
           "; chromosomes must be numbered 1..", lastchr, " without gaps",
           call. = FALSE)
    }

    ## Generate the chromosome position bins (using the pos.split parameter)
    chunks <- seq(from = min(chr.slice$pos), to = max(chr.slice$pos), by = pos.split)
    chunks[length(chunks) + 1] <- max(chr.slice$pos)
    n.bins <- length(chunks) - 1

    ## Assign every variant to its cell once. Position bins are
    ## [chunks[i], chunks[i+1]) except the last, which also holds the variant
    ## at the chromosome maximum. P-value bins are [pvals[j], pvals[j+1])
    ## with an open-ended last bin.
    pos.bin <- findInterval(chr.slice$pos, chunks[seq_len(n.bins)])
    pval.bin <- findInterval(-log10(chr.slice$Pvalue), pvals)

    high.impact <- which(chr.slice$conseq == 1)
    low.maf <- which(chr.slice$FRQ <= MAF)

    len.mat <- count.cells(pval.bin, pos.bin, n.bins)
    conseq.mat <- count.cells(pval.bin[high.impact], pos.bin[high.impact], n.bins)
    maf.mat <- count.cells(pval.bin[low.maf], pos.bin[low.maf], n.bins)

    ## create the matrix for this chromosome (pvals * position)
    mdat <- matrix(0, nrow = length(pvals), ncol = n.bins)

    for (i in seq_len(n.bins)) {
      for (j in seq_along(pvals)) {
        len <- len.mat[j, i]          ## number of snps in the cell
        any.conseq <- conseq.mat[j, i] > 0 ## any high impact variants?
        any.lowmaf <- maf.mat[j, i] > 0    ## any variants with MAF <= threshold?
        idx <- 0                      ## 0 = blank cell (no variants)

        if (len > 0) {
          if (pvals[j] <= fdr.log10) {
            ## this defines the region which should be greyed out
            idx <- if ((chr %% 2) != 0) oddchr.idx else evenchr.idx
          } else {
            for (k in val.rows) {
              ## cell count within [min.count, max.count] (NA max = unbounded)
              count.ok <- len >= config$min.count[k] &&
                (is.na(config$max.count[k]) || len <= config$max.count[k])
              ## config flag must agree with presence of HIGH impact variants
              conseq.ok <- (config$conseq[k] == TRUE && any.conseq) ||
                (config$conseq[k] == FALSE && !any.conseq)
              ## config flag must agree with presence of low MAF variants
              maf.ok <- (config$maf[k] == TRUE && any.lowmaf) ||
                (config$maf[k] == FALSE && !any.lowmaf)

              if (count.ok && conseq.ok && maf.ok) {
                idx <- config$idx[k]

                if (config$report[k] == TRUE) {
                  ## the cell column is stored directly, so no lookup through
                  ## the position table is needed later.
                  interest[[length(interest) + 1]] <- data.frame(
                    marker = paste(idx, sep = ""),
                    log10pval = pvals[j], chr = chr, pos = (chunks[i] + 1),
                    col = config$col[k],
                    pos.idx = cell.offset + i,
                    stringsAsFactors = FALSE)
                }
                break ## first config which fulfils the cell criteria wins
              }
            }

            if (idx == 0) { ## no annotation matched: use the remaining idx
              idx <- remaining.idx
            }

            if (len > max.cellcount) {
              max.cellcount <- len
            }
          }
        }

        idx.count[idx] <- (idx.count[idx] + 1) # increment the block count for each index

        mdat[j, i] <- idx

        if (debugflag) { ## log information for current cell.
          ## dont log blank cells or the greyed out regions
          if (idx != oddchr.idx && idx != evenchr.idx && idx != 0) {
            cat(chr, chunks[i], chunks[i + 1], pvals[j], idx, "\n",
                file = logfile, append = TRUE, sep = "\t")
          }
        }
      }
    }

    ## cell sizes of the chromosomes
    chr.matrix.len$length[chr] <- n.bins
    chr.matrix.len$cumm[chr] <- n.bins + cell.offset
    chr.matrix.len$mid[chr] <- chr.matrix.len$cumm[chr] - (n.bins / 2)

    chr.mats[[chr]] <- mdat

    ## the cell id for each of the position chunks in the heatmap
    chunk.tabs[[chr]] <- data.frame(posid = cell.offset + seq_len(n.bins),
                                    chr = chr,
                                    s = chunks[seq_len(n.bins)],
                                    e = chunks[-1])
    cell.offset <- cell.offset + n.bins
  }

  final <- do.call(cbind, chr.mats)
  pos.chunks <- do.call(rbind, chunk.tabs)

  ## assign a position cell for each of the variants in the SNP information table.
  snpcells <- vapply(seq_len(nrow(snp.info)), function(i) {
    chrpos.cell(snp.info$chr[i], snp.info$pos[i], pos.chunks)
  }, numeric(1))

  if (anyNA(snpcells)) {
    warning("SNPs outside the range of the GWAS data will not be drawn: ",
            paste(snp.info$markername[is.na(snpcells)], collapse = ", "),
            call. = FALSE)
  }

  message("\nMelting matrix...")
  m <- melt(final)
  names(m) <- c("pval", "pos", "val")

  ## locations to be labelled on the plot.
  if (length(interest) > 0) {
    pos.interest <- do.call(rbind, interest)
    pos.interest$pvalidx <- log10.index(pos.interest$log10pval)
  } else {
    pos.interest <- data.frame(marker = character(), log10pval = numeric(),
                               chr = numeric(), pos = numeric(), col = character(),
                               stringsAsFactors = FALSE)
  }

  ## ---------------------------------------------------------------------
  ## Legend
  ## ---------------------------------------------------------------------

  ## largest cell count rounded up to the next multiple of 100
  peak.val <- ceiling(max.cellcount / 100) * 100

  col.discrete <- c("white", config$col)

  col.text <- vector(mode = "character", length = length(config$type))

  ## Build the text for the legend using the information in the config table
  for (k in seq_along(col.text)) {
    if (config$type[k] != "val") { ## not a count category: print the config type.
      col.text[k] <- config$type[k]
      next
    }

    max.count <- config$max.count[k]
    if (is.na(max.count)) { ## unbounded: show the highest cell count (with approp ceiling)
      max.count <- peak.val
    }

    if (config$min.count[k] == 1 && max.count == 1) {
      col.text.tmp <- paste(config$idx[k], ") ", config$min.count[k], sep = "")
    } else {
      col.text.tmp <- paste(config$idx[k], ") ", config$min.count[k], " - ", max.count, sep = "")
    }

    if (config$conseq[k] == FALSE && config$maf[k] == FALSE) {
      ## neither MAF nor HIGH impact - no extra text
    } else if (config$conseq[k] == TRUE && config$maf[k] == TRUE) {
      col.text.tmp <- paste(col.text.tmp, " (BOTH)", sep = "")
    } else if (config$conseq[k] == TRUE && config$maf[k] == FALSE) {
      col.text.tmp <- paste(col.text.tmp, " (HIGH impact)", sep = "")
    } else if (config$conseq[k] == FALSE && config$maf[k] == TRUE) {
      col.text.tmp <- paste(col.text.tmp, " (MAF < ", MAF, ")", sep = "")
    } else { ## only reachable when the config flags are not TRUE / FALSE
      stop("config row ", k, ": conseq and maf must be TRUE or FALSE", call. = FALSE)
    }

    ## number of cells that were assigned to this category
    col.text[k] <- paste(col.text.tmp, " (", idx.count[config$idx[k]], ")", sep = "")
  }

  col.brks <- seq_along(config$type)

  if (max(m$val) > max(config$idx)) { ## cells without a category go to the remaining category.
    col.brks <- c(col.brks, max(m$val))
    col.discrete <- c(col.discrete, "orange") ## hard coded colour for remaining.
    col.text <- c(col.text, paste("Remaining (", idx.count[max(m$val)], ")", sep = ""))
  }

  pval.seq <- seq(from = pval.units, to = max.pval, by = pval.units)

  y.labels <- c("", pval.seq)
  y.breaks <- c(0.5, log10.index(pval.seq))

  if (showgenes == FALSE) { ## if show genes flag is not set then put all the genes in the table
    snp.info$novel <- rep(TRUE, nrow(snp.info))
  }

  ## row indices are used so that snp.info and snpcells are always subset alike
  known.rows <- which(snp.info$novel == FALSE)
  novel.rows <- which(snp.info$novel == TRUE)
  snp.info.known <- snp.info[known.rows, ]
  snp.info.novel <- snp.info[novel.rows, ]

  ## ---------------------------------------------------------------------
  ## The core heatmap - generated using ggplot2
  ## ---------------------------------------------------------------------
  main.core <- ggplot(data = m, aes(x = pos, y = pval)) +
    geom_tile(aes(fill = val)) +
    theme(legend.position = "left", legend.key.size = unit(0.5, "line"),
          legend.title = element_text(size = 5),
          legend.text = element_text(size = 5)) +
    geom_hline(yintercept = p.val.cell(GWS) + 0.5, linetype = "dashed") + ## GWS line
    geom_hline(yintercept = p.val.cell(FDR) + 0.5, linetype = "dashed") + ## FDR line
    scale_fill_gradientn(colours = col.discrete,
                         guide = "legend", breaks = col.brks,
                         labels = col.text, name = "Variant Count") +
    scale_y_continuous("-log10(p)", labels = y.labels,
                       breaks = y.breaks,
                       expand = c(0, 0), trans = "reverse", position = "right") +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
    coord_flip(expand = TRUE) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    scale_x_continuous("", labels = rep("", (lastchr * 2)),
                       breaks = c(chr.matrix.len$mid, chr.matrix.len$cumm),
                       expand = c(0, 0), trans = "reverse", position = "top") +
    theme(axis.ticks.y = element_blank()) +
    theme(axis.line.x = element_line(color = "black", size = 0.5))

  ## if there is one or more known SNPs in the table then label the manhattan plot with them.
  if (nrow(snp.info.known) > 0) {
    repel.df <- data.frame(
      marker = if (showrsids == TRUE) snp.info.known$markername else snp.info.known$gene,
      ## labels of SNPs beyond the displayed range sit on the top row
      pvalidx = pmin(p.val.index(snp.info.known$Pvalue), log10.index(max.pval)),
      posidx = snpcells[known.rows])

    final.repel.plot <- main.core +
      geom_label_repel(data = repel.df, aes(posidx, pvalidx, label = marker),
                       size = textsize, force = 1, nudge_y = 10, nudge_x = 10,
                       segment.colour = "black", min.segment.length = 0,
                       segment.size = 0.25, seed = 500, max.iter = 5000,
                       point.padding = NA)
  }

  ## if any positions of interest label on the manhattan plot then label them
  if (nrow(pos.interest) > 0) {
    main.core <- main.core +
      geom_label_repel(data = pos.interest, aes(pos.idx, pvalidx, label = marker),
                       size = textsize, force = 5, nudge_y = 10, nudge_x = 10,
                       segment.colour = "black", min.segment.length = 0,
                       segment.size = 0.25, seed = 500, max.iter = 5000,
                       point.padding = NA, segment.color = "black", color = "black")
  }

  ## ---------------------------------------------------------------------
  ## The SNP table drawn next to the heatmap
  ## ---------------------------------------------------------------------

  ## convert coordinates 0-20 to coordinates for whole table.
  table.pos <- function(index) {
    log10.index((index / 20) * max.pval)
  }

  ## add one left aligned text column to the table plot
  table.column <- function(plot, x, y, labels, face = "plain") {
    plot + annotate("text", x = x, y = y, label = labels,
                    angle = 0, size = textsize, hjust = 0, fontface = face)
  }

  ## the positions of the columns of the table (currently hard coded, with 0-20 coordinates)
  title.pos <- table.pos(c(17, 13, 11.5, 10, 7)) + 1

  ## Start with a completely blank plot (table1) that shares the heatmap axes
  table1 <- ggplot(data = m, aes(x = pos, y = pval)) +
    geom_tile(aes(fill = rep(0, dim(m)[1]))) +
    scale_x_continuous("", labels = rep("", (lastchr * 2)),
                       breaks = c(chr.matrix.len$mid, chr.matrix.len$cumm),
                       expand = c(0, 0), trans = "reverse", position = "top") +
    scale_y_continuous("", labels = rep("", length(y.labels)),
                       breaks = y.breaks,
                       expand = c(0, 0), trans = "reverse", position = "right") +
    coord_flip(expand = TRUE) +
    scale_fill_gradientn(colours = c("white", "white"),
                         guide = "none", breaks = c(0, 1),
                         labels = c("0", "1"), name = "") +
    theme(axis.ticks.y = element_blank()) +
    theme(axis.ticks.x = element_blank()) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank()) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

  ## the x scale is reversed, so its trained range is negated
  lim <- layer_scales(table1)
  xmin <- lim$x$range$range[2] * -1
  xmax <- lim$x$range$range[1] * -1

  segment.indexes <- table.pos(c(17.5, 19, 20)) + 1
  chr.num.pos <- (table.pos(19.75) + 1)
  brks.pos <- (table.pos(19.7) + 1)

  ## one evenly spaced row per novel SNP, the first slot is the header row
  text.pos <- seq(from = xmin, to = xmax, length.out = nrow(snp.info.novel) + 1)
  text.pos1 <- text.pos[-1]
  title.pos1 <- text.pos[1]

  ## chromosome axis of the heatmap: drawn whether or not there are table rows
  table2 <- table1 +
    annotate("text", x = chr.matrix.len$mid,
             y = chr.num.pos, label = as.character(seq_len(lastchr)),
             angle = 0, size = 3.5, hjust = 0) + ## chromosome labels
    annotate("segment", x = chr.matrix.len$cumm,
             xend = chr.matrix.len$cumm, y = brks.pos, yend = segment.indexes[3],
             colour = "black", linetype = "solid") + ## x axis breaks
    annotate("segment", x = 0,
             xend = xmax, y = segment.indexes[3], yend = segment.indexes[3],
             colour = "black", linetype = "solid") ## xaxis solid line

  if (nrow(snp.info.novel) > 0) {
    ## add the SNP information to the table
    ## Can add additional columns to the table here.
    table2 <- table.column(table2, text.pos1, title.pos[1],
                           as.character(snp.info.novel$markername)) ## markername
    table2 <- table.column(table2, text.pos1, title.pos[2],
                           as.character(format(round(snp.info.novel$eaf, 2), nsmall = 2))) ## EAF
    table2 <- table.column(table2, text.pos1, title.pos[3],
                           as.character(format(round(snp.info.novel$OR, 2), nsmall = 2))) ## OR
    table2 <- table.column(table2, text.pos1, title.pos[4],
                           as.character(formatC(snp.info.novel$Pvalue, format = "E", digits = 2))) ## P-value
    table2 <- table.column(table2, text.pos1, title.pos[5],
                           as.character(snp.info.novel$gene), face = "italic") ## Nearest Gene

    novel.cells <- snpcells[novel.rows]
    table2 <- table2 +
      annotate("segment", x = text.pos1,
               xend = novel.cells, y = segment.indexes[1], yend = segment.indexes[2],
               colour = "blue", linetype = "dashed", size = 0.5) + ## segment from midpoint to table row
      annotate("segment", x = novel.cells,
               xend = novel.cells, y = segment.indexes[2], yend = segment.indexes[3],
               colour = "blue", linetype = "dashed", size = 0.5) ## segment from axis to mid point
  }

  ## add the title to the table.
  ## Can add additional table headers here.
  final.table.plot <- table2
  table.headers <- c("SNP", "EAF", "OR", "p-value", "Gene")
  for (h in seq_along(table.headers)) {
    final.table.plot <- table.column(final.table.plot, title.pos1, title.pos[h],
                                     table.headers[h], face = "bold")
  }

  ## modify the clipping of the table, so it can be merged with the heatmap.
  gt <- ggplot_gtable(ggplot_build(final.table.plot))
  gt$layout$clip[gt$layout$name == "panel"] <- "off"

  ## ---------------------------------------------------------------------
  ## Write the output file
  ## ---------------------------------------------------------------------

  ## draw either as TIFF (drawastiff == TRUE), or by default as PDF.
  if (drawastiff == TRUE) {
    message(paste("\nGenerated tiff file: ",
                  outfile, ".tif\nWidth = 8.27in Height = 11.69in\nDefault working directory: ",
                  getwd(), sep = ""))
    tiff(filename = paste(outfile, ".tif", sep = ""), width = 8.27, height = 11.69, units = "in", res = 300)
  } else {
    message(paste("\nGenerated pdf file: ",
                  outfile, ".pdf\nWidth = 8.27in Height = 11.69in\nDefault working directory: ",
                  getwd(), sep = ""))
    pdf(paste(outfile, ".pdf", sep = ""), width = 8.27, height = 11.69, onefile = FALSE)
  }

  ## by default plot the heatmap with regions of interest bubbles (showgenes == FALSE)
  final.plot <- main.core

  ## if show genes flag is TRUE then add repel gene labels from the SNP list to the heatmap plot
  if (showgenes == TRUE && nrow(snp.info.known) > 0) {
    final.plot <- final.repel.plot
  }

  ## hard code variables for positions of two plots on the canvas
  manh.max <- 7
  annot.min <- 6.7

  ## merge the heatmap and the table plots together on a blank 10 x 10 canvas.
  ## The device is closed even when drawing fails, so no device is leaked.
  dev.out <- NULL
  tryCatch({
    canvas <- ggplot(data.frame(pos = 1:10, pval = 1:10), aes(x = pos, y = pval)) +
      geom_point(colour = "white") +
      annotation_custom(grob = ggplotGrob(final.plot), xmin = 0.5, xmax = manh.max, ymin = 1, ymax = 10) +
      annotation_custom(grob = gt, xmin = annot.min, xmax = 10.5, ymin = 1, ymax = 10) +
      theme(axis.title.x = element_blank(),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank()) +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks.y = element_blank()) +
      theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank()) +
      theme(plot.background = element_rect(fill = 'white', colour = 'white')) +
      theme(panel.background = element_rect(fill = 'white', colour = 'white'))
    print(canvas)
  }, finally = {
    dev.out <- dev.off()
  })

  invisible(dev.out)
}
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MANHATTAN++ 2 | 3 | 
MANHATTAN++ is software to generate a transposed manhattan heatmap, implemented in R. 4 | 5 | ## Getting Started 6 | 7 | You need to install the latest version of [R](https://www.r-project.org/). The R package can be run on Windows and Linux; you must specify paths to the filenames you are using as input and output. 8 | 9 | To install the software from the GIT repository: 10 | ``` 11 | install.packages("devtools") 12 | library(devtools) 13 | 14 | install_github("cgrace1978/manhplot", dependencies = T, force = T) 15 | ``` 16 | 17 | The following command will run the plot with default data in the package. The pdf (test.pdf) will be created in the current working directory in R (This can be viewed using the getwd() command): 18 | ``` 19 | library(manhplot) 20 | 21 | infile<-system.file("extdata","cad.add.160614_manhformat.txt.gz",package = "manhplot") 22 | configfile<-system.file("extdata","config.txt", package = "manhplot") 23 | snpfile<-system.file("extdata","56cad.add.160614.variants.txt", package = "manhplot") 24 | 25 | ## Run manhattan++ with the default parameters and files included in the package 26 | manhplusplot(infile = infile,outfile = "test", configfile = configfile, snpfile = snpfile) 27 | ``` 28 | For more information on using manhattan++ please visit the [Manhattan++ wiki](https://github.com/cgrace1978/manhplot/wiki/home) 29 | 30 | ### Citation 31 | To use Manhattan++ please cite the following paper: 32 | 33 | Grace *et al* 34 | 35 | Manhattan++: displaying genome-wide association summary statistics with multiple annotation layers 36 | 37 | BMC Bioinformatics 2019; 20(1):610 38 | 39 | PubMed: [31775616](https://www.ncbi.nlm.nih.gov/pubmed/31775616) 40 | -------------------------------------------------------------------------------- /images/cells.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/cells.png 
-------------------------------------------------------------------------------- /images/genesbub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/genesbub.png -------------------------------------------------------------------------------- /images/labels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/labels.png -------------------------------------------------------------------------------- /images/reportbub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/reportbub.png -------------------------------------------------------------------------------- /images/rsidbub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/rsidbub.png -------------------------------------------------------------------------------- /images/zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/zoom.png -------------------------------------------------------------------------------- /inst/extdata/56cad.add.160614.variants.txt: -------------------------------------------------------------------------------- 1 | markername gene chr pos eaf OR Pvalue novel 2 | rs11206510 PCSK9 1 55496039 0.847627 1.08 2.34E-08 FALSE 3 | rs9970807 PPAP2B 1 56965664 0.915097 1.13 5E-14 FALSE 4 | rs7528419 SORT1 1 109817192 0.78582 1.12 1.97E-23 FALSE 5 | rs6689306 IL6R 1 154395946 0.447545 1.06 2.6E-09 FALSE 6 | rs67180937 MIA3 1 
222823743 0.663052 1.08 1.01E-12 FALSE 7 | rs16986953 AK097927 2 19942473 0.104706 1.09 1.45E-08 FALSE 8 | 2:21378433:D APOB 2 21378433 0.745655 1.07 2.89E-08 FALSE 9 | 2:44074126:D ABCG5-ABCG8 2 44074126 0.744839 1.06 0.000000026 FALSE 10 | rs7568458 VAMP5-VAMP8-GGCX 2 85788175 0.448518 1.06 3.62E-10 FALSE 11 | rs17678683 ZEB2-ACO74093.1 2 145286559 0.087681 1.1 0.000000003 FALSE 12 | 2:203828796:I WDR12 2 203828796 0.107909 1.15 2.15E-18 FALSE 13 | 3:138099161:I MRAS 3 138099161 0.162797 1.08 2.89E-09 FALSE 14 | rs4593108 EDNRA 4 148281001 0.795349 1.07 8.82E-10 FALSE 15 | rs72689147 GUCY1A3 4 156639888 0.816978 1.07 6.07E-09 FALSE 16 | rs273909 SLC22A4-SLC22A5 5 131667353 0.116756 1.06 0.000124 FALSE 17 | rs6903956 ADTRP-C6orf105 6 11774583 0.354055 1 0.96 FALSE 18 | rs9349379 PHACTR1 6 12903957 0.431606 1.14 1.81E-42 FALSE 19 | rs17609940 ANKS1A 6 35034800 0.823672 1.03 0.03 FALSE 20 | rs56336142 KCNK5 6 39134099 0.807262 1.07 1.85E-08 FALSE 21 | rs12202017 TCF21* 6 134173151 0.699953 1.07 1.98E-11 FALSE 22 | rs55730499 SLC22A3-LPAL2-LPA 6 161005610 0.056243 1.37 5.39E-39 FALSE 23 | rs4252185 PLG 6 161123451 0.059661 1.34 1.64E-32 FALSE 24 | rs2107595 HDAC9 7 19049388 0.20047 1.08 8.05E-11 FALSE 25 | rs10953541 7q22 7 107244545 0.782727 1.05 0.0000102 FALSE 26 | rs11556924 ZC3HC1 7 129663496 0.686675 1.08 5.34E-11 FALSE 27 | rs264 LPL 8 19813180 0.852594 1.06 0.0000106 FALSE 28 | rs2954029 TRIB1 8 126490972 0.551395 1.04 0.00000261 FALSE 29 | rs2891168 9p21 9 22098619 0.488668 1.21 2.29E-98 FALSE 30 | rs2519093 ABO 9 136141870 0.190872 1.08 1.19E-11 FALSE 31 | rs2487928 KIAA1462 10 30323892 0.418221 1.06 4.41E-11 FALSE 32 | rs1870634 CXCL12 10 44480811 0.637485 1.08 5.55E-15 FALSE 33 | rs1412444 LIPA 10 91002927 0.369131 1.07 5.15E-12 FALSE 34 | rs11191416 CYP17A1-CNNM2-NT5C2 10 104604916 0.87253 1.08 4.65E-09 FALSE 35 | rs2128739 PDGFD 11 103673277 0.323536 1.07 7.05E-11 FALSE 36 | rs964184 ZNF259-APOA5-APOA1 11 116648917 0.184706 1.05 0.000056 FALSE 37 | 
rs2681472 ATP2B1 12 90008959 0.201306 1.08 6.17E-11 FALSE 38 | rs3184504 SH2B3 12 111884608 0.421808 1.07 1.03E-09 FALSE 39 | rs9319428 FLT1 13 28973621 0.314419 1.04 0.0000713 FALSE 40 | rs11838776 COL4A1/A2 13 111040681 0.263277 1.07 1.83E-10 FALSE 41 | rs10139550 HHIPL1 14 100145710 0.423033 1.06 1.38E-08 FALSE 42 | rs4468572 ADAMTS7 15 79124475 0.585831 1.08 4.44E-16 FALSE 43 | rs17514846 FURIN-FES 15 91416550 0.440264 1.05 0.00000031 FALSE 44 | rs216172 SMG6 17 2126504 0.349997 1.05 0.000000507 FALSE 45 | rs12936587 RAI1-PEMT-RASD1 17 17543722 0.61133 1.03 0.000824 FALSE 46 | rs46522 UBE2Z 17 46988597 0.513272 1.04 0.0000184 FALSE 47 | rs56289821 LDLR 19 11188247 0.899622 1.14 4.44E-15 FALSE 48 | rs4420638 APOE-APOC1 19 45422946 0.166036 1.1 7.07E-11 FALSE 49 | rs28451064 KCNE2 (gene desert) 21 35593827 0.121186 1.14 1.33E-15 FALSE 50 | rs17087335 REST-NOA1 4 57838583 0.214637 1.06 4.59E-08 TRUE 51 | rs3918226 NOS3 7 150690176 0.064515 1.14 1.69E-09 TRUE 52 | rs10840293 SWAP70 11 9751196 0.549821 1.06 1.28E-08 TRUE 53 | rs56062135 SMAD3 15 67455630 0.794271 1.07 4.52E-09 TRUE 54 | rs8042271 MFGE8-ABHD2 15 89574218 0.902282 1.1 3.68E-08 TRUE 55 | rs7212798 BCAS3 17 59013488 0.146516 1.08 1.88E-08 TRUE 56 | rs663129 PMAIP1-MC4R 18 57838401 0.256835 1.06 0.000000032 TRUE 57 | rs180803 POM121L9P-ADORA2A 22 24658858 0.970732 1.2 1.64E-10 TRUE 58 | -------------------------------------------------------------------------------- /inst/extdata/cad.add.160614_manhformat.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/inst/extdata/cad.add.160614_manhformat.txt.gz -------------------------------------------------------------------------------- /inst/extdata/config.txt: -------------------------------------------------------------------------------- 1 | #### CONFIG file for use with MANH++ - Do not modify the first 10 lines 2 | ## 
min.count: The lower cell count threshold to accept this config 3 | ## max.count: The upper cell count threshold to accept this config 4 | ## maf: Is MAF detection active for this config - is there any variants within a cell with MAF < threshold? 5 | ## conseq: Is HIGH impact consequence active? Are there any variants with HIGH impact consequence in the cell? 6 | ## col: The colour which cells for this config 7 | ## idx: the index to use for this cell in the heatmap - MUST BE CONSECUTIVE FROM START TO END - STARTING AT 1 8 | "## type: val - an config entry, oddchr - the odd chromosome, evenchr - the even chromosome" 9 | ## report: Are these annotations labeled on the heatmap 10 | ##### 11 | min.count max.count maf conseq col idx type report 12 | 1 1 FALSE FALSE black 1 val FALSE 13 | 1 1 FALSE TRUE lightpink 2 val TRUE 14 | 1 1 TRUE FALSE green 3 val FALSE 15 | 1 1 TRUE TRUE darkmagenta 4 val TRUE 16 | 2 NA FALSE FALSE blue 5 val FALSE 17 | 2 NA FALSE TRUE pink 6 val TRUE 18 | 2 NA TRUE FALSE red 7 val FALSE 19 | 2 NA TRUE TRUE cyan 8 val TRUE 20 | NA NA NA NA darkgrey 9 oddchr NA 21 | NA NA NA NA grey 10 evenchr NA 22 | -------------------------------------------------------------------------------- /man/manhplot-package.Rd: -------------------------------------------------------------------------------- 1 | \name{manhplot-package} 2 | \alias{manhplot-package} 3 | \alias{manhplot} 4 | \docType{package} 5 | \title{ 6 | \packageTitle{manhplot} 7 | } 8 | \description{ 9 | \packageDescription{manhplot} 10 | } 11 | \details{ 12 | 13 | The DESCRIPTION file: 14 | \packageDESCRIPTION{manhplot} 15 | \packageIndices{manhplot} 16 | } 17 | \author{ 18 | \packageAuthor{manhplot} 19 | 20 | Maintainer: \packageMaintainer{manhplot} 21 | } 22 | \keyword{ package } 23 | -------------------------------------------------------------------------------- /man/manhplusplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do 
not edit by hand 2 | % Please edit documentation in R/manhattan.heatmap.v1.R 3 | \name{manhplusplot} 4 | \alias{manhplusplot} 5 | \title{Generate the manhattan++ plot} 6 | \usage{ 7 | manhplusplot(infile, outfile, configfile, snpfile, drawastiff = F, 8 | GWS = 5e-08, FDR = 0.001, MAF = 0.05, chrname = "chr", 9 | posname = "pos", pvalname = "pvalue", frqname = "maf", 10 | conseqname = "conseq", showgenes = F, showrsids = F, 11 | pos.split = 3e+06, pval.split = 0.125, max.pval = 20) 12 | } 13 | \arguments{ 14 | \item{infile}{Input GWAS summary statistics} 15 | 16 | \item{outfile}{Output file prefix for the manhattan++ plot} 17 | 18 | \item{configfile}{Configuration file} 19 | 20 | \item{snpfile}{Table of SNPs to visualize} 21 | 22 | \item{drawastiff}{If TRUE draw a Tiff file, if FALSE draw a PDF file} 23 | 24 | \item{GWS}{Genome wise significance pvalue threshold (5E-8 by default)} 25 | 26 | \item{FDR}{False discovery Rate pvalue threshold (1E-3 by default)} 27 | 28 | \item{MAF}{Minor Allele Frequency threshold} 29 | 30 | \item{chrname}{Column name for chromosome in GWAS infile} 31 | 32 | \item{posname}{Column name for position in GWAS infile} 33 | 34 | \item{pvalname}{Column name for pvalue in GWAS infile} 35 | 36 | \item{frqname}{column name for allele frequency in GWAS infile} 37 | 38 | \item{conseqname}{column name for variant annotation consequence in GWAS infile} 39 | 40 | \item{showgenes}{If T shows known genes as bubbles on main manhattan plot, if F show positions of interest as bubbles} 41 | 42 | \item{showrsids}{If showgenes is T, then show the rsids, rather than genes} 43 | 44 | \item{pos.split}{The bin lengths for positions} 45 | 46 | \item{pval.split}{The bin lengths for pvalues} 47 | 48 | \item{max.pval}{The maximum pvalue to display} 49 | } 50 | \description{ 51 | Generate the manhattan++ plot 52 | } 53 | \details{ 54 | For file formats see github page \url{https://github.com/cgrace1978/manhplot} 55 | } 56 | \examples{ 57 | 58 | \donttest{ 59 | 
library(manhplot) 60 | infile<-system.file("extdata","cad.add.160614_manhformat.txt.gz",package = "manhplot") 61 | configfile<-system.file("extdata","config.txt", package = "manhplot") 62 | snpfile<-system.file("extdata","56cad.add.160614.variants.txt", package = "manhplot") 63 | 64 | manhplusplot(infile = infile,outfile = file.path(tempdir(), "default-plot"), 65 | configfile = configfile, snpfile = snpfile) 66 | } 67 | 68 | } 69 | \author{ 70 | Chris Grace 71 | } 72 | -------------------------------------------------------------------------------- /perl/gen_snpfile.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use Getopt::Long qw(GetOptions); 3 | my %locus=(); 4 | my $chrcol=""; 5 | my $snpcol=""; 6 | my $poscol=""; 7 | my $pcol=""; 8 | my $betacol=""; 9 | my $eafcol=""; 10 | my $chridx=0; 11 | my $snpidx=0; 12 | my $posidx=0; 13 | my $pidx=0; 14 | my $betaidx=0; 15 | my $eafidx=0; 16 | 17 | my $debug=0; 18 | my $file=""; 19 | my $genefile="glist-hg19";#insert the file for gene coordinates depending on the build 20 | my $snpgenedist=500000;#if gene is beyond this distance from SNP either side, the gene annotation is "gene desert" 21 | my $gwasp=5e-8;#pvalue cut-off to define a peak 22 | my $boundary=500000;#distance between 2 peaks to be classified as a locus. 
23 | #Can be changed to 0.5 or 1 if interested in cM boundaries 24 | #provided the posidx is pointing to cM positions 25 | my $totalchrs=0; 26 | my $outfile=""; 27 | GetOptions( 28 | 'chrom=s' => \$chrcol, 29 | 'var=s' => \$snpcol, 30 | 'pos=s' => \$poscol, 31 | 'pval=s' => \$pcol, 32 | 'beta=s' => \$betacol, 33 | 'eaf=s' => \$eafcol, 34 | 'gwasfile=s' => \$file, 35 | 'genefile=s' => \$genefile, 36 | 'gwaspcut=s' => \$gwasp, 37 | 'locusbounds=s' => \$boundary, 38 | 'snpgenebounds=s'=> \$snpgenedist, 39 | 'out=s' => \$outfile, 40 | 'help' => \$debug, 41 | ) or die "Usage: $0 --help\n"; 42 | 43 | if ($debug || $file eq ""){ 44 | print "\n\n"; 45 | print <$outfile.txt") or die "Cannot write output file\n"; 110 | print OF "chr markername pos Pvalue OR eaf gene novel\n"; 111 | #chr snp pos p beta eaf 112 | my $header=; 113 | chomp $header; 114 | #figure out column names and column numbers 115 | my @colnames=split(/\t/,$header); 116 | my $colx=0;#should add up to 6 as we need 6 column names 117 | for (my $z=0;$z<=$#colnames;$z++){ 118 | if ($colnames[$z] eq $chrcol){ 119 | $chridx=$z; 120 | $colx++; 121 | }elsif($colnames[$z] eq $snpcol){ 122 | $snpidx=$z; 123 | $colx++; 124 | }elsif($colnames[$z] eq $poscol){ 125 | $posidx=$z; 126 | $colx++; 127 | }elsif($colnames[$z] eq $pcol){ 128 | $pidx=$z; 129 | $colx++; 130 | }elsif($colnames[$z] eq $betacol){ 131 | $betaidx=$z; 132 | $colx++; 133 | }elsif($colnames[$z] eq $eafcol){ 134 | $eafidx=$z; 135 | $colx++; 136 | } 137 | } 138 | if($colx<6){ 139 | print "$colx ERROR: Not all column names specified. Please use --help for usage.\n"; 140 | exit; 141 | } 142 | 143 | print "Reading GWAS summary file ... \n\n"; 144 | my %sig=(); 145 | my $co=1; 146 | #store information of the lead SNPs only in hash. 
147 | my $maxchr=0; 148 | while (my $line=){ 149 | chomp $line; 150 | my @cells=split('\t',$line); 151 | 152 | if ($cells[$chridx]=~m/\D/){ 153 | print "ERROR: Chromosomes need to be numeric\n"; 154 | exit; 155 | }elsif($cells[$chridx]>$maxchr){ 156 | $maxchr=$cells[$chridx]; 157 | } 158 | if ($cells[$pidx]<$gwasp){ 159 | $sig{$line}=$cells[$pidx];#store whole line as key. pval as value 160 | $psnps{$cells[$snpidx]}=$cells[$pidx]; 161 | $eafsnps{$cells[$snpidx]}=sprintf("%.2f", $cells[$eafidx]); 162 | my $or=exp($cells[$betaidx]); 163 | $orsnps{$cells[$snpidx]}=sprintf("%.2f", $or); 164 | } 165 | } 166 | 167 | $totalchrs=$maxchr; 168 | close IF; 169 | print "Reading Gene database file ... \n\n"; 170 | open (IF, $genefile) or die "Cannot open $genefile\n"; 171 | #19 58858171 58864865 A1BG 172 | #19 58863335 58866549 A1BG-AS1 173 | my %genesbychr=();#chr as key and array of gene details as value 174 | my %chr=("X"=>23,"Y"=>24); 175 | while (my $line=){ 176 | my ($c,$s,$e,$g)=split(/\s+/,$line); 177 | if ($c eq "X"){ 178 | $c=23; 179 | }elsif($c eq "Y"){ 180 | $c=24; 181 | } 182 | if (exists $genesbychr{$c}){#if chr exists 183 | my %genes=%{$genesbychr{$c}};#gene hash of genes for this chr 184 | if (!exists $genes{$g}){#store this gene as its not in the hash 185 | my @coors=($s,$e); 186 | $genes{$g}=\@coors; 187 | $genesbychr{$c}=\%genes; 188 | } 189 | }else{#create chr entry in the hash 190 | my @coors=($s,$e); 191 | my %genes=(); 192 | $genes{$g}=\@coors; 193 | $genesbychr{$c}=\%genes; 194 | } 195 | } 196 | 197 | close IF; 198 | print "Identifying loci ... 
\n"; 199 | for (my $chr=1;$chr<=$totalchrs;$chr++){#process chr by chr 200 | my @sortedchr=(); 201 | my $c=0; 202 | foreach my $line (sort { $sig{$a} <=> $sig{$b} } keys %sig){#sort rows by p-values 203 | my @cells=split('\t',$line); 204 | #print "$line\n"; 205 | if ($cells[$chridx] == $chr){ 206 | $c++; 207 | push(@sortedchr,$line); 208 | } 209 | } 210 | #print "Total in this chr=$c\n"; 211 | 212 | #Do the filtering in the subroutine 213 | my %list=%{&getloci(@sortedchr)}; 214 | 215 | #print the output for the chromosome 216 | foreach my $name (sort { $list{$a} <=> $list{$b} } keys %list){ 217 | my $snppos=$list{$name}; 218 | #get gene info 219 | my %genelist=%{$genesbychr{$chr}}; 220 | my $dist5=$snpgenedist; 221 | my $dist3=$snpgenedist; 222 | my $gene5="ABC"; 223 | my $gene3="DEF"; 224 | my $genein="GEH"; 225 | foreach my $genes (keys %genelist){ 226 | my ($s,$e)=@{$genelist{$genes}}; 227 | if ($snppos>=$s && $snppos<=$e){#simple situation 228 | $genein=$genes; 229 | last; 230 | }elsif($snppos<$s){#get 3' gene 231 | my $dist=$s-$snppos; 232 | if ($dist<=$snpgenedist && $dist<=$dist3){#gene within defined distance 233 | $dist3=$dist;#overwrite this to get the closest gene 3' 234 | $gene3=$genes; 235 | } 236 | }elsif($snppos>$e){#get 5' gene 237 | my $dist=$snppos-$e; 238 | if ($dist<=$snpgenedist && $dist<=$dist5){#gene within defined distance 239 | $dist5=$dist; 240 | $gene5=$genes; 241 | } 242 | } 243 | } 244 | #format final gene output 245 | my $geneout="gene desert";#genes are far apart as defined from SNP 246 | if ($genein ne "GEH"){#best situation 247 | $geneout=$genein; 248 | }elsif($gene5 ne "ABC" && $gene3 ne "DEF"){#found 2 genes either side of SNP & within defined dist 249 | $geneout="$gene5/$gene3"; 250 | }elsif($gene5 ne "ABC" && $gene3 eq "DEF"){#found only 1 gene 5' of SNP. 3' gene is too far 251 | $geneout=$gene5; 252 | }elsif($gene5 eq "ABC" && $gene3 ne "DEF"){#found only 1 gene 3' of SNP. 
5' gene is too far 253 | $geneout=$gene3; 254 | } 255 | 256 | print OF $chr," ",$name," ",$snppos," ",$psnps{$name}," ",$orsnps{$name}," ",$eafsnps{$name}," $geneout FALSE\n"; 257 | $co++; 258 | } 259 | 260 | }#process next chromosome 261 | 262 | print "\n************************************************************************************\n\n\n"; 263 | print "WARNING: Please remember to update the \"novel\" column in the output file ($outfile.txt)\n\n"; 264 | print "\n************************************************************************************\n"; 265 | close OF; 266 | $datestring = localtime(); 267 | print "Analysis finished: $datestring\n"; 268 | 269 | exit; 270 | 271 | sub getloci(){ 272 | my (@sorted)=@_; 273 | my $total=scalar @sorted; 274 | my @sortedtmp=(); 275 | my %list=(); 276 | @sortedtmp=@sorted; 277 | my $spliced=0; 278 | my $loci=0; 279 | my @cells1=(); 280 | 281 | #keep on going through the array till the time the 282 | #number of rows dropped and numb of loci found add 283 | #up to the total number of rows in this chromosome 284 | 285 | while($spliced+$loci<=$total){ 286 | my $line1=shift @sorted;#the top row is the locus 287 | @sortedtmp=@sorted;#make a copy as you dont want to iterate the array which you are processing too 288 | $loci++;#increment locus by 1 289 | @cells1=split('\t',$line1); 290 | if (defined $cells1[$snpidx]){ 291 | #print "Y:$cells1[1] $cells1[2]\n"; 292 | $list{$cells1[$snpidx]}=$cells1[$posidx]; 293 | } 294 | #print "XX $loci $cells1[1] $cells1[2]\n"; 295 | my @coord=(); 296 | #iterate through the list of loci to drop rows (cells in array) which are nearby (+/-1 Mb) 297 | foreach my $key(keys %list){ 298 | my $specific=0; 299 | #iterate through the rows (@sortedtmp) 300 | for (my $x=0;$x<=$#sortedtmp;$x++){ 301 | #foreach $line(@sorted){ 302 | my $line=$sortedtmp[$x]; 303 | # print "$line\n"; 304 | my @cells=split('\t',$line); 305 | 306 | if (($cells[$posidx]>(($list{$key})-$boundary)) && 
($cells[$posidx]<(($list{$key})+$boundary)) && $key ne $cells[$snpidx]){ 307 | #print "Splicing $cells[2] $list{$key} $key $cells[1] ",$cells[2]-$list{$key},"\n"; 308 | #splice(@sorted,$x,1); 309 | push(@coord,$x); 310 | $specific++;#keep track of number of rows filtered for this locus 311 | $spliced++;#keep track of all rows filtered for this chromosome 312 | } 313 | } 314 | if($specific>=1){#if there are any to be filtered for this locus 315 | #print "Array before=",scalar @sorted, " "; 316 | my @tmp=@{&splicearray(\@sorted,\@coord)}; 317 | #print "Splice=$specific, Array after=",scalar @tmp,"\n"; 318 | @sortedtmp=@tmp;#update the arrays 319 | @sorted=@tmp;#update the arrays 320 | #print "$loci $spliced $total\n"; 321 | } 322 | } 323 | } 324 | #sort out the last element. 325 | my $line1=shift @sorted; 326 | my @cells1=split('\t',$line1); 327 | #print "Y:$cells1[1] $cells1[2]\n"; 328 | if (defined $cells1[$snpidx]){ 329 | $list{$cells1[$snpidx]}=$cells1[$posidx]; 330 | } 331 | return \%list; 332 | } 333 | 334 | #subroutine to delete elements of the array and then tidy up the array 335 | sub splicearray(){ 336 | my ($array,$coord)=@_; 337 | my @tmp=@{$array}; 338 | foreach my $co(@{$coord}){ 339 | delete $tmp[$co]; 340 | } 341 | my @new=(); 342 | foreach(@tmp){ 343 | if( ( defined $_) and !($_ =~ /^$/ )){ 344 | push(@new, $_); 345 | } 346 | } 347 | return \@new; 348 | } 349 | -------------------------------------------------------------------------------- /perl/readme: -------------------------------------------------------------------------------- 1 | Date: 2/10/2019 2 | Author: Anuj Goel 3 | Script: gen_snpfile.pl 4 | 5 | This Perl script has been tested in Linux OS (Centos 7) using Perl version v5.16.3. 6 | The aim of this script is to read a GWAS summary statistics file and generate a list of lead variants and annotate them with nearest gene names. 
7 | To be used as "snpfile" for Manhattan++ 8 | 9 | The script uses a gene database that can be downloaded from the PLINK website: 10 | https://www.cog-genomics.org/plink/1.9/resources#genelist 11 | 12 | The summary statistics file can be a plain text file or gzipped (tab delimited). 13 | The summary statistics file must have at least the following columns (with headers): 14 | [Required] Column name containing chromosome numbers (numeric) 15 | [Required] Column name containing variant names 16 | [Required] Column name having variant positions (basepairs) 17 | [Required] Column name having variant p-values (not log10 transformed) 18 | [Required] Column name having variant beta values (not Odds Ratios) 19 | [Required] Column name having variant effect allele frequency 20 | 21 | For help, run: 22 | perl gen_snpfile.pl --help 23 | 24 | The script treats 2 variants as separate loci if they are at least 500 kbp apart (or the distance set with --locusbounds) and have p<5e-8 (or the threshold set with --gwaspcut). 25 | 26 | The script annotates the lead variant with gene names based on proximity to the nearest gene. 27 | - If the nearest gene on either side is more than 500 kbp away (or the distance set with --snpgenebounds), the variant is annotated as "gene desert". 28 | - If the variant lies within a gene's start-end coordinates, the variant gets that gene annotation 29 | - If the variant is between 2 genes, the 2 nearest genes are reported for that variant 30 | - If the variant is between 2 genes and one gene is further than --snpgenebounds, only the nearest 3' or 5' gene is reported 31 | 32 | 33 | ***************** 34 | IMPORTANT 35 | **************** 36 | 37 | The user still needs to update the output file to identify the loci that are novel. 38 | 39 | **************** 40 | 41 | Please report bugs via the GitHub Issues page. 42 | -Thank you. 
43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(manhplot) 3 | 4 | test_check("manhplot") 5 | -------------------------------------------------------------------------------- /tests/testthat/5cad.add.160614.variants_chr1.txt: -------------------------------------------------------------------------------- 1 | markername gene chr pos eaf OR Pvalue novel 2 | rs11206510 PCSK9 1 55496039 0.847627 1.08 2.34E-08 FALSE 3 | rs9970807 PPAP2B 1 56965664 0.915097 1.13 5E-14 FALSE 4 | rs7528419 SORT1 1 109817192 0.78582 1.12 1.97E-23 FALSE 5 | rs6689306 IL6R 1 154395946 0.447545 1.06 2.6E-09 FALSE 6 | rs67180937 MIA3 1 222823743 0.663052 1.08 1.01E-12 FALSE 7 | -------------------------------------------------------------------------------- /tests/testthat/cad.add.160614_manhformat_chr1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/tests/testthat/cad.add.160614_manhformat_chr1.txt.gz -------------------------------------------------------------------------------- /tests/testthat/config.txt: -------------------------------------------------------------------------------- 1 | #### CONFIG file for use with MANH++ - Do not modify the first 10 lines 2 | ## min.count: The lower cell count threshold to accept this config 3 | ## max.count: The upper cell count threshold to accept this config 4 | ## maf: Is MAF detection active for this config - is there any variants within a cell with MAF < threshold? 5 | ## conseq: Is HIGH impact consequence active? Are there any variants with HIGH impact consequence in the cell? 
6 | ## col: The colour which cells for this config 7 | ## idx: the index to use for this cell in the heatmap - MUST BE CONSECUTIVE FROM START TO END - STARTING AT 1 8 | "## type: val - an config entry, oddchr - the odd chromosome, evenchr - the even chromosome" 9 | ## report: Are these annotations labeled on the heatmap 10 | ##### 11 | min.count max.count maf conseq col idx type report 12 | 1 1 FALSE FALSE black 1 val FALSE 13 | 1 1 FALSE TRUE lightpink 2 val TRUE 14 | 1 1 TRUE FALSE green 3 val FALSE 15 | 1 1 TRUE TRUE darkmagenta 4 val TRUE 16 | 2 NA FALSE FALSE blue 5 val FALSE 17 | 2 NA FALSE TRUE pink 6 val TRUE 18 | 2 NA TRUE FALSE red 7 val FALSE 19 | 2 NA TRUE TRUE cyan 8 val TRUE 20 | NA NA NA NA darkgrey 9 oddchr NA 21 | NA NA NA NA grey 10 evenchr NA 22 | -------------------------------------------------------------------------------- /tests/testthat/testmanhplusplot.R: -------------------------------------------------------------------------------- 1 | context("Running the manhplusplot on (small) dummy data") 2 | library(manhplot) 3 | 4 | test_that("Run the manhplusplot function with default params", { 5 | infile<-test_path("cad.add.160614_manhformat_chr1.txt.gz") 6 | configfile<-test_path("config.txt") 7 | snpfile<-test_path("5cad.add.160614.variants_chr1.txt") 8 | 9 | manhplusplot(infile = infile, outfile = file.path(tempdir(), "testpdf"),configfile = configfile, snpfile = snpfile) 10 | }) 11 | 12 | test_that("Run the manhplusplot function with output as tiff file", { 13 | infile<-test_path("cad.add.160614_manhformat_chr1.txt.gz") 14 | configfile<-test_path("config.txt") 15 | snpfile<-test_path("5cad.add.160614.variants_chr1.txt") 16 | 17 | manhplusplot(infile = infile, outfile = file.path(tempdir(), "testtiff"),configfile = configfile, snpfile = snpfile, drawastiff = T) 18 | }) 19 | --------------------------------------------------------------------------------