├── .Rbuildignore
├── DESCRIPTION
├── NAMESPACE
├── R
    └── manhattan.heatmap.v1.R
├── README.md
├── images
    ├── cells.png
    ├── genesbub.png
    ├── labels.png
    ├── reportbub.png
    ├── rsidbub.png
    └── zoom.png
├── inst
    └── extdata
    │   ├── 56cad.add.160614.variants.txt
    │   ├── cad.add.160614_manhformat.txt.gz
    │   └── config.txt
├── man
    ├── manhplot-package.Rd
    └── manhplusplot.Rd
├── perl
    ├── gen_snpfile.pl
    ├── glist-hg19
    ├── glist-hg38
    └── readme
└── tests
    ├── testthat.R
    └── testthat
        ├── 5cad.add.160614.variants_chr1.txt
        ├── cad.add.160614_manhformat_chr1.txt.gz
        ├── config.txt
        └── testmanhplusplot.R


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | archive
 4 | exampledata
 5 | default-plot.pdf
 6 | memory
 7 | thinning.R
 8 | README.md
 9 | release
10 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: manhplot
 2 | Type: Package
 3 | Title: The Manhattan++ Plot
 4 | Depends: R (>= 3.4.0)
 5 | Version: 1.1
 6 | Date: 2019-05-14
 7 | Author: Chris Grace <cgrace@well.ox.ac.uk>
 8 | Maintainer: Chris Grace <cgrace@well.ox.ac.uk>
 9 | Description: This plot integrates annotation into a manhattan plot. The plot is implemented as a heatmap, which is binned using -log10(p-value) and chromosome position. Annotation currently supported is minor allele frequency and gene function high impact variants.
10 | License: GPL (>= 2)
11 | RoxygenNote: 6.1.1
12 | Imports:
13 |   reshape2,
14 |   ggplot2,
15 |   ggrepel,
16 |   gridExtra
17 | Suggests:
18 |     R.utils,
19 |     testthat
20 | URL: https://github.com/cgrace1978/manhplot/
21 | BugReports: https://github.com/cgrace1978/manhplot/issues
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(manhplusplot)
 4 | import(ggplot2)
 5 | import(ggrepel)
 6 | import(grDevices)
 7 | import(gridExtra)
 8 | import(reshape2)
 9 | import(utils)
10 | 


--------------------------------------------------------------------------------
/R/manhattan.heatmap.v1.R:
--------------------------------------------------------------------------------
  1 | #' Generate the manhattan++ plot
  2 | #' 
  3 | #' @param infile Input GWAS summary statistics
  4 | #' @param outfile Output file prefix for the manhattan++ plot
  5 | #' @param configfile Configuration file
  6 | #' @param snpfile Table of SNPs to visualize
  7 | #' @param drawastiff If TRUE draw a Tiff file, if FALSE draw a PDF file
  8 | #' @param GWS Genome-wide significance p-value threshold (5E-8 by default)
  9 | #' @param FDR False discovery Rate pvalue threshold (1E-3 by default)
 10 | #' @param MAF Minor Allele Frequency threshold
 11 | #' @param chrname Column name for chromosome in GWAS infile
 12 | #' @param posname Column name for position in GWAS infile
 13 | #' @param pvalname Column name for pvalue in GWAS infile
 14 | #' @param frqname column name for allele frequency in GWAS infile
 15 | #' @param conseqname column name for variant annotation consequence in GWAS infile
 16 | #' @param showgenes If T shows known genes as bubbles on main manhattan plot, if F show positions of interest as bubbles
 17 | #' @param showrsids If showgenes is T, then show the rsids, rather than genes
 18 | #' @param pos.split The bin lengths for positions
 19 | #' @param pval.split The bin lengths for pvalues
 20 | #' @param max.pval The maximum pvalue to display
 21 | #' @details 
 22 | #' For file formats see github page \url{https://github.com/cgrace1978/manhplot}
 23 | #' @examples
 24 | #'
 25 | #'\donttest{
 26 | #' library(manhplot)
 27 | #' infile<-system.file("extdata","cad.add.160614_manhformat.txt.gz",package = "manhplot")
 28 | #' configfile<-system.file("extdata","config.txt", package = "manhplot")
 29 | #' snpfile<-system.file("extdata","56cad.add.160614.variants.txt", package = "manhplot")
 30 | #'
 31 | #' manhplusplot(infile = infile,outfile = file.path(tempdir(), "default-plot"), 
 32 | #'                configfile = configfile, snpfile = snpfile)
 33 | #' }
 34 | #' 
 35 | #' @author Chris Grace
 36 | #' @import utils ggplot2 reshape2 ggrepel gridExtra grDevices
 37 | #' @export
 38 | manhplusplot<-function(infile, outfile, configfile, snpfile, 
 39 |                    drawastiff=F,
 40 |                    GWS=5E-8, FDR=1E-3, MAF=0.05,
 41 |                    chrname="chr",posname="pos",pvalname="pvalue",
 42 |                    frqname="maf",conseqname="conseq",
 43 |                    showgenes=F,showrsids=F,
 44 |                    pos.split=3E6,pval.split=0.125,max.pval=20){
 45 | 
 46 | ## Dummy bindings for the names used inside aes(): silences the R CMD check "no visible binding" note.
 47 | marker<-NULL; posidx<-NULL; val<-NULL; pval<-NULL; pos<-NULL; pvalidx<-NULL
 48 |   
 49 | ## Internal drawing parameters (not exposed to the caller).
 50 | pval.units<-5 ## spacing of the labelled ticks on the -log10(p) axis
 51 | textsize<-2 ## text size used for labels and table entries
 52 | 
 53 | rebuild<-TRUE ## when FALSE the matrix-building section is skipped
 54 | ## Timestamped log file name; only written to when debugflag is TRUE.
 55 | debugfile<-paste0("manh_",format(Sys.time(), "%d-%m-%y.%H-%M-%S"),".log")
 56 | debugflag<-FALSE ## turn cell-level logging on / off
 57 | 
 58 | ### Assert statement: abort with an error when a condition does not hold
 59 | ## cond - condition to test (a single TRUE is required to pass)
 60 | ## mess - message reported when the condition fails.
 61 | waitifnot <- function(cond, mess) {
 62 |   if (!isTRUE(cond)) { ## NA and FALSE both count as a failure
 63 |     ## Raise an error instead of spinning in an endless loop, which hung the
 64 |     ## R session at full CPU and could only be ended by interrupting it.
 65 |     stop(mess, "\n", deparse(substitute(cond)), " is not TRUE", call. = FALSE)
 66 |   }
 67 | }
 68 | 
 69 | ### Map a -log10(p-value) onto the y coordinate of the heatmap
 70 | ## log - -log10(p-value) to convert (vectors are converted element by element); 0 maps to 0.5
 71 | log10.index<-function(log){
 72 |   ## number of pval.split wide bins below the value, shifted by half a cell
 73 |   (log / pval.split) + 0.5
 74 | }
 75 | 
 76 | ### Map a raw p-value onto the y coordinate of the heatmap
 77 | ## p.val - p-value (or vector of p-values) to convert
 78 | p.val.index<-function(p.val){
 79 |   ## -log10 transform first, then reuse the log10.index conversion
 80 |   neg.log<- -1*log10(p.val)
 81 |   return(log10.index(neg.log))
 82 | }
 83 | 
 84 | ### Find the exact cell where the p-value is within on the heatmap
 85 | ## p.val - p value
 86 | ## pval.chunks - table of cell bounds for -log10(pvals); defaults to pvals.cells.index
 87 | p.val.cell<-function(p.val,pval.chunks=pvals.cells.index){
 88 |   gws<--1*log10(p.val) ## convert the p-value to -log10
 89 |   idx<-log10.cell(gws,pval.chunks) ## look the cell up with log10.cell, forwarding the table instead of dropping it
 90 |   return(idx)
 91 | }
 92 | 
 93 | ### Find the exact cell where the -log10(p-value) is within on the heatmap
 94 | ## log - -log10(pval); a value at or above the start of the last bin maps to the last cell, a negative value to no cell
 95 | ## pval.chunks - cell table with columns id and LP (ascending lower bounds); defaults to pvals.cells.index
 96 | log10.cell<-function(log, pval.chunks=pvals.cells.index){
 97 |   idx<-pval.chunks$id[findInterval(log, pval.chunks$LP)] ## index of the last lower bound that is <= log
 98 |   return (idx)
 99 | }
100 | 
101 | ### Find the exact cell which the position is within on the heatmap
102 | ## chr - chr for index
103 | ## position - position for index (chunk bounds are inclusive; a bound shared by two chunks resolves to the later chunk)
104 | ## chr.chunks - datastructure containing cell locations for chromosomes and positions.
105 | chrpos.cell<-function(chr, position, chr.chunks){
106 |   hits<-chr.chunks$posid[chr.chunks$chr==chr & chr.chunks$s <= position & chr.chunks$e >= position]
107 |   if(length(hits) == 0){warning("no heatmap cell for chr ",chr," position ",position); return (NA)} ## NA rather than a zero-length value that breaks `x[i]<-`
108 |   return (hits[length(hits)])
109 | }
110 | 
111 | if(rebuild==T){## rebuild the heatmap matrix and other datastructures if the flag is set
112 |   message("Rebuilding matrix")
113 |   ## read the gwas results
114 |   message("Reading the GWAS results...")
115 |   d<-read.table(infile, header=TRUE)
116 |   
117 |   ## map the user-specified column names onto the internal ones
118 |   ## (when one column matches several user names the last mapping wins).
119 |   user.names<-c(chrname,posname,pvalname,frqname,conseqname)
120 |   internal.names<-c("chr","pos","Pvalue","FRQ","conseq")
121 |   names<-names(d)
122 |   for(i in seq_along(user.names)){
123 |     names[names(d) == user.names[i]]<-internal.names[i]
124 |   }
125 |   names(d)<-names
126 |   
127 |   ## check data file has the correct headers; report the first missing one
128 |   ## and return NULL without plotting.
129 |   correct.names<-internal.names
130 |   absent<-correct.names[!correct.names%in%names(d)]
131 |   if(length(absent) > 0){
132 |     message(paste0(absent[1], " not found in data file!\n"))
133 |     return()
134 |   }
135 |   
136 |   ## drop variants that cannot be placed on the heatmap: a missing chr or
137 |   ## Pvalue compares as NA and would otherwise leak NA rows into every slice.
138 |   d<-d[!is.na(d$pos) & !is.na(d$chr) & !is.na(d$Pvalue),]
139 |   ## align the frequencies to the minor allele; NA frequencies are left
140 |   ## untouched (an NA in the index makes the subscripted assignment fail).
141 |   flip<-!is.na(d$FRQ) & d$FRQ > 0.5
142 |   d$FRQ[flip]<-(1-d$FRQ[flip])
143 |   
144 |   ## check that the chromosome column is in correct format.
145 |   waitifnot(is.numeric(d$chr), "chr column in gwas data should be numeric, please check if encoded X, or with `chr` prefix")
146 |   ## Support for the X chromosome?
147 |   lastchr<-max(unique(d$chr))
148 |   
149 |   ## Load the table of SNPs to annotate (tab separated, with a header row).
150 |   snp.info<-read.table(snpfile, sep="\t", header=TRUE)
151 |   
152 |   ## Every one of these columns must be present; for the first absent one a
153 |   ## message is printed and the function returns NULL.
154 |   correct.names<-c("markername","gene","chr","pos","eaf","OR","Pvalue","novel")
155 |   absent<-correct.names[!correct.names%in%names(snp.info)]
156 |   if(length(absent) > 0){
157 |     message(paste0(absent[1], " not found in SNP file!\n"))
158 |     return()
159 |   }
160 |   
161 |   snp.info<-snp.info[order(snp.info$chr, snp.info$pos),] ## ascending by chromosome, then position
162 |   
163 |   ## read the config data (the first 10 lines of the file are skipped; the header row follows them)
164 |   config<-read.table(configfile,sep="\t", header =TRUE,stringsAsFactors = FALSE, skip=10)
165 |   
166 |   ## generate the pvalue bins (using the pval.split parameter)
167 |   pvals<-seq(from=0, to=max.pval, by=pval.split) # max(-log10(d$Pvalue))
168 |   pvals.cells.index<-data.frame(id=seq_along(pvals),LP=pvals,UP=c(pvals[-1],Inf)) ## last bin is open ended, like the last matrix row
169 |   
170 |   final<-matrix(0, nrow = length(pvals), ncol = 0) ## per-chromosome matrices are column-bound onto this
171 |   
172 |   chr.matrix.len<-as.data.frame(matrix(nrow=lastchr,ncol=3))
173 |   names(chr.matrix.len)<-c("length", "cumm", "mid")
174 |   
175 |   pos.chunks<-as.data.frame(matrix(nrow=0,ncol=4))
176 |   names(pos.chunks)<-c("posid","chr","s", "e")
177 |   
178 |   pos.idx<-0 ## number of position cells emitted so far
179 |   
180 |   idx.count<-vector(mode="numeric",length=length(config$idx)+1) ## cells per category; the extra slot is the "remaining" category
181 |   max.cellcount<-0
182 |   
183 |   if(debugflag==TRUE){ ## Generate the log file if the flag is set.
184 |     logfile<-file(debugfile,open="a")
185 |     ## columns for the log output table
186 |     cat("chr","st","en","log10p","idx","\n", file=logfile,append = TRUE,sep="\t")
187 |   }
188 |   ## locations to be labelled on the plot: an empty table with typed columns
189 |   ## (the constructors are called; bare function objects cannot become columns).
190 |   pos.interest<-data.frame(marker=character(),log10pval=numeric(),chr=numeric(),pos=numeric(),col=character())
191 |   
192 |   for(chr in 1:lastchr){
193 |     message(paste("Processing chromosome ",chr,"\r", sep=""),appendLF = FALSE)
194 |     
195 |     ## extract chromosome specific data from GWAS input (which() drops rows whose chr is NA)
196 |     chr.slice<-d[which(d$chr==chr),]
197 |     
198 |     ## Generate the chromosome position bins (using the pos.split parameter)
199 |     chunks<-seq(from=min(chr.slice$pos), to=max(chr.slice$pos), by=pos.split)
200 |     chunks[length(chunks)+1]<-max(chr.slice$pos)
201 |     
202 |     ## create the matrix for this chromosome (pvals * position)
203 |     mdat<-matrix(0, nrow = length(pvals), ncol = length(chunks)-1)
204 |     
205 |     for (i in 1:(length(chunks)-1)){
206 |       ## get slice of gwas input for each of the position bins; bins are [start, end), except the last one which also keeps the variant at max(pos)
207 |       slice<-chr.slice[chr.slice$pos >= chunks[i] & (chr.slice$pos < chunks[i+1] | (i == length(chunks)-1 & chr.slice$pos == chunks[i+1])),]
208 |       
209 |       for (j in 1:length(pvals)){
210 |         ## get slice of gwas input for each of the pvalue bins (and position bins); which() keeps NA p-values out of the bins
211 |         if(j == length(pvals)){ ## last element in pval array - include all variants gt then this
212 |           p.val.slice<-slice[which(-log10(slice$Pvalue) >= pvals[j]),]  
213 |         }
214 |         else{ ## otherwise slice between the current and next values in the pval array
215 |           p.val.slice<-slice[which(-log10(slice$Pvalue) >= pvals[j] & -log10(slice$Pvalue) < pvals[j+1]),]
216 |         }      
217 |         
218 |         ## determine the number of variants in the slice
219 |         len<-dim(p.val.slice)[1]
220 |         idx<-0
221 |         
222 |         ## determine the number of variants in the slice, which have the HIGH consequence flag set to 1
223 |         conseq.len<-sum(p.val.slice$conseq==1, na.rm=TRUE) ## NA flags are not counted
224 |         ## determine the number of variants in the slice, which have MAF at or below the MAF threshold
225 |         maf.len<-sum(p.val.slice$FRQ<=MAF, na.rm=TRUE) ## NA frequencies are not counted
226 |         
227 |         if(len == 0){idx<-0} ## the case with no variants - blank cell.
228 |         else{
229 |           ## len - number of snps in the bin
230 |           ## conseq.len - does it include any high impact variants
231 |           ## maf.len - does it include any variants at or below the MAF threshold
232 |           
233 |           ## this defines the region which should be greyed out
234 |           if(pvals[j] <= -log10(FDR)){
235 |             ## determine whether the chromosome is odd and even
236 |             if((chr %% 2) != 0){idx<-config$idx[config$type=="oddchr"]}
237 |             else{idx<-config$idx[config$type=="evenchr"]}
238 |           }
239 |           else{
240 |             for(k in 1:length(config$idx)){
241 |               if(config$type[k] == "val"){ ## Check the config is a valid type.
242 |                   len.chk<-FALSE
243 |                   conseq.chk<-FALSE
244 |                   maf.chk<-FALSE
245 |                   
246 |                   ## check counts
247 |                   if(len >= config$min.count[k]){ ## check that the current cell length is gt or eq than the config min count
248 |                     if(is.na(config$max.count[k])){ ## if the config max is NA then accept length condition
249 |                       len.chk<-TRUE
250 |                     }
251 |                     else if(len <= config$max.count[k]){ ## if the current cell length is lt or eq the config max count then accept the length condition
252 |                       len.chk<-TRUE
253 |                     }
254 |                   }
255 |                   
256 |                   ## check HIGH impact
257 |                   if(config$conseq[k] == TRUE && conseq.len > 0){conseq.chk<-TRUE} ## accept if high impact is active in config and there are 1 or more high impact variants within the cell
258 |                   else if(config$conseq[k] == FALSE && conseq.len == 0){conseq.chk<-TRUE} ## accept if high impact is inactive and there are 0 high impact variants within the cell.
259 |                   
260 |                   ## check MAF
261 |                   if(config$maf[k] == TRUE && maf.len > 0){maf.chk<-TRUE} ## accept if MAF is active and there is one or more variant at or below the threshold in the cell
262 |                   else if(config$maf[k] == FALSE && maf.len == 0){maf.chk<-TRUE} ## accept if MAF is inactive and there are 0 variants at or below the threshold in the cell
263 |                   
264 |                   if(len.chk==TRUE && conseq.chk==TRUE && maf.chk==TRUE){ ## if all three clauses are correct then accept the idx for the config and exit the for loop
265 |                     idx<-config$idx[k]
266 |                     
267 |                     if(config$report[k]==TRUE){ ## if reporting is active for the config then add an entry to the pos.interest table.
268 |                       tmp.df<-data.frame(
269 |                         marker=paste(idx,sep=""),
270 |                         log10pval=pvals[j],chr=chr,pos=(chunks[i]+1),
271 |                         col=config$col[k])
272 |                       
273 |                       pos.interest<-rbind(pos.interest,tmp.df)
274 |                     }
275 |                     break ## found config which fulfils cell criteria accept and exit group.
276 |                   }
277 |                   
278 |               }
279 |             }
280 |             
281 |             if(idx == 0){ ## if no idx was found in the annotations, generate a remaining idx (max idx in annotations + 1)
282 |               idx<-max(config$idx)+1; ## assign to others
283 |             }
284 |             
285 |           if(len > max.cellcount){max.cellcount<-len} ## assign the maximum cell count if length of current cell is gt than current
286 |           }
287 |         }
288 | 
289 |         idx.count[idx]<-(idx.count[idx] + 1) # increment the block count for each index (a no-op for blank cells, idx 0)
290 |         
291 |         mdat[j,i]<-idx  
292 |     
293 |         if(debugflag==TRUE){ ## log information for current cell.
294 |           
295 |           oddchridx<-config[config$type=="oddchr",]$idx
296 |           evenchridx<-config[config$type=="evenchr",]$idx
297 |           
298 |           if(idx != oddchridx && idx != evenchridx && idx != 0){ ## dont log the greyed out or blank cells (joined with ||, this test was always TRUE)
299 |             cat(chr,chunks[i],chunks[i+1],pvals[j],idx,"\n", file=logfile,append = TRUE,sep="\t")
300 |           }
301 |         }
302 |         
303 |       }
304 |     }
305 |     
306 |     ## cell sizes of the chromosomes
307 |     chr.matrix.len$length[chr]<-dim(mdat)[2]
308 |     chr.matrix.len$cumm[chr]<-dim(mdat)[2]+ dim(final)[2]
309 |     chr.matrix.len$mid[chr]<-chr.matrix.len$cumm[chr] - (chr.matrix.len$length[chr] / 2)
310 |     
311 |     ## bind to the final matrix
312 |     final<-cbind(final,mdat)
313 |     
314 |     tmp.chunks<-as.data.frame(matrix(nrow=length(chunks)-1,ncol=4))
315 |     names(tmp.chunks)<-c("posid","chr","s", "e")
316 |       
317 |     for(j in 1:dim(tmp.chunks)[1]){ ## assign position chunk indexes to table
318 |         #print(j)
319 |         tmp.chunks$posid[j]<-pos.idx+j
320 |         tmp.chunks$chr[j]<-chr
321 |         tmp.chunks$s[j]<-chunks[j]
322 |         tmp.chunks$e[j]<-chunks[j+1]
323 |     }
324 |     
325 |     ## the cell id for each of the position chunks in the heatmap
326 |     pos.chunks<-rbind(pos.chunks,tmp.chunks)
327 |     pos.idx<-pos.idx+dim(tmp.chunks)[1]
328 |   }
329 |   
330 |   if(debugflag==TRUE){ ## close the log file if debugging is active.
331 |     close(logfile) 
332 |   }
333 |   
334 |   snpcells<-numeric(length(snp.info$markername)) ## heatmap position cell of every SNP, in snp.info row order
335 |   
336 |   for (i in seq_along(snp.info$markername)){ ## assign a position cell for each of the variants in the SNP information table; an empty table skips the loop (1:0 would run it twice)
337 |     snpcells[i]<-chrpos.cell(snp.info$chr[i],snp.info$pos[i],pos.chunks)
338 |   }
339 |   
340 |   message("\nMelting matrix...")
341 |   m<-melt(final) ## long format: one row per heatmap cell
342 |   names(m)<-c("pval","pos", "val")
343 | 
344 |   if(nrow(pos.interest) > 0){  ## reported cells exist: work out where their labels go on the heatmap
345 |     pos.interest$pvalidx<-log10.index(pos.interest$log10pval)
346 |     pos.interest$pos.idx<-rep(-1, nrow(pos.interest))
347 |     
348 |     for (k in seq_len(nrow(pos.interest))){ ## look up the position cell of each reported cell
349 |       pos.interest$pos.idx[k]<-chrpos.cell(pos.interest$chr[k],pos.interest$pos[k],pos.chunks)
350 |     }
351 |   }
352 | 
353 | } ## END OF REBUILD SECTION
354 | 
355 | 
356 | ## round the largest cell count up to the next multiple of 100; used as the upper bound in legend labels
357 | peak.val<-ceiling(max.cellcount / 100)*100
358 | 
359 | col.discrete<-c("white",config$col) ## white first, then one colour per config row
360 | 
361 | col.text<-vector(mode="character",length=length(config$type))
362 | 
363 | ## Build the text for the legend using the information in the config table
364 | for(k in seq_along(col.text)){ 
365 |   if(config$type[k]=="val"){
366 |     
367 |     max.count<-config$max.count[k]
368 |     
369 |     if(is.na(max.count)){ ## if the max count is assigned to NA then the highest cell count (with approp ceiling) is used
370 |       max.count<-peak.val
371 |     }
372 |     
373 |     if(config$min.count[k]==1 &&max.count==1){ ## a 1 - 1 range is printed as a single number
374 |       col.text.tmp<-paste(config$idx[k],") ",
375 |                           config$min.count[k],
376 |                           sep="")
377 |     }
378 |     else{
379 |       col.text.tmp<-paste(config$idx[k],") ",
380 |                          config$min.count[k]," - ",
381 |                          max.count,
382 |                          sep="")
383 |     }
384 |   
385 |   if(config$conseq[k]==FALSE && config$maf[k]==FALSE){ ## if both MAF and HIGH impact are disabled - do not add any text
386 |     ##col.text[k]<-col.text.tmp
387 |   }
388 |   else if(config$conseq[k] == TRUE && config$maf[k]==TRUE){ ## if both MAF and HIGH impact are active - print BOTH
389 |     col.text.tmp<-paste(col.text.tmp," (BOTH)",sep="")
390 |   }
391 |   else if(config$conseq[k] == TRUE && config$maf[k] == FALSE){ ## if only HIGH impact is active - print HIGH impact.
392 |     col.text.tmp<-paste(col.text.tmp," (HIGH impact)",sep="")
393 |   }
394 |   else if(config$conseq[k] == FALSE && config$maf[k] == TRUE){ ## if only MAF is active - print MAF
395 |     col.text.tmp<-paste(col.text.tmp," (MAF < ",MAF,")",sep="")
396 |   }
397 |   else{ ## all possible scenarios are covered - should not get here
398 |     warning("Should not get here!\n")  
399 |     stopifnot(FALSE)
400 |   }
401 |     col.text[k]<-paste(col.text.tmp," (",idx.count[config$idx[k]],")",sep="") ## idx.count is indexed by config idx, not by row number
402 |   }
403 |   else{ ## otherwise print the config type.
404 |     col.text[k]<-config$type[k]
405 |   }
406 | }
407 | 
408 | col.brks<-1:length(config$type) ## one legend break per config row
409 | top.val<-max(m$val) ## highest category index present in the matrix
410 | if(top.val > max(config$idx)){ ## some cells matched no config row: append the "Remaining" category
411 |   col.brks<-append(col.brks,top.val)
412 |   col.discrete<-append(col.discrete,"orange") ## colour of the remaining category is fixed
413 |   col.text<-append(col.text,paste0("Remaining (", idx.count[top.val],")"))
414 | }
415 | 
416 | pval.seq<-seq(from=pval.units,to=max.pval,by=pval.units) ## labelled -log10(p) values
417 | 
418 | y.breaks<-c(0.5,log10.index(pval.seq))
419 | y.labels<-c("",pval.seq)
420 | 
421 | if(showgenes==FALSE){ ## without the show genes flag every SNP is treated as novel, so all of them go into the table
422 |   snp.info$novel<-TRUE
423 | }
424 | 
425 | snp.info.novel<-snp.info[snp.info$novel==TRUE,]
426 | snp.info.known<-snp.info[snp.info$novel==FALSE,]
427 | 
428 | ## the core heatmap - a ggplot2 tile plot of the melted matrix (x = position cell, y = p-value cell), flipped by coord_flip below
429 | main.core<-ggplot(data=m, aes(x=pos,y=pval)) + 
430 |   geom_tile(aes(fill = val))+ ## one tile per cell, filled by its category index
431 |   theme(legend.position="left",legend.key.size=unit(0.5,"line"),
432 |         legend.title=element_text(size=5),
433 |         legend.text=element_text(size=5)) +
434 |   geom_hline(yintercept=p.val.cell(GWS)+0.5, linetype="dashed") + ## dashed line half a cell past the cell holding the GWS threshold
435 |   geom_hline(yintercept=p.val.cell(FDR)+0.5, linetype="dashed") + ## dashed line half a cell past the cell holding the FDR threshold
436 |   scale_fill_gradientn(colours = col.discrete, 
437 |         guide="legend", breaks=col.brks, 
438 |         labels=col.text,name = "Variant Count") + 
439 |   scale_y_continuous("-log10(p)",labels=y.labels, 
440 |         breaks=y.breaks,
441 |         expand=c(0,0),trans="reverse",position="right") +
442 |   theme(panel.grid.major =element_blank(), panel.grid.minor = element_blank())   + coord_flip(expand=T) + 
443 |   theme(plot.margin = unit(c(0,0,0,0), "cm")) +
444 |   scale_x_continuous("", labels=rep("",(lastchr*2)), 
445 |         breaks=c(chr.matrix.len$mid,chr.matrix.len$cumm),
446 |         expand=c(0,0),trans="reverse",position="top") +
447 |   theme(axis.ticks.y = element_blank()) +
448 |   theme(axis.line.x = element_line(color="black", size = 0.5))
449 |   
450 | ## if there is one or more known SNPs in the table then label the manhattan plot with them.
451 | if(nrow(snp.info.known) > 0){ 
452 |   repel.df<-as.data.frame(matrix(nrow=nrow(snp.info.known),ncol=3))
453 |   names(repel.df)<-c("marker","pvalidx","posidx")
454 |   
455 |   ## label with the gene name, or with the marker name when showrsids is set
456 |   repel.df$marker<-snp.info.known$gene
457 |   if(showrsids==TRUE){repel.df$marker<-snp.info.known$markername}
458 |   
459 |   ## y coordinate from the SNP p-value, x coordinate from the position cell looked up for the SNP
460 |   repel.df$pvalidx<-p.val.index(snp.info.known$Pvalue) 
461 |   repel.df$posidx<-snpcells[snp.info$novel==FALSE] 
462 |   
463 |   ## labels beyond max.pval are pinned to the index of max.pval; pmin leaves NA
464 |   ## values alone, where a logical-index assignment containing NA would fail
465 |   repel.df$pvalidx<-pmin(repel.df$pvalidx, log10.index(max.pval))
466 |   
467 |   final.repel.plot<-main.core+
468 |     geom_label_repel(data=repel.df, aes(posidx,pvalidx, label=marker),
469 |           size=textsize, force=1, nudge_y = 10,nudge_x=10,
470 |           segment.colour="black", min.segment.length = 0,
471 |           segment.size=0.25, seed=500, max.iter = 5000,
472 |           point.padding = NA)
473 | }
474 | 
475 | ## if there are any positions of interest, label them on the manhattan plot
476 | if(dim(pos.interest)[1]> 0){ 
477 |   main.core<-main.core+ geom_label_repel(data=pos.interest, aes(pos.idx,pvalidx, label=marker),
478 |         size=textsize, force=5, nudge_y = 10,nudge_x=10,
479 |         segment.colour="black", min.segment.length = 0,
480 |         segment.size=0.25, seed=500, max.iter = 5000,
481 |         point.padding = NA,color="black") ## the segment colour is set once above; the US spelling named the same parameter a second time
482 | }
483 | 
484 | ## Rescale a coordinate given on a 0-20 scale to the 0-max.pval range and return its heatmap index.
485 | table.pos<-function(index){
486 |   scaled<-max.pval * (index / 20)
487 |   log10.index(scaled)
488 | }
489 | 
490 | ## Column anchors of the table (hard coded on the 0-20 scale), in the order SNP, EAF, OR, p-value, Gene
491 | title.pos<-table.pos(c(17, 13, 11.5, 10, 7))+1
492 | 
493 | ## Generate table for the novel genes
494 | ## Start with a completely blank plot (table1): same data and axes as the heatmap, every tile white
495 | table1<-ggplot(data=m, aes(x=pos,y=pval)) + 
496 |   geom_tile(aes(fill = rep(0,dim(m)[1]))) +
497 |   scale_x_continuous("", labels=rep("",(lastchr*2)),
498 |         breaks=c(chr.matrix.len$mid,chr.matrix.len$cumm),
499 |         expand=c(0,0),trans="reverse",position="top") + ## same breaks as the heatmap x axis, labels blanked
500 |   scale_y_continuous("",labels=rep("",length(y.labels)),
501 |         breaks=y.breaks,
502 |         expand=c(0,0),trans="reverse",position="right") +
503 |   coord_flip(expand=T) + 
504 |   scale_fill_gradientn(colours = c("white","white"), 
505 |         guide=FALSE, breaks=c(0,1), 
506 |         labels=c("0","1"),name = "")  +
507 |   theme(axis.ticks.y = element_blank()) +
508 |   theme(axis.ticks.x = element_blank()) +
509 |   theme(panel.grid.major = element_blank(),
510 |         panel.grid.minor = element_blank(),
511 |         panel.border = element_blank(),
512 |         panel.background = element_blank()) +
513 |   theme(plot.margin = unit(c(0,0,0,0), "cm"))
514 | 
515 | ## read the x range back from the built scales and negate it (presumably because the reversed scale stores negated values - confirm)
516 | lim<-layer_scales(table1)
517 | xmin<-lim$x$range$range[2]*-1
518 | xmax<-lim$x$range$range[1]*-1
519 | 
520 | segment.indexes<-c(table.pos(17.5)+1,table.pos(19)+1,table.pos(20)+1) ## y coordinates of the connector ends: table row, mid point, axis
521 | chr.num.pos<-(table.pos(19.75)+1) ## y coordinate of the chromosome number labels
522 | brks.pos<-(table.pos(19.7)+1) ## y coordinate where the x axis break marks start
523 | 
524 | text.pos<-seq(from=xmin, to=xmax,length.out=length(snp.info.novel$markername)+1) ## evenly spaced x coordinates: one for the header plus one per novel SNP
525 | text.pos1<-text.pos[2:length(text.pos)] ## x coordinates of the SNP rows
526 | title.pos1<-text.pos[1] ## x coordinate of the header row
527 | 
528 | if(dim(snp.info.novel)[1] > 0){
529 |   ## add the SNP information to the table: one text row per novel SNP, plus connectors, chromosome labels and the x axis
530 |   ## Can add additional columns to the table here. NOTE: the chromosome labels and axis below are only drawn in this branch.
531 |   table2<-table1+
532 |     annotate("text", x = text.pos1,
533 |              y = title.pos[1], label = as.character(snp.info.novel$markername),
534 |              angle=0,size=textsize, hjust=0) + ## markername
535 |     annotate("text", x = text.pos1, 
536 |              y = title.pos[2], label = as.character(format(round(snp.info.novel$eaf,2),nsmall=2)),
537 |              angle=0,size=textsize, hjust=0) + ## EAF, rounded to 2 decimals
538 |     annotate("text", x = text.pos1, 
539 |              y = title.pos[3], label = as.character(format(round(snp.info.novel$OR,2),nsmall=2)),
540 |              angle=0,size=textsize, hjust=0) + ## OR, rounded to 2 decimals
541 |     annotate("text", x = text.pos1, 
542 |              y = title.pos[4], label = as.character(formatC(snp.info.novel$Pvalue, format = "E", digits = 2)),
543 |              angle=0,size=textsize, hjust=0) + ## P-value in scientific notation
544 |     annotate("text", x = text.pos1,
545 |              y = title.pos[5], label =as.character(snp.info.novel$gene),
546 |              angle=0,size=textsize, hjust=0,fontface = 'italic') + ## gene column, in italics
547 |     annotate("segment", x = text.pos1,
548 |              xend = snpcells[snp.info$novel==TRUE], y = segment.indexes[1], yend = segment.indexes[2],
549 |              colour = "blue", linetype="dashed", size=0.5) +  ## segment from midpoint to table row
550 |     annotate("segment", x = snpcells[snp.info$novel==TRUE], 
551 |              xend = snpcells[snp.info$novel==TRUE], y = segment.indexes[2], yend = segment.indexes[3],
552 |              colour = "blue", linetype="dashed", size=0.5) + ## segment from axis to mid point
553 |     annotate("text", x = chr.matrix.len$mid,
554 |              y = chr.num.pos, label = as.character(1:lastchr), 
555 |              angle=0,size=3.5, hjust=0)+ ## chromosome labels, placed at the middle cell of each chromosome
556 |     annotate("segment", x = chr.matrix.len$cumm, 
557 |              xend = chr.matrix.len$cumm, y = brks.pos, yend = segment.indexes[3], 
558 |              colour = "black", linetype="solid") + ## x axis breaks, at the last cell of each chromosome
559 |     annotate("segment", x = 0,  
560 |              xend = xmax, y = segment.indexes[3], yend = segment.indexes[3],
561 |              colour = "black", linetype="solid")  ## xaxis solid line
562 | }
563 | else{ ## no novel SNPs: the table stays blank
564 |   table2<-table1
565 | }
566 | 
567 | ## add the title to the table: one bold header per column, all at the header x coordinate (title.pos1)
568 | ## Can add additional table headers here.
569 | final.table.plot<-table2 + 
570 |   annotate("text", x = title.pos1,
571 |            y = title.pos[1], label = "SNP",
572 |            angle=0,size=textsize, hjust=0,fontface = 'bold') +
573 |   annotate("text", x = title.pos1,
574 |            y = title.pos[2], label = "EAF",
575 |            angle=0,size=textsize, hjust=0,fontface = 'bold') +
576 |   annotate("text", x = title.pos1,
577 |            y = title.pos[3], label = "OR",
578 |            angle=0,size=textsize, hjust=0,fontface = 'bold') +
579 |   annotate("text", x = title.pos1,
580 |            y = title.pos[4], label = "p-value",
581 |            angle=0,size=textsize, hjust=0,fontface = 'bold') +
582 |   annotate("text", x = title.pos1,
583 |            y = title.pos[5], label = "Gene",
584 |            angle=0,size=textsize, hjust=0,fontface = 'bold') 
585 | 
586 | ## turn clipping off for the panel of the built table, so it can be merged with the heatmap.
587 | gt <- ggplot_gtable(ggplot_build(final.table.plot)) # build the plot and convert it to a gtable
588 | gt$layout$clip[gt$layout$name == "panel"] <- "off"
589 | 
590 | ## draw either as TIFF (drawastiff == T), or by default as PDF (both 8.27in x 11.69in).
591 | if(drawastiff==TRUE){
592 |   message(paste("\nGenerated tiff file: ", 
593 |                 outfile,".tif\nWidth = 8.27in Height = 11.69in\nDefault working directory: ", 
594 |                 getwd(), sep=""))
595 |   tiff(filename = paste(outfile,".tif",sep=""),width = 8.27,height = 11.69, units="in",res=300)
596 | } else{
597 |   message(paste("\nGenerated pdf file: ", 
598 |                 outfile,".pdf\nWidth = 8.27in Height = 11.69in\nDefault working directory: ", 
599 |                 getwd(), sep=""))
600 |   pdf(paste(outfile,".pdf",sep=""),width = 8.27,height = 11.69,onefile = FALSE)
601 | }
602 | plot.dev<-dev.cur(); on.exit(if(plot.dev %in% dev.list()){dev.off(plot.dev)}, add=TRUE) ## close the device even when drawing fails; a no-op once dev.off below has run
603 | ## by default plot the heatmap with regions of interest bubbles (showgenes == FALSE)
604 | final.plot<-main.core
605 | 
606 | ## if show genes flag is T then add repel gene labels from the SNP list to the heatmap plot
607 | if(showgenes==TRUE && dim(snp.info.known)[1] > 0){ 
608 |   final.plot<-final.repel.plot
609 | }
610 | 
611 | ## hard code variables for positions of two plots on qplot (they overlap between x = 6.7 and x = 7)
612 | manh.max<-7
613 | annot.min<-6.7
614 | 
615 | ## merge the heatmap and the table plots together: both are placed as grobs on a blank 10 x 10 canvas
616 | print(qplot(1:10,1:10,colour=I("white")) +
617 |   annotation_custom(grob=ggplotGrob(final.plot), xmin=0.5,xmax=manh.max, ymin=1,ymax = 10) +
618 |   annotation_custom(grob=gt, xmin=annot.min, xmax=10.5, ymin=1,ymax = 10) +
619 |   theme(axis.title.x=element_blank(),
620 |         axis.text.x=element_blank(),
621 |         axis.ticks.x=element_blank()) +
622 |   theme(axis.title.y=element_blank(),
623 |         axis.text.y=element_blank(),
624 |         axis.ticks.y=element_blank()) +
625 |   theme(panel.grid.major = element_blank(),
626 |         panel.grid.minor = element_blank()) +
627 |   theme(plot.background = element_rect(fill = 'white', colour = 'white')) +
628 |   theme(panel.background = element_rect(fill = 'white', colour = 'white')))
629 | ## close the device to flush the file; kept as the last statement so the (invisible) return value is unchanged
630 | dev.out<-dev.off()
631 | }
632 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MANHATTAN++
 2 | 
 3 | MANHATTAN++ is software to generate a transposed manhattan heatmap, implemented in R.
 4 | 
 5 | ## Getting Started
 6 | 
 7 | You need to install the latest version of [R](https://www.r-project.org/). The R package runs on Windows and Linux; you must specify the paths to the files you are using as input and output.
 8 | 
 9 | To install the software from the GIT repository:
10 | ```
11 | install.packages("devtools")
12 | library(devtools)
13 | 
14 | install_github("cgrace1978/manhplot", dependencies = T, force = T)
15 | ```
16 | 
17 | The following command will run the plot with default data in the package. The pdf (test.pdf) will be created in the current working directory in R (This can be viewed using the getwd() command):
18 | ```
19 | library(manhplot)
20 | 
21 | infile<-system.file("extdata","cad.add.160614_manhformat.txt.gz",package = "manhplot")
22 | configfile<-system.file("extdata","config.txt", package = "manhplot")
23 | snpfile<-system.file("extdata","56cad.add.160614.variants.txt", package = "manhplot")
24 | 
25 | ## Run manhattan++ with the default parameters and files included in the package
26 | manhplusplot(infile = infile,outfile = "test", configfile = configfile, snpfile = snpfile)
27 | ```
28 | For more information on using manhattan++ please visit the [Manhattan++ wiki](https://github.com/cgrace1978/manhplot/wiki/home)
29 | 
30 | ### Citation
31 | To use Manhattan++ please cite the following paper:
32 | 
33 | Grace *et al*
34 | 
35 | Manhattan++: displaying genome-wide association summary statistics with multiple annotation layers
36 | 
37 | BMC Bioinformatics 2019; 20(1):610
38 | 
39 | PubMed: [31775616](https://www.ncbi.nlm.nih.gov/pubmed/31775616)
40 | 


--------------------------------------------------------------------------------
/images/cells.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/cells.png


--------------------------------------------------------------------------------
/images/genesbub.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/genesbub.png


--------------------------------------------------------------------------------
/images/labels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/labels.png


--------------------------------------------------------------------------------
/images/reportbub.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/reportbub.png


--------------------------------------------------------------------------------
/images/rsidbub.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/rsidbub.png


--------------------------------------------------------------------------------
/images/zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/images/zoom.png


--------------------------------------------------------------------------------
/inst/extdata/56cad.add.160614.variants.txt:
--------------------------------------------------------------------------------
 1 | markername	gene	chr	pos	eaf	OR	Pvalue	novel
 2 | rs11206510	PCSK9	1	55496039	0.847627	1.08	2.34E-08	FALSE
 3 | rs9970807	PPAP2B	1	56965664	0.915097	1.13	5E-14	FALSE
 4 | rs7528419	SORT1	1	109817192	0.78582	1.12	1.97E-23	FALSE
 5 | rs6689306	IL6R	1	154395946	0.447545	1.06	2.6E-09	FALSE
 6 | rs67180937	MIA3	1	222823743	0.663052	1.08	1.01E-12	FALSE
 7 | rs16986953	AK097927	2	19942473	0.104706	1.09	1.45E-08	FALSE
 8 | 2:21378433:D	APOB	2	21378433	0.745655	1.07	2.89E-08	FALSE
 9 | 2:44074126:D	ABCG5-ABCG8	2	44074126	0.744839	1.06	0.000000026	FALSE
10 | rs7568458	VAMP5-VAMP8-GGCX	2	85788175	0.448518	1.06	3.62E-10	FALSE
11 | rs17678683	ZEB2-ACO74093.1	2	145286559	0.087681	1.1	0.000000003	FALSE
12 | 2:203828796:I	WDR12	2	203828796	0.107909	1.15	2.15E-18	FALSE
13 | 3:138099161:I	MRAS	3	138099161	0.162797	1.08	2.89E-09	FALSE
14 | rs4593108	EDNRA	4	148281001	0.795349	1.07	8.82E-10	FALSE
15 | rs72689147	GUCY1A3	4	156639888	0.816978	1.07	6.07E-09	FALSE
16 | rs273909	SLC22A4-SLC22A5	5	131667353	0.116756	1.06	0.000124	FALSE
17 | rs6903956	ADTRP-C6orf105	6	11774583	0.354055	1	0.96	FALSE
18 | rs9349379	PHACTR1	6	12903957	0.431606	1.14	1.81E-42	FALSE
19 | rs17609940	ANKS1A	6	35034800	0.823672	1.03	0.03	FALSE
20 | rs56336142	KCNK5	6	39134099	0.807262	1.07	1.85E-08	FALSE
21 | rs12202017	TCF21*	6	134173151	0.699953	1.07	1.98E-11	FALSE
22 | rs55730499	SLC22A3-LPAL2-LPA	6	161005610	0.056243	1.37	5.39E-39	FALSE
23 | rs4252185	PLG	6	161123451	0.059661	1.34	1.64E-32	FALSE
24 | rs2107595	HDAC9	7	19049388	0.20047	1.08	8.05E-11	FALSE
25 | rs10953541	7q22	7	107244545	0.782727	1.05	0.0000102	FALSE
26 | rs11556924	ZC3HC1	7	129663496	0.686675	1.08	5.34E-11	FALSE
27 | rs264	LPL	8	19813180	0.852594	1.06	0.0000106	FALSE
28 | rs2954029	TRIB1	8	126490972	0.551395	1.04	0.00000261	FALSE
29 | rs2891168	9p21	9	22098619	0.488668	1.21	2.29E-98	FALSE
30 | rs2519093	ABO	9	136141870	0.190872	1.08	1.19E-11	FALSE
31 | rs2487928	KIAA1462	10	30323892	0.418221	1.06	4.41E-11	FALSE
32 | rs1870634	CXCL12	10	44480811	0.637485	1.08	5.55E-15	FALSE
33 | rs1412444	LIPA	10	91002927	0.369131	1.07	5.15E-12	FALSE
34 | rs11191416	CYP17A1-CNNM2-NT5C2	10	104604916	0.87253	1.08	4.65E-09	FALSE
35 | rs2128739	PDGFD	11	103673277	0.323536	1.07	7.05E-11	FALSE
36 | rs964184	ZNF259-APOA5-APOA1	11	116648917	0.184706	1.05	0.000056	FALSE
37 | rs2681472	ATP2B1 	12	90008959	0.201306	1.08	6.17E-11	FALSE
38 | rs3184504	SH2B3	12	111884608	0.421808	1.07	1.03E-09	FALSE
39 | rs9319428	FLT1	13	28973621	0.314419	1.04	0.0000713	FALSE
40 | rs11838776	COL4A1/A2	13	111040681	0.263277	1.07	1.83E-10	FALSE
41 | rs10139550	HHIPL1	14	100145710	0.423033	1.06	1.38E-08	FALSE
42 | rs4468572	ADAMTS7	15	79124475	0.585831	1.08	4.44E-16	FALSE
43 | rs17514846	FURIN-FES	15	91416550	0.440264	1.05	0.00000031	FALSE
44 | rs216172	SMG6	17	2126504	0.349997	1.05	0.000000507	FALSE
45 | rs12936587	RAI1-PEMT-RASD1	17	17543722	0.61133	1.03	0.000824	FALSE
46 | rs46522	UBE2Z	17	46988597	0.513272	1.04	0.0000184	FALSE
47 | rs56289821	LDLR	19	11188247	0.899622	1.14	4.44E-15	FALSE
48 | rs4420638	APOE-APOC1 	19	45422946	0.166036	1.1	7.07E-11	FALSE
49 | rs28451064	KCNE2 (gene desert)	21	35593827	0.121186	1.14	1.33E-15	FALSE
50 | rs17087335	REST-NOA1	4	57838583	0.214637	1.06	4.59E-08	TRUE
51 | rs3918226	NOS3	7	150690176	0.064515	1.14	1.69E-09	TRUE
52 | rs10840293	SWAP70	11	9751196	0.549821	1.06	1.28E-08	TRUE
53 | rs56062135	SMAD3	15	67455630	0.794271	1.07	4.52E-09	TRUE
54 | rs8042271	MFGE8-ABHD2	15	89574218	0.902282	1.1	3.68E-08	TRUE
55 | rs7212798	BCAS3	17	59013488	0.146516	1.08	1.88E-08	TRUE
56 | rs663129	PMAIP1-MC4R	18	57838401	0.256835	1.06	0.000000032	TRUE
57 | rs180803	POM121L9P-ADORA2A	22	24658858	0.970732	1.2	1.64E-10	TRUE
58 | 


--------------------------------------------------------------------------------
/inst/extdata/cad.add.160614_manhformat.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/inst/extdata/cad.add.160614_manhformat.txt.gz


--------------------------------------------------------------------------------
/inst/extdata/config.txt:
--------------------------------------------------------------------------------
 1 | #### CONFIG file for use with MANH++ - Do not modify the first 10 lines							
 2 | ## min.count: The lower cell count threshold to accept this config							
 3 | ## max.count: The upper cell count threshold to accept this config							
 4 | ## maf: Is MAF detection active for this config - is there any variants within a cell with MAF < threshold?							
 5 | ## conseq: Is HIGH impact consequence active? Are there any variants with HIGH impact consequence in the cell?							
 6 | ## col: The colour which cells for this config							
 7 | ## idx: the index to use for this cell in the heatmap - MUST BE CONSECUTIVE FROM START TO END - STARTING AT 1							
 8 | "## type: val - an config entry, oddchr - the odd chromosome, evenchr - the even chromosome"							
 9 | ## report: Are these annotations labeled on the heatmap							
10 | #####							
11 | min.count	max.count	maf	conseq	col	idx	type	report
12 | 1	1	FALSE	FALSE	black	1	val	FALSE
13 | 1	1	FALSE	TRUE	lightpink	2	val	TRUE
14 | 1	1	TRUE	FALSE	green	3	val	FALSE
15 | 1	1	TRUE	TRUE	darkmagenta	4	val	TRUE
16 | 2	NA	FALSE	FALSE	blue	5	val	FALSE
17 | 2	NA	FALSE	TRUE	pink	6	val	TRUE
18 | 2	NA	TRUE	FALSE	red	7	val	FALSE
19 | 2	NA	TRUE	TRUE	cyan	8	val	TRUE
20 | NA	NA	NA	NA	darkgrey	9	oddchr	NA
21 | NA	NA	NA	NA	grey	10	evenchr	NA
22 | 


--------------------------------------------------------------------------------
/man/manhplot-package.Rd:
--------------------------------------------------------------------------------
 1 | \name{manhplot-package}
 2 | \alias{manhplot-package}
 3 | \alias{manhplot}
 4 | \docType{package}
 5 | \title{
 6 | \packageTitle{manhplot}
 7 | }
 8 | \description{
 9 | \packageDescription{manhplot}
10 | }
11 | \details{
12 | 
13 | The DESCRIPTION file:
14 | \packageDESCRIPTION{manhplot}
15 | \packageIndices{manhplot}
16 | }
17 | \author{
18 | \packageAuthor{manhplot}
19 | 
20 | Maintainer: \packageMaintainer{manhplot}
21 | }
22 | \keyword{ package }
23 | 


--------------------------------------------------------------------------------
/man/manhplusplot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/manhattan.heatmap.v1.R
 3 | \name{manhplusplot}
 4 | \alias{manhplusplot}
 5 | \title{Generate the manhattan++ plot}
 6 | \usage{
 7 | manhplusplot(infile, outfile, configfile, snpfile, drawastiff = F,
 8 |   GWS = 5e-08, FDR = 0.001, MAF = 0.05, chrname = "chr",
 9 |   posname = "pos", pvalname = "pvalue", frqname = "maf",
10 |   conseqname = "conseq", showgenes = F, showrsids = F,
11 |   pos.split = 3e+06, pval.split = 0.125, max.pval = 20)
12 | }
13 | \arguments{
14 | \item{infile}{Input GWAS summary statistics}
15 | 
16 | \item{outfile}{Output file prefix for the manhattan++ plot}
17 | 
18 | \item{configfile}{Configuration file}
19 | 
20 | \item{snpfile}{Table of SNPs to visualize}
21 | 
22 | \item{drawastiff}{If TRUE draw a Tiff file, if FALSE draw a PDF file}
23 | 
24 | \item{GWS}{Genome wise significance pvalue threshold (5E-8 by default)}
25 | 
26 | \item{FDR}{False discovery Rate pvalue threshold (1E-3 by default)}
27 | 
28 | \item{MAF}{Minor Allele Frequency threshold}
29 | 
30 | \item{chrname}{Column name for chromosome in GWAS infile}
31 | 
32 | \item{posname}{Column name for position in GWAS infile}
33 | 
34 | \item{pvalname}{Column name for pvalue in GWAS infile}
35 | 
36 | \item{frqname}{column name for allele frequency in GWAS infile}
37 | 
38 | \item{conseqname}{column name for variant annotation consequence in GWAS infile}
39 | 
40 | \item{showgenes}{If T shows known genes as bubbles on main manhattan plot, if F show positions of interest as bubbles}
41 | 
42 | \item{showrsids}{If showgenes is T, then show the rsids, rather than genes}
43 | 
44 | \item{pos.split}{The bin lengths for positions}
45 | 
46 | \item{pval.split}{The bin lengths for pvalues}
47 | 
48 | \item{max.pval}{The maximum pvalue to display}
49 | }
50 | \description{
51 | Generate the manhattan++ plot
52 | }
53 | \details{
54 | For file formats see github page \url{https://github.com/cgrace1978/manhplot}
55 | }
56 | \examples{
57 | 
58 | \donttest{
59 | library(manhplot)
60 | infile<-system.file("extdata","cad.add.160614_manhformat.txt.gz",package = "manhplot")
61 | configfile<-system.file("extdata","config.txt", package = "manhplot")
62 | snpfile<-system.file("extdata","56cad.add.160614.variants.txt", package = "manhplot")
63 | 
64 | manhplusplot(infile = infile,outfile = file.path(tempdir(), "default-plot"), 
65 |                configfile = configfile, snpfile = snpfile)
66 | }
67 | 
68 | }
69 | \author{
70 | Chris Grace
71 | }
72 | 


--------------------------------------------------------------------------------
/perl/gen_snpfile.pl:
--------------------------------------------------------------------------------
  1 | use strict;
  2 | use Getopt::Long qw(GetOptions);
  3 | my %locus=();
  4 | my $chrcol="";
  5 | my $snpcol="";
  6 | my $poscol="";
  7 | my $pcol="";
  8 | my $betacol="";
  9 | my $eafcol="";
 10 | my $chridx=0;
 11 | my $snpidx=0;
 12 | my $posidx=0;
 13 | my $pidx=0;
 14 | my $betaidx=0;
 15 | my $eafidx=0;
 16 | 
 17 | my $debug=0;
 18 | my $file="";
 19 | my $genefile="glist-hg19";#insert the file for gene coordinates depending on the build
 20 | my $snpgenedist=500000;#if gene is beyond this distance from SNP either side, the gene annotation is "gene desert"
 21 | my $gwasp=5e-8;#pvalue cut-off to define a peak
 22 | my $boundary=500000;#distance between 2 peaks to be classified as a locus. 
 23 |                     #Can be changed to 0.5 or 1 if interested in cM boundaries 
 24 | 		    #provided the posidx is pointing to cM positions
 25 | my $totalchrs=0;
 26 | my $outfile="";
 27 | GetOptions(
 28 |     'chrom=s' => \$chrcol,
 29 |     'var=s'   => \$snpcol,
 30 |     'pos=s'   => \$poscol,
 31 |     'pval=s'  => \$pcol,
 32 |     'beta=s'  => \$betacol,
 33 |     'eaf=s'   => \$eafcol,
 34 |     'gwasfile=s' => \$file,
 35 |     'genefile=s' => \$genefile,
 36 |     'gwaspcut=s' => \$gwasp,
 37 |     'locusbounds=s' => \$boundary,
 38 |     'snpgenebounds=s'=> \$snpgenedist,
 39 |     'out=s'   => \$outfile,
 40 |     'help' => \$debug,
 41 | ) or die "Usage: $0 --help\n";
 42 | 
 43 | if ($debug || $file eq ""){
 44 | print "\n\n";
 45 |   print <<HELP;
 46 | ######################################################################
 47 | Help Documentation to generate locus file from GWAS summary statistics
 48 | Contact: Anuj Goel
 49 | Version: 1.0
 50 | Date: 1 Oct 2019
 51 | Please cite Manhattan++ software if you use this script.
 52 | 
 53 | Usage:
 54 |   --gwasfile       : [Required] The input GWAS summary file name (tab delimited). 
 55 |                    : Can be a gzipped file. Must have .gz extension.
 56 | 		   : The input file must have the below 6 columns.
 57 |   --chrom          : [Required] Column name containing chromosome numbers (numeric)
 58 |   --var            : [Required] Column name containing variant names
 59 |   --pos            : [Required] Column name having variant positions (basepairs)
 60 |   --pval           : [Required] Column name having variant p-values (not log10 transformed)
 61 |   --beta           : [Required] Column name having variant beta values (not Odds Ratios)
 62 |   --eaf            : [Required] Column name having variant effect allele frequency
 63 |   --genefile       : [Required] The input gene database file name
 64 |                    : Please download from PLINK/GitHub. (eg. glist-hg19, glist-hg38)
 65 |   --gwaspcut       : The p-value cut-off for locus (default: 5e-8)
 66 |   --locusbounds    : The distance between 2 loci (default: 500000)
 67 |   --snpgenebounds  : The max distance between SNP & gene to consider (default: 500000)
 68 |   --out            : Output file name prefix. (default: gwasfile.script.txt)
 69 |   --help           : 
 70 | #######################################################################
 71 | HELP
 72 | print "\n\n";
 73 | exit;
 74 | }
 75 | 
 76 | print "\n\n";
 77 | if ($outfile eq ""){
 78 |   $outfile="$file.script";
 79 | }
 80 | my $datestring = localtime();
 81 | print "Analysis started: $datestring\n";
 82 | print "Parameters:\n";
 83 | print "  GWAS Filename: $file\n";
 84 | print "  Variant column name: $snpcol\n";
 85 | print "  Variant chromosome column name: $chrcol\n";
 86 | print "  Variant position column name: $poscol\n";
 87 | print "  Variant beta column name: $betacol\n";
 88 | print "  Variant allele frequency column name: $eafcol\n";
 89 | print "  Variant pvalue column name: $pcol\n";
 90 | print "  GWAS threshold: $gwasp\n";
 91 | print "  Gene database to use: $genefile\n";
 92 | print "  Distance between 2 loci: $boundary\n";
 93 | print "  Max. distance between variant and gene to consider: $snpgenedist\n";
 94 | print "  Output file: $outfile.txt\n\n\n";
 95 | 
 96 | 
 97 | my %orsnps=();
 98 | my %eafsnps=();
 99 | my %psnps=();
100 | #Take care of gzip input summary statistics file
101 | if ($file=~/\.gz$/){
102 |   print "Opening gzipped file ...\nMight not work if running this script in an OS with no zcat installed\n\n";
103 |   open (IF, "zcat $file |") or die "Cannot open $file\n;exit;";
104 | }else{
105 |   open (IF, $file) or die "Cannot open $file\n";
106 | }
107 | 
108 | 
109 | open (OF ,">$outfile.txt") or die "Cannot write output file\n";
110 | print OF "chr markername pos Pvalue OR eaf gene novel\n";
111 | #chr snp pos p beta eaf
112 | my $header=<IF>;
113 | chomp $header;
114 | #figure out column names and column numbers
115 | my @colnames=split(/\t/,$header);
116 | my $colx=0;#should add up to 6 as we need 6 column names
117 | for (my $z=0;$z<=$#colnames;$z++){
118 |   if    ($colnames[$z] eq $chrcol){
119 |     $chridx=$z;
120 |     $colx++;
121 |   }elsif($colnames[$z] eq $snpcol){
122 |     $snpidx=$z;
123 |     $colx++;
124 |   }elsif($colnames[$z] eq $poscol){
125 |     $posidx=$z;
126 |     $colx++;
127 |   }elsif($colnames[$z] eq $pcol){
128 |     $pidx=$z;
129 |     $colx++;
130 |   }elsif($colnames[$z] eq $betacol){
131 |     $betaidx=$z;
132 |     $colx++;
133 |   }elsif($colnames[$z] eq $eafcol){
134 |     $eafidx=$z;
135 |     $colx++;
136 |   }
137 | }
138 | if($colx<6){
139 |   print "$colx ERROR: Not all column names specified. Please use --help for usage.\n";
140 |   exit;
141 | }
142 | 
use Scalar::Util qw(looks_like_number);

print "Reading GWAS summary file ... \n\n";
# %sig holds every significant row: the whole input line is the key and its
# p-value the value.
my %sig=();
my $co=1;
my $maxchr=0;#highest chromosome number seen in the input
while (my $line=<IF>){
  chomp $line;
  my @cells=split('\t',$line);

  if ($cells[$chridx]=~m/\D/){
    print "ERROR: Chromosomes need to be numeric\n";
    exit 1;#non-zero status so calling scripts can detect the failure
  }elsif($cells[$chridx]>$maxchr){
    $maxchr=$cells[$chridx];
  }

  # Rows without a usable p-value ("NA", empty, missing column) are skipped.
  # In numeric context such values compare as 0 and would otherwise be
  # stored as genome-wide significant.
  my $pval=$cells[$pidx];
  if (defined $pval){
    $pval=~s/^\s+//;
    $pval=~s/\s+$//;
  }
  next unless defined $pval && looks_like_number($pval);

  if ($pval<$gwasp){
    $sig{$line}=$pval;#store whole line as key. pval as value
    $psnps{$cells[$snpidx]}=$pval;
    $eafsnps{$cells[$snpidx]}=sprintf("%.2f", $cells[$eafidx]);
    # The input carries beta values; the odds ratio is exp(beta).
    my $or=exp($cells[$betaidx]);
    $orsnps{$cells[$snpidx]}=sprintf("%.2f", $or);
  }
}

$totalchrs=$maxchr;
# For piped (gzipped) input a failing close means the decompressor reported
# an error, so the data read may be incomplete.
close(IF) or warn "WARNING: problem closing $file - input may be truncated or corrupt\n";
print "Reading Gene database file ... \n\n";
open (IF, "<", $genefile) or die "Cannot open $genefile: $!\n";
# Expected line layout (whitespace separated): chromosome start end gene
#19 58858171 58864865 A1BG
#19 58863335 58866549 A1BG-AS1
my %genesbychr=();#chr => { gene name => [start, end] }
my %chr=("X"=>23,"Y"=>24);#numeric codes used for the sex chromosomes
while (my $line=<IF>){
  # split ' ' ignores leading whitespace and the trailing newline.
  my ($c,$s,$e,$g)=split(' ',$line);
  next unless defined $g;#blank or incomplete line
  $c=$chr{$c} if exists $chr{$c};
  # Insert straight into the nested hash instead of copying the whole
  # per-chromosome hash for each line. The first occurrence of a gene on a
  # chromosome wins; later duplicates are ignored.
  $genesbychr{$c}{$g}=[$s,$e] unless exists $genesbychr{$c}{$g};
}

close IF;
print "Identifying loci ... \n";

# Sort the significant rows by p-value once and bucket them by chromosome,
# rather than re-sorting every row for every chromosome. Ties are broken on
# the line text so the order is reproducible between runs.
my %sigbychr=();
foreach my $line (sort { $sig{$a} <=> $sig{$b} or $a cmp $b } keys %sig){
  my @cells=split('\t',$line);
  my $rowchr=$cells[$chridx];
  next unless defined $rowchr && $rowchr=~/^\d+$/;
  push(@{$sigbychr{$rowchr+0}},$line);#+0 so "01" and "1" share a bucket
}

for (my $chr=1;$chr<=$totalchrs;$chr++){#process chr by chr
  my @sortedchr=@{$sigbychr{$chr} || []};

  #Do the filtering in the subroutine: returns lead variant name => position
  my %list=%{&getloci(@sortedchr)};

  # A chromosome may be absent from the gene file; fall back to an empty
  # set instead of dereferencing undef (fatal under "use strict").
  my $genelist=$genesbychr{$chr} || {};
  # Fixed iteration order, so overlapping genes and distance ties always
  # resolve to the same gene.
  my @genenames=sort keys %{$genelist};

  #print the output for the chromosome, ordered by position
  foreach my $name (sort { $list{$a} <=> $list{$b} or $a cmp $b } keys %list){
    my $snppos=$list{$name};
    my $dist5=$snpgenedist;#closest distance so far to a gene ending before the SNP
    my $dist3=$snpgenedist;#closest distance so far to a gene starting after the SNP
    my ($gene5,$gene3,$genein);#undef = nothing found
    foreach my $genes (@genenames){
      my ($s,$e)=@{$genelist->{$genes}};
      if ($snppos>=$s && $snppos<=$e){#simple situation: SNP lies inside the gene
        $genein=$genes;
        last;
      }elsif($snppos<$s){#get 3' gene
        my $dist=$s-$snppos;
        # $dist3 starts at $snpgenedist and only shrinks, so this single
        # test also enforces the maximum SNP-gene distance.
        if ($dist<=$dist3){
          $dist3=$dist;
          $gene3=$genes;
        }
      }elsif($snppos>$e){#get 5' gene
        my $dist=$snppos-$e;
        if ($dist<=$dist5){
          $dist5=$dist;
          $gene5=$genes;
        }
      }
    }
    #format final gene output
    my $geneout="gene desert";#no gene within the allowed distance
    if (defined $genein){#best situation
      $geneout=$genein;
    }elsif(defined $gene5 && defined $gene3){#genes on both sides within the allowed distance
      $geneout="$gene5/$gene3";
    }elsif(defined $gene5){#only the 5' gene is close enough
      $geneout=$gene5;
    }elsif(defined $gene3){#only the 3' gene is close enough
      $geneout=$gene3;
    }

    # NOTE(review): fields are space separated, so the "gene desert" label
    # adds an extra field to the row - confirm how the consumer of this file
    # parses it.
    print OF $chr," ",$name," ",$snppos," ",$psnps{$name}," ",$orsnps{$name}," ",$eafsnps{$name}," $geneout FALSE\n";
  }

}#process next chromosome

print "\n************************************************************************************\n\n\n";
print "WARNING: Please remember to update the \"novel\" column in the output file ($outfile.txt)\n\n";
print "\n************************************************************************************\n";
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close(OF) or die "Cannot finish writing $outfile.txt: $!\n";
$datestring = localtime();
print "Analysis finished: $datestring\n";

exit;
270 | 
# Reduce the significant rows of one chromosome to its lead variants.
#
# Arguments: the rows (whole tab-delimited input lines) in the order they
#            should be considered; the caller passes them sorted by p-value.
# Returns:   reference to a hash of lead variant name => position.
#
# Rows are visited in order. A row is dropped when it lies strictly within
# $boundary of a lead variant accepted earlier that has a different name;
# otherwise it becomes a lead variant itself. This single pass replaces
# repeated rescanning and copying of the remaining rows.
# Reads the file-level $snpidx, $posidx and $boundary.
sub getloci {
  my (@sorted)=@_;
  my %list=();#lead variant name => position (the return value)
  my @accepted=();#[name, position] of every lead variant, in acceptance order

  ROW: foreach my $line (@sorted){
    next ROW unless defined $line;
    my @cells=split('\t',$line);
    my $name=$cells[$snpidx];
    my $pos=$cells[$posidx];

    foreach my $lead (@accepted){
      my ($leadname,$leadpos)=@{$lead};
      # A row carrying the same name as the lead variant is never dropped by
      # that lead variant, only by others.
      if (($pos>($leadpos-$boundary)) && ($pos<($leadpos+$boundary)) && $leadname ne $name){
        next ROW;
      }
    }

    # Rows without a variant name cannot be reported as a locus.
    next ROW unless defined $name;

    # A repeated name overwrites the stored position; the earlier position
    # stays in @accepted so rows near it are still dropped.
    $list{$name}=$pos;
    push(@accepted,[$name,$pos]);
  }
  return \%list;
}
333 | 
# Return a copy of an array with the given positions removed.
#
# Arguments: $array - reference to the source array (left unmodified)
#            $coord - reference to an array of indices to drop
# Returns:   reference to a new array holding the remaining elements in
#            their original order. Undefined elements and elements matching
#            /^$/ (empty strings) are dropped as well.
sub splicearray {
  my ($array,$coord)=@_;
  my $size=scalar @{$array};

  # Index lookup table. Negative indices count from the end of the array.
  my %drop=();
  foreach my $co (@{$coord}){
    my $idx=int($co);
    $idx+=$size if $idx<0;
    $drop{$idx}=1;
  }

  # Filter into a new array rather than calling delete() on array elements,
  # which perldoc discourages.
  my @new=();
  for (my $i=0;$i<$size;$i++){
    next if $drop{$i};
    my $item=$array->[$i];
    if ((defined $item) and !($item =~ /^$/)){
      push(@new,$item);
    }
  }
  return \@new;
}
349 | 


--------------------------------------------------------------------------------
/perl/readme:
--------------------------------------------------------------------------------
 1 | Date: 2/10/2019
 2 | Author: Anuj Goel
 3 | Script: gen_snpfile.pl
 4 | 
 5 | This Perl script has been tested in Linux OS (Centos 7) using Perl version v5.16.3.
 6 | The aim of this script is to read a GWAS summary statistics file and generate a list of lead variants and annotate them with nearest gene names.
 7 | To be used as "snpfile" for Manhattan++
 8 | 
 9 | The script uses a gene database that can be downloaded from PLINK website:
10 | https://www.cog-genomics.org/plink/1.9/resources#genelist
11 | 
12 | The summary statistics file can be plain text file or gzipped (tab delimited).
13 | The summary statistics file must have at least the following columns (with headers)
14 | [Required] Column name containing chromosome numbers (numeric)
15 | [Required] Column name containing variant names
16 | [Required] Column name having variant positions (basepairs)
17 | [Required] Column name having variant p-values (not log10 transformed)
18 | [Required] Column name having variant beta values (not Odds Ratios)
19 | [Required] Column name having variant effect allele frequency
20 | 
21 | For help, run:
22 | perl gen_snpfile.pl --help
23 | 
24 | The script treats two variants as separate loci if both have p<5e-8 (or the threshold set with --gwaspcut) and they are at least 500 kbp apart (or the distance set with --locusbounds).
25 | 
26 | The script annotates the lead variant with gene names based on proximity to nearest gene. 
27 |  - If no gene lies within 500 kbp (or the distance set with --snpgenebounds) on either side of the variant, the variant is annotated as a gene desert.
28 |  - If variant is within a gene start-end, the variant gets that gene annotation
29 |  - If the variant is between 2 genes, the 2 nearest genes are reported for that variant
30 |  - If the variant is between 2 genes and one gene is further than --snpgenebounds, only the nearest 3' or 5' gene is reported
31 | 
32 | 
33 | *****************
34 | IMPORTANT
35 | ****************
36 | 
37 | The user still needs to update the output file to identify the loci that are novel.
38 | 
39 | ****************
40 | 
41 | For bugs, please report via GitHub Issues page.
42 | -Thank you.
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(manhplot)
3 | 
4 | test_check("manhplot")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/5cad.add.160614.variants_chr1.txt:
--------------------------------------------------------------------------------
1 | markername	gene	chr	pos	eaf	OR	Pvalue	novel
2 | rs11206510	PCSK9	1	55496039	0.847627	1.08	2.34E-08	FALSE
3 | rs9970807	PPAP2B	1	56965664	0.915097	1.13	5E-14	FALSE
4 | rs7528419	SORT1	1	109817192	0.78582	1.12	1.97E-23	FALSE
5 | rs6689306	IL6R	1	154395946	0.447545	1.06	2.6E-09	FALSE
6 | rs67180937	MIA3	1	222823743	0.663052	1.08	1.01E-12	FALSE
7 | 


--------------------------------------------------------------------------------
/tests/testthat/cad.add.160614_manhformat_chr1.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cgrace1978/manhplot/aa1aa5abd2d571a4146c3b15748df9c26a5f1643/tests/testthat/cad.add.160614_manhformat_chr1.txt.gz


--------------------------------------------------------------------------------
/tests/testthat/config.txt:
--------------------------------------------------------------------------------
 1 | #### CONFIG file for use with MANH++ - Do not modify the first 10 lines							
 2 | ## min.count: The lower cell count threshold to accept this config							
 3 | ## max.count: The upper cell count threshold to accept this config							
 4 | ## maf: Is MAF detection active for this config - is there any variants within a cell with MAF < threshold?							
 5 | ## conseq: Is HIGH impact consequence active? Are there any variants with HIGH impact consequence in the cell?							
 6 | ## col: The colour which cells for this config							
 7 | ## idx: the index to use for this cell in the heatmap - MUST BE CONSECUTIVE FROM START TO END - STARTING AT 1							
 8 | "## type: val - an config entry, oddchr - the odd chromosome, evenchr - the even chromosome"							
 9 | ## report: Are these annotations labeled on the heatmap							
10 | #####							
11 | min.count	max.count	maf	conseq	col	idx	type	report
12 | 1	1	FALSE	FALSE	black	1	val	FALSE
13 | 1	1	FALSE	TRUE	lightpink	2	val	TRUE
14 | 1	1	TRUE	FALSE	green	3	val	FALSE
15 | 1	1	TRUE	TRUE	darkmagenta	4	val	TRUE
16 | 2	NA	FALSE	FALSE	blue	5	val	FALSE
17 | 2	NA	FALSE	TRUE	pink	6	val	TRUE
18 | 2	NA	TRUE	FALSE	red	7	val	FALSE
19 | 2	NA	TRUE	TRUE	cyan	8	val	TRUE
20 | NA	NA	NA	NA	darkgrey	9	oddchr	NA
21 | NA	NA	NA	NA	grey	10	evenchr	NA
22 | 


--------------------------------------------------------------------------------
/tests/testthat/testmanhplusplot.R:
--------------------------------------------------------------------------------
 1 | context("Running the manhplusplot on (small) dummy data")
 2 | library(manhplot)
 3 | 
 4 | test_that("Run the manhplusplot function with default params", {
 5 |   infile<-test_path("cad.add.160614_manhformat_chr1.txt.gz")
 6 |   configfile<-test_path("config.txt")
 7 |   snpfile<-test_path("5cad.add.160614.variants_chr1.txt")
 8 | 
 9 |   manhplusplot(infile = infile, outfile = file.path(tempdir(), "testpdf"),configfile = configfile, snpfile = snpfile)
10 | })
11 | 
# Smoke test for the TIFF output path (drawastiff = TRUE): the run must
# finish and leave a non-empty .tif file behind.
test_that("Run the manhplusplot function with output as tiff file", {
  # Not every R build ships a tiff device; skip instead of failing there.
  skip_if_not(capabilities("tiff"), "tiff device is not available")

  infile <- test_path("cad.add.160614_manhformat_chr1.txt.gz")
  configfile <- test_path("config.txt")
  snpfile <- test_path("5cad.add.160614.variants_chr1.txt")

  # manhplusplot() appends ".tif" to the prefix passed as `outfile`.
  outprefix <- file.path(tempdir(), "testtiff")
  outtiff <- paste0(outprefix, ".tif")
  # Remove leftovers so the check below cannot pass on a stale file.
  unlink(outtiff)

  manhplusplot(
    infile = infile, outfile = outprefix,
    configfile = configfile, snpfile = snpfile, drawastiff = TRUE
  )

  # Without expectations the test would verify nothing.
  expect_true(file.exists(outtiff))
  expect_gt(file.size(outtiff), 0)
})
19 | 


--------------------------------------------------------------------------------