├── Data
    ├── BRCA_sample.maf
    ├── natDistBinned.txt
    └── protNames.txt
├── DifferentialMutationAnalysis.R
├── Method Overview.png
├── README.md
└── parseMaf.R


/DifferentialMutationAnalysis.R:
--------------------------------------------------------------------------------
  1 | ###############################################################################
  2 | # Differential Mutation Analysis
  3 | #
  4 | # The only required input is a single MAF file, a sample MAF file for BRCA is
  5 | # provided in "Data/BRCA_sample.maf" and can be tested by calling
  6 | #
  7 | # DifferentialMutationAnalysis("Data/BRCA_sample.maf")
  8 | #
  9 | # Output is a single two or three column file with protein names, their 
 10 | # uEMD scores, and optionally, supporting q-values, named after the input
 11 | # file with DiffMut appended to it (e.g. "BRCA_sample-DiffMut.txt")
 12 | #
 13 | # The code can optionally be run to search for oncogenes or tumor suppressor
 14 | # genes separately by passing "onco" or "TSG" as options for geneType
 15 | #
 16 | # DifferentialMutationAnalysis("Data/BRCA_sample.maf", geneType="onco")
 17 | #
 18 | # Finally, the code computes supporting q-values for genes. To 
 19 | # compute q-values simply pass a value p which determines the number of
 20 | # background distributions to generate (default is 5). Note that this comes at 
 21 | # a cost to runtime. The code can be run with no permutations to quickly output
 22 | # a list of genes ranked by uEMD score.
 23 | # 
 24 | # DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0)
 25 | #
 26 | # If you have the following three R packages: "data.table", "plyr", and
 27 | # "Matrix" you can set the flag usePackages to TRUE to significantly decrease
 28 | # file read time
 29 | ###############################################################################
 30 | 
 31 | #helper function to parse MAF file
 32 | source("parseMaf.R")
 33 | 
 34 | #rank normalize mutaton or variation counts
 35 | fastRank = function(x) { 
 36 |   x[x!=0] = rank(x[x!=0], ties.method="min")+length(which(x==0)) 
 37 |   x/length(x) 
 38 | }
 39 | 
 40 | #bin counts to build histogram
 41 | bins = function(v, p=100) {
 42 |   l = length(v)
 43 |   nBins = rep(0,p)
 44 |   for(val in v){
 45 |     nBins[ceiling(val*(p-1))+1] = nBins[ceiling(val*(p-1))+1]+1
 46 |   }
 47 |   nBins = nBins/l
 48 |   nBins
 49 | }
 50 | 
 51 | #compute unidirectional EMD between mutationas and variations
 52 | uEMD = function(tBins, nBins, p=100) {
 53 |   sum = 0
 54 |   move = 0
 55 |   for(i in p:2){
 56 |     move = move+tBins[i]-nBins[i]
 57 |     sum = sum+max(move,0)
 58 |   }
 59 |   sum
 60 | }
 61 | 
 62 | #generate random uEMDs to compute FDRs
 63 | generateRandomEMDs = function(tRank, nRankBinned) {
 64 |   permRank = t(apply(tRank, 1, sample))
 65 |   permRankBinned = apply(permRank, 2, bins)
 66 |   randEMDs = sapply(1:dim(nRankBinned)[2], function(x) uEMD(permRankBinned[,x], nRankBinned[,x]))
 67 |   randEMDs
 68 | }
 69 | 
 70 | #compute FDRs based on random uEMDs
 71 | computeFDR = function(uEMDscores, randEMDs) {
 72 |   FDRs = sapply(uEMDscores, function(x) length(which(randEMDs>=x))/(length(which(uEMDscores>=x))))
 73 |   FDRs
 74 | }
 75 | 
 76 | #compute q-values from FDRs
 77 | computeQ = function(FDRs, uEMDscores) {
 78 |   qVal = sapply(1:length(FDRs), function(x) min(FDRs[uEMDscores<=uEMDscores[x]]))
 79 |   qVal
 80 | }
 81 | 
 82 | #Main Function for Differential Mutation Analysis
 83 | DifferentialMutationAnalysis = function(mafFile, geneType="all", p=5,
 84 |                                         outDir = "Output/", 
 85 |                                         protFile = "Data/protNames.txt", 
 86 |                                         natBinFile = "Data/natDistBinned.txt",
 87 |                                         usePackages = FALSE) {
 88 | 
 89 |   if(usePackages){
 90 |     library("data.table")
 91 |     library("plyr")
 92 |     library("methods")
 93 |     library("Matrix")
 94 |   }
 95 | 
 96 |   #A list of protein names
 97 |   protNames = read.table(protFile, stringsAsFactors=FALSE)$V1
 98 | 
 99 |   #load ranked binned natural variation count data
100 |   if(!usePackages){
101 |     nRankBinned = read.table(natBinFile)
102 |   }
103 |   else{
104 |     nRankBinned = fread(natBinFile, data.table=FALSE)
105 |   }
106 | 
107 |   #determine if we want to find all cancer genes or just oncogenes or TSGs
108 |   if(geneType=="onco"){
109 |     tCount = parseMaf(protNames, mafFile, usePackages, "Missense_Mutation")
110 |   } else if(geneType=="TSG"){
111 |     tCount = parseMaf(protNames, mafFile, usePackages, "Nonsense_Mutation")
112 |   } else{
113 |     tCount = parseMaf(protNames, mafFile, usePackages)
114 |   }
115 |   
116 |   #rank normalize mutations
117 |   tRank = t(apply(tCount, 1, fastRank))
118 | 
119 |   #bin the rank distribution
120 |   tRankBinned = apply(tRank, 2, bins)
121 | 
122 |   #compute uEMD scores
123 |   uEMDscore = sapply(1:length(protNames), 
124 |     function(x) uEMD(tRankBinned[,x], nRankBinned[,x]))
125 | 
126 |   #create output directory if it doesn't exist
127 |   if(!dir.exists(outDir)){ dir.create(outDir) }
128 | 
129 |   #output only uEMD scores if no q-values are needed (faster run time)
130 |   if(p==0){
131 |     write.table(cbind(protNames, uEMDscore), 
132 |       paste0(outDir, strsplit(basename(mafFile),".maf")[[1]],"-DiffMut.txt"), quote=FALSE, row.names=FALSE)
133 |   }
134 |   else{
135 |     #compute q-values, p determines number of times random uEMDs are generated
136 |     FDRs = rowSums(sapply(1:p, function(x) 
137 |       computeFDR(uEMDscore, generateRandomEMDs(tRank, nRankBinned))))/p
138 |     qVal = computeQ(FDRs, uEMDscore)
139 |     write.table(cbind(protNames, uEMDscore, qVal), 
140 |       paste0(outDir, strsplit(basename(mafFile),".maf")[[1]],"-DiffMut.txt"), quote=FALSE, row.names=FALSE)
141 |   }
142 | 
143 | }
144 | 


--------------------------------------------------------------------------------
/Method Overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Singh-Lab/Differential-Mutation-Analysis/8b40b773a90e24b0cdcb40b24c6d499d6079e746/Method Overview.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Differential Mutation Analysis
 2 | 
 3 | Differential mutation analysis is a framework that uncovers cancer genes by comparing the mutational profiles of genes across cancer genomes with their natural germline variation profiles across healthy individuals. If you want to try it out, visit [diffmut.princeton.edu](http://diffmut.princeton.edu). If you use our method please cite Pawel Przytycki and Mona Singh. "Differential analysis between somatic mutation and germline variation profiles reveals cancer-related genes." *Genome Medicine* (2017) available [here](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0465-6).
 4 | 
 5 | ![Method Overview](https://github.com/PFPrzytycki/Differential-Mutation-Analysis/blob/master/Method%20Overview.png)
 6 | 
 7 | This is the code for our method for evaluating genes for differential mutation. Our approach, outlined in the figure above, is entirely based on somatic mutations and germline variation, without any additional parameters. Briefly, for a cancer type of interest, we first count, for each individual, the number of mutations found in the exons of each gene. Similarly, we use the 1000 Genomes sequencing data to count, for each individual, how many variants appear in each gene. We define a variant as any amino acid that differs from the most common one across the healthy cohort. For each individual, we then rank normalize the mutation or variant counts across genes so that each gene is assigned a score between 0 and 1 that reflects the relative number of mutations or variants that fall within it. Next, for each gene, we aggregate its mutation and variation scores across healthy and cancer cohorts separately, resulting in a set of normalized variation scores as well as a set of normalized mutation scores. We use these sets to build a pair of histograms estimating the density of mutation and variant normalized scores. The first represents the gene’s tendency to be ranked highly amongst all genes with respect to somatic mutation across a cancer genome cohort; the other represents its tendency to be ranked highly with respect to germline variation across a healthy cohort. In order to uncover whether a gene has a mutational profile that is very different between healthy and cancer cohorts, we compute the distance between the two distributions using a modification of the classical Earth Mover’s Distance, which we refer to as a unidirectional Earth Mover’s Distance (uEMD). Finally, we rank all genes by their uEMD scores, considering higher ranking genes to be more likely to be functionally related to a given cancer type, and compute a supporting q-value for each uEMD Score.
 8 | 
 9 | 
10 | The only required input is a single MAF file, a sample MAF file for BRCA is provided in "Data/BRCA_sample.maf" and can be tested by calling
11 | 
12 | DifferentialMutationAnalysis("Data/BRCA_sample.maf")
13 | 
14 | Output is a single two or three column file with protein names, their uEMD scores, and optionally, supporting q-values, named after the input file with DiffMut appended to it (e.g. "BRCA_sample-DiffMut.txt")
15 | 
16 | The code can optionally be run to search for oncogenes or tumor suppressor genes separately by passing "onco" or "TSG" as options for geneType
17 | 
18 | DifferentialMutationAnalysis("Data/BRCA_sample.maf", geneType="onco")
19 | 
20 | Finally, the code computes supporting q-values for genes. To compute q-values simply pass a value p which determines the number of background distributions to generate (default is 5). The code can be run with no permutations to quickly output a list of genes with their uEMD scores.
21 |  
22 | DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0)
23 | 


--------------------------------------------------------------------------------
/parseMaf.R:
--------------------------------------------------------------------------------
 1 | #A helper function to parse raw MAF files
 2 | #This function reads the provided gene name, sample id, and mutation type and then generates
 3 | #a table of counts for the number of mutations each sample has in each gene
 4 | 
 5 | #We note that additional parsing of MAF files is necessary in a few cases:
 6 | # 1. some different sample IDs refer to the same patient
 7 | # 2. some gene names are poorly annotated
 8 | #This simple parser does not address these issues
 9 | 
# Parse a MAF file into a table of mutation counts with one row per sample
# and one column per entry of protNames.
#
# protNames: character vector of protein (gene) names to count
# mafFile: path to the MAF file
# usePackages: if TRUE, try the fast path, which uses fread, count and
#              spMatrix (the caller attaches data.table, plyr and Matrix);
#              otherwise, or if the fast path fails, the file is parsed line
#              by line
# mutTypes: mutation types to count
# returns: fast path - a sparse matrix from spMatrix; slow path - a dense
#          numeric matrix, or NULL when no line of the file qualifies
parseMaf <- function(protNames, mafFile, usePackages,
                     mutTypes = c("Missense_Mutation", "Nonsense_Mutation")) {

  tryCatch({

    # without the packages the fast path cannot run; raising here hands
    # control to the slow reader in the error handler below
    if (!usePackages) { stop("usePackages") }

    # read maf file skipping comment lines
    mut <- fread(mafFile, skip = "Hugo_Symbol", header = TRUE, fill = TRUE)

    # trim proteins and mutation types
    mut <- mut[mut$Hugo_Symbol %in% protNames & mut$Variant_Classification %in% mutTypes, ]

    # if ids are TCGA barcodes, keep their first four "-" separated parts
    if (grepl("TCGA", mut$Tumor_Sample_Barcode[1])) {
      mut$Tumor_Sample_Barcode <- sapply(mut$Tumor_Sample_Barcode, function(x) paste(strsplit(x, "-")[[1]][1:4], collapse = "-"))
    }
    ids <- unique(mut$Tumor_Sample_Barcode)

    # count mutations per patient per gene
    tCount <- count(mut, vars = c("Hugo_Symbol", "Tumor_Sample_Barcode"))

    # convert counts to matrix
    tCount <- spMatrix(length(ids), length(protNames),
                       match(tCount$Tumor_Sample_Barcode, ids), match(tCount$Hugo_Symbol, protNames),
                       tCount$freq)

    tCount

  }, error = function(err) {
    # when the fast path was really attempted, report why it failed instead
    # of hiding the cause behind the generic notice below
    if (!identical(conditionMessage(err), "usePackages")) {
      message("Fast read failed: ", conditionMessage(err))
    }
    print("Switching to slow read, either flag usePackages is set to FALSE or could not find one of the following columns: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode")

    mafLines <- readLines(mafFile)
    fields <- strsplit(mafLines, "\t")

    # the protein name is the first tab separated field of a line
    protInd <- match(vapply(fields, function(f) f[1], character(1)), protNames)

    # keep lines whose protein is in the list ...
    candidates <- which(!is.na(protInd))

    # ... and that mention one of the requested mutation types anywhere
    hasType <- rep(FALSE, length(candidates))
    for (mutType in mutTypes) {
      hasType <- hasType | grepl(mutType, mafLines[candidates])
    }
    kept <- candidates[hasType]

    # nothing to count
    if (length(kept) == 0) { return(NULL) }

    # the sample id is the first field of the line containing "TCGA"
    # NOTE(review): lines without such a field all get the id NA and end up
    # in one shared row, and ids are not shortened to four parts the way the
    # fast path does - confirm whether the two paths should agree
    id <- vapply(fields[kept], function(f) f[grepl("TCGA", f)][1], character(1))
    ids <- unique(id)
    idInd <- match(id, ids)
    geneInd <- protInd[kept]

    # allocate the full matrix once instead of growing it with rbind for
    # every new sample, then add one per protein/sample record
    tCount <- matrix(0, nrow = length(ids), ncol = length(protNames))
    for (k in seq_along(kept)) {
      tCount[idInd[k], geneInd[k]] <- tCount[idInd[k], geneInd[k]] + 1
    }
    return(tCount)
  })

}
69 | 


--------------------------------------------------------------------------------