├── Data ├── BRCA_sample.maf ├── natDistBinned.txt └── protNames.txt ├── DifferentialMutationAnalysis.R ├── Method Overview.png ├── README.md └── parseMaf.R /DifferentialMutationAnalysis.R: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Differential Mutation Analysis 3 | # 4 | # The only required input is a single MAF file, a sample MAF file for BRCA is 5 | # provided in "Data/BRCA_sample.maf" and can be tested by calling 6 | # 7 | # DifferentialMutationAnalysis("Data/BRCA_sample.maf") 8 | # 9 | # Output is a single two or three column file with protein names, their 10 | # uEMD scores, and optionally, supporting q-values, named after the input 11 | # file with DiffMut appended to it (e.g. "BRCA_sample-DiffMut.txt") 12 | # 13 | # The code can optionally be run to search for oncogenes or tumor suppressor 14 | # genes separately by passing "onco" or "TSG" as options for geneType 15 | # 16 | # DifferentialMutationAnalysis("Data/BRCA_sample.maf", geneType="onco") 17 | # 18 | # Finally, the code computes supporting q-values for genes. To 19 | # compute q-values simply pass a value p which determines the number of 20 | # background distributions to generate (default is 5). Note that this comes at 21 | # a cost to runtime. The code can be run with no permutations to quickly output 22 | # a list of genes ranked by uEMD score. 
#
#   DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0)
#
# If you have the following three R packages: "data.table", "plyr", and
# "Matrix" you can set the flag usePackages to TRUE to significantly decrease
# file read time
###############################################################################

#helper function to parse MAF file
source("parseMaf.R")

#rank normalize mutation or variation counts
#  x: numeric vector of counts for one sample (one entry per gene)
#  Zero counts stay 0. Non-zero counts are replaced by their rank among the
#  non-zero entries (ties share the minimum rank) shifted up by the number of
#  zeros, and everything is divided by length(x), so the largest possible
#  value is 1.
fastRank = function(x) {
  isZero = x==0
  x[!isZero] = rank(x[!isZero], ties.method="min")+sum(isZero)
  x/length(x)
}

#bin counts to build histogram
#  v: numeric vector of rank-normalized scores
#  p: number of bins
#  A score s is counted in bin ceiling(s*(p-1))+1, which lies in 1..p for s in
#  [0, 1]. Returns the fraction of entries of v falling in each bin.
#  tabulate() silently ignores bin numbers outside 1..p.
bins = function(v, p=100) {
  tabulate(ceiling(v*(p-1))+1, nbins=p)/length(v)
}

#compute unidirectional EMD between mutations and variations
#  tBins, nBins: histograms (length >= p) for the two profiles
#  p: index of the highest bin to use
#  Walks from bin p down to bin 2 keeping a running surplus of tBins over
#  nBins, and adds the surplus to the score only while it is positive. Bin 1
#  is never visited.
uEMD = function(tBins, nBins, p=100) {
  #with fewer than two bins there is nothing to walk over; p:2 would
  #otherwise count upwards (e.g. 1:2) and index past the histogram
  if(p<2){ return(0) }
  total = 0
  move = 0
  for(i in p:2){
    move = move+tBins[i]-nBins[i]
    total = total+max(move,0)
  }
  total
}

#generate random uEMDs to compute FDRs
#  tRank: matrix of rank-normalized scores, one row per sample
#  nRankBinned: binned reference histograms, one column per gene
#  Each row of tRank is shuffled independently, the shuffled columns are
#  binned, and one uEMD per column of nRankBinned is returned.
generateRandomEMDs = function(tRank, nRankBinned) {
  permRank = t(apply(tRank, 1, sample))
  permRankBinned = apply(permRank, 2, bins)
  vapply(seq_len(ncol(nRankBinned)),
         function(g) uEMD(permRankBinned[,g], nRankBinned[,g]),
         numeric(1))
}

#compute FDRs based on random uEMDs
#  For every observed score x: (number of random scores >= x) divided by
#  (number of observed scores >= x).
computeFDR = function(uEMDscores, randEMDs) {
  vapply(uEMDscores,
         function(x) sum(randEMDs>=x, na.rm=TRUE)/sum(uEMDscores>=x, na.rm=TRUE),
         numeric(1))
}

#compute q-values from FDRs
#  For every gene: the smallest FDR among all genes whose uEMD score is less
#  than or equal to its own.
computeQ = function(FDRs, uEMDscores) {
  vapply(seq_along(FDRs),
         function(i) min(FDRs[uEMDscores<=uEMDscores[i]]),
         numeric(1))
}

#build the output file path: outDir followed by the input file name with a
#trailing ".maf" removed and "-DiffMut.txt" appended
diffMutOutputPath = function(mafFile, outDir) {
  #match a literal ".maf" suffix only; splitting on the regex ".maf" treated
  #"." as a wildcard and broke names such as "sample_maf_v2.maf" into pieces
  outName = paste0(sub("\\.maf$", "", basename(mafFile)), "-DiffMut.txt")
  if(!nzchar(outDir) || grepl("/$", outDir)){
    #same concatenation as before when outDir is empty or already ends in "/"
    paste0(outDir, outName)
  } else{
    file.path(outDir, outName)
  }
}

#Main Function for Differential Mutation Analysis
#  mafFile:     path to the input MAF file
#  geneType:    "onco" counts only Missense_Mutation, "TSG" only
#               Nonsense_Mutation, anything else uses parseMaf's default types
#  p:           number of random background runs used for q-values;
#               0 writes uEMD scores only
#  outDir:      directory for the result file (created if missing)
#  protFile:    file whose first column lists the protein names
#  natBinFile:  binned reference histograms, one column per protein
#  usePackages: if TRUE, attach data.table, plyr, methods and Matrix and use
#               fread for reading
#  Writes a space-separated table (protNames, uEMDscore and, when p > 0, qVal)
#  and returns NULL invisibly.
DifferentialMutationAnalysis = function(mafFile, geneType="all", p=5,
                                        outDir = "Output/",
                                        protFile = "Data/protNames.txt",
                                        natBinFile = "Data/natDistBinned.txt",
                                        usePackages = FALSE) {

  #p is used as a loop count and as a divisor
  if(!is.numeric(p) || length(p)!=1 || is.na(p) || p<0 || p!=floor(p)){
    stop("p must be a single non-negative whole number", call.=FALSE)
  }

  if(usePackages){
    library("data.table")
    library("plyr")
    library("methods")
    library("Matrix")
  }

  #A list of protein names
  protNames = read.table(protFile, stringsAsFactors=FALSE)$V1

  #load ranked binned natural variation count data
  if(!usePackages){
    nRankBinned = read.table(natBinFile)
  } else{
    nRankBinned = fread(natBinFile, data.table=FALSE)
  }

  #column x of nRankBinned is paired with protein x below
  if(ncol(nRankBinned)<length(protNames)){
    stop("natBinFile has fewer columns than protFile has protein names",
         call.=FALSE)
  }

  #determine if we want to find all cancer genes or just oncogenes or TSGs
  if(geneType=="onco"){
    tCount = parseMaf(protNames, mafFile, usePackages, "Missense_Mutation")
  } else if(geneType=="TSG"){
    tCount = parseMaf(protNames, mafFile, usePackages, "Nonsense_Mutation")
  } else{
    tCount = parseMaf(protNames, mafFile, usePackages)
  }

  #nothing to rank: fail with a clear message instead of an apply() error
  if(is.null(tCount) || nrow(tCount)==0){
    stop("no mutations of the requested type were found in ", mafFile,
         " for the proteins listed in ", protFile, call.=FALSE)
  }

  #rank normalize mutations
  tRank = t(apply(tCount, 1, fastRank))

  #bin the rank distribution
  tRankBinned = apply(tRank, 2, bins)

  #compute uEMD scores
  uEMDscore = vapply(seq_along(protNames),
                     function(x) uEMD(tRankBinned[,x], nRankBinned[,x]),
                     numeric(1))

  #create output directory if it doesn't exist (including missing parents)
  if(nzchar(outDir) && !dir.exists(outDir)){
    dir.create(outDir, recursive=TRUE)
  }
  outFile = diffMutOutputPath(mafFile, outDir)

  #output only uEMD scores if no q-values are needed (faster run time)
  if(p==0){
    results = cbind(protNames, uEMDscore)
  } else{
    #compute q-values, p determines number of times random uEMDs are generated
    fdrRuns = vapply(seq_len(p),
                     function(i) computeFDR(uEMDscore,
                                            generateRandomEMDs(tRank, nRankBinned)),
                     numeric(length(uEMDscore)))
    #force one row per protein even when there is a single protein
    FDRs = rowSums(matrix(fdrRuns, nrow=length(uEMDscore)))/p
    qVal = computeQ(FDRs, uEMDscore)
    results = cbind(protNames, uEMDscore, qVal)
  }

  write.table(results, outFile, quote=FALSE, row.names=FALSE)
  invisible(NULL)
}
-------------------------------------------------------------------------------- /Method Overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Singh-Lab/Differential-Mutation-Analysis/8b40b773a90e24b0cdcb40b24c6d499d6079e746/Method Overview.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Differential Mutation Analysis 2 | 3 | Differential mutation analysis is a framework that uncovers cancer genes by comparing the mutational profiles of genes across cancer genomes with their natural germline variation profiles across healthy individuals. If you want to try it out, visit [diffmut.princeton.edu](http://diffmut.princeton.edu). If you use our method please cite Pawel Przytycki and Mona Singh. "Differential analysis between somatic mutation and germline variation profiles reveals cancer-related genes." *Genome Medicine* (2017) available [here](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0465-6). 4 | 5 | ![Method Overview](https://github.com/PFPrzytycki/Differential-Mutation-Analysis/blob/master/Method%20Overview.png) 6 | 7 | This is the code for our method for evaluating genes for differential mutation. Our approach, outlined in the figure above, is entirely based on somatic mutations and germline variation, without any additional parameters. Briefly, for a cancer type of interest, we first count, for each individual, the number of mutations found in the exons of each gene. Similarly, we use the 1000 Genomes sequencing data to count, for each individual, how many variants appear in each gene. We define a variant as any amino acid that differs from the most common one across the healthy cohort. 
For each individual, we then rank normalize the mutation or variant counts across genes so that each gene is assigned a score between 0 and 1 that reflects the relative number of mutations or variants that fall within it. Next, for each gene, we aggregate its mutation and variation scores across healthy and cancer cohorts separately, resulting in a set of normalized variation scores as well as a set of normalized mutation scores. We use these sets to build a pair of histograms estimating the density of mutation and variant normalized scores. The first represents the gene’s tendency to be ranked highly amongst all genes with respect to somatic mutation across a cancer genome cohort; the other represents its tendency to be ranked highly with respect to germline variation across a healthy cohort. In order to uncover whether a gene has a mutational profile that is very different between healthy and cancer cohorts, we compute the distance between the two distributions using a modification of the classical Earth Mover’s Distance, which we refer to as a unidirectional Earth Mover’s Distance (uEMD). Finally, we rank all genes by their uEMD scores, considering higher ranking genes to be more likely to be functionally related to a given cancer type, and compute a supporting q-value for each uEMD Score. 8 | 9 | 10 | The only required input is a single MAF file, a sample MAF file for BRCA is provided in "Data/BRCA_sample.maf" and can be tested by calling 11 | 12 | DifferentialMutationAnalysis("Data/BRCA_sample.maf") 13 | 14 | Output is a single two or three column file with protein names, their uEMD scores, and optionally, supporting q-values, named after the input file with DiffMut appended to it (e.g. 
"BRCA_sample-DiffMut.txt") 15 | 16 | The code can optionally be run to search for oncogenes or tumor suppressor genes separately by passing "onco" or "TSG" as options for geneType 17 | 18 | DifferentialMutationAnalysis("Data/BRCA_sample.maf", geneType="onco") 19 | 20 | Finally, the code computes supporting q-values for genes. To compute q-values simply pass a value p which determines the number of background distributions to generate (default is 5). The code can be run with no permutations to quickly output a list of genes with their uEMD scores. 21 | 22 | DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0) 23 | -------------------------------------------------------------------------------- /parseMaf.R: -------------------------------------------------------------------------------- 1 | #A helper function to parse raw MAF files 2 | #This function reads the provided gene name, sample id, and mutation type and then generates 3 | #a table of counts for the number of mutations each sample has in each gene 4 | 5 | #We note that additional parsing of MAF files is necessary in a few cases: 6 | # 1. some different sample IDs refer to the same patient 7 | # 2. 
#    some gene names are poorly annotated
#This simple parser does not address these issues

#Fast reader built on data.table, plyr and Matrix.
#Returns a sparse matrix with one row per sample id and one column per entry
#of protNames; raises an error if a package or a required column is missing.
.parseMafFast = function(protNames, mafFile, mutTypes) {

  #Read maf file skipping comment lines
  mut = data.table::fread(mafFile, skip="Hugo_Symbol", header=TRUE, fill = TRUE)

  #A missing column would otherwise filter every row away without any error
  required = c("Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode")
  missingCols = setdiff(required, names(mut))
  if(length(missingCols)>0){
    stop("missing column(s): ", paste(missingCols, collapse=", "))
  }

  #Trim proteins and mutation types
  mut = mut[mut$Hugo_Symbol %in% protNames & mut$Variant_Classification %in% mutTypes,]

  #If ids are TCGA barcodes, trim them for uniqueness
  #(keep at most the first four "-"-separated fields; shorter ids are left
  #as they are rather than padded with "NA")
  if(isTRUE(grepl("TCGA", mut$Tumor_Sample_Barcode[1]))){
    mut$Tumor_Sample_Barcode = vapply(
      strsplit(mut$Tumor_Sample_Barcode, "-", fixed=TRUE),
      function(parts) paste(head(parts, 4), collapse="-"),
      character(1))
  }
  ids = unique(mut$Tumor_Sample_Barcode)

  #Count mutations per patients per gene
  #(plyr:: is spelled out so another attached package's count() is not used)
  tCount = plyr::count(mut, vars=c("Hugo_Symbol","Tumor_Sample_Barcode"))

  #Convert counts to matrix
  Matrix::spMatrix(length(ids), length(protNames),
                   match(tCount$Tumor_Sample_Barcode, ids), match(tCount$Hugo_Symbol, protNames),
                   tCount$freq)
}

#Slow reader using base R only.
#A line is counted when its first tab-separated field is in protNames and any
#entry of mutTypes matches (as a regular expression) anywhere in the line.
#The sample id is the first field containing "TCGA"; lines without such a
#field all share one NA id. Returns a dense numeric matrix (rows = ids in
#order of first appearance, columns = protNames) or NULL if no line qualifies.
#NOTE(review): unlike the fast reader, ids are not trimmed to four barcode
#fields here, so the two readers can group samples differently - confirm
#whether that is intended.
.parseMafSlow = function(protNames, mafFile, mutTypes) {
  lines = readLines(mafFile)

  #check mutation type first so only candidate lines need to be split
  typeHit = rep(FALSE, length(lines))
  for(mutType in mutTypes){
    typeHit = typeHit | grepl(mutType, lines)
  }
  fields = strsplit(lines[typeHit], "\t")

  #find the protein name and drop lines whose protein is not in the list
  protInd = match(vapply(fields, function(f) f[1], character(1)), protNames)
  known = !is.na(protInd)
  fields = fields[known]
  protInd = protInd[known]
  if(length(fields)==0){ return(NULL) }

  #find sample ids
  sampleId = vapply(fields, function(f) f[grepl("TCGA", f)][1], character(1))
  ids = unique(sampleId)
  idInd = match(sampleId, ids)

  #count protein/sample pairs via their column-major position in the matrix,
  #instead of growing the matrix one row at a time
  tCount = matrix(0, nrow=length(ids), ncol=length(protNames))
  runs = rle(sort((protInd-1)*length(ids)+idInd))
  tCount[runs$values] = runs$lengths
  tCount
}

#Build the sample-by-protein mutation count matrix from a MAF file
#  protNames:   protein (gene) names; these define the matrix columns
#  mafFile:     path to the MAF file
#  usePackages: if TRUE, try the fast package-based reader first
#  mutTypes:    mutation types to count
#  Falls back to the slow base-R reader when usePackages is FALSE or the fast
#  reader fails for any reason.
parseMaf = function(protNames, mafFile, usePackages, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) {

  if(usePackages){
    fastResult = tryCatch(
      .parseMafFast(protNames, mafFile, mutTypes),
      error = function(err) {
        #report why the fast reader failed instead of hiding the cause
        message("Fast MAF read failed: ", conditionMessage(err))
        NULL
      })
    if(!is.null(fastResult)){ return(fastResult) }
  }

  print("Switching to slow read, either flag usePackages is set to FALSE or could not find one of the following columns: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode")
  .parseMafSlow(protNames, mafFile, mutTypes)

}
--------------------------------------------------------------------------------