├── .DS_Store ├── .Rbuildignore ├── .Rhistory ├── .Rproj.user ├── F983FB94 │ ├── build_options │ ├── cpp-definition-cache │ ├── pcs │ │ ├── files-pane.pper │ │ ├── source-pane.pper │ │ ├── windowlayoutstate.pper │ │ └── workbench-pane.pper │ ├── persistent-state │ ├── rmd-outputs │ ├── saved_source_markers │ └── sources │ │ └── prop │ │ ├── 1B64C678 │ │ ├── 1CC714F7 │ │ ├── 27C419B │ │ ├── 27EB650E │ │ ├── 2EE4D19B │ │ ├── 453A360A │ │ ├── 4668FDBC │ │ ├── 4EC81EA2 │ │ ├── 6CEE29D │ │ ├── 6E2F621B │ │ ├── 71CD7210 │ │ ├── 78F86D91 │ │ ├── 908CD31C │ │ ├── A67B5CF0 │ │ ├── C0746A86 │ │ ├── D6FEEFCC │ │ ├── D71BEEF9 │ │ ├── F8F7728B │ │ └── INDEX └── shared │ └── notebooks │ ├── patch-chunk-names │ └── paths ├── All_forests.r ├── All_forests_KRAS_Ingenuity.r ├── All_forests_KRAS_MSigDB.r ├── DESCRIPTION ├── NAMESPACE ├── OncoSig.Rproj ├── R ├── NBfunctions.R ├── OncoSig.R ├── OncoSigNB.R ├── OncoSigRF.R ├── OncoSigUnsup.R ├── Oncosig-RF │ └── OncoSig │ │ ├── .functions.R.swo │ │ ├── .functions.R.swp │ │ ├── ONCOSIG_README.md │ │ ├── ONCOSIG_README.pdf │ │ └── Test │ │ ├── OncoSig_objects.R │ │ ├── OncoSig_results.txt │ │ ├── Performance.pdf │ │ ├── gold_standard.txt │ │ └── test_network.txt ├── analysisFunctions.R ├── functionsRF.R └── rFunctions.R ├── README.md ├── man ├── NaiveBayesBin.Rd ├── OncoSig-package.Rd ├── OncoSig.Rproj ├── OncoSigNB.Rd ├── OncoSigRF.Rd ├── OncoSigUnsup.Rd ├── R │ ├── OncoSig.R │ └── OncoSigNB.Rd ├── computeLRsgivenBins.Rd ├── getFinalLR.Rd ├── getLRsgivenBin_info.Rd ├── getMaxLR.Rd ├── listToMatrix.Rd ├── replaceBinswithLR.Rd └── runNaiveBayesClassifier.Rd └── vignettes ├── .DS_Store ├── OncoSig-concordance.tex ├── OncoSig.Rnw ├── OncoSig.log ├── OncoSig.pdf ├── OncoSig.synctex ├── OncoSig.synctex.gz └── OncoSig.tex /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/.DS_Store 
-------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | amp.matrix[amp.matrix == 2] <- "A" 2 | amp.matrix <- amp.matrix %>% as_tibble(rownames = NA) %>% 3 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 4 | inner_join(amp.events) %>% select(just.genes, tags, coseg.names, everything()) 5 | #dels 6 | del.events <- all.final.mat.event.types.df %>% filter(dels == "D") %>% 7 | select(coseg.names, tags, just.genes) 8 | # remove duplicates of both D and D+M 9 | if(length(unique(del.events$just.genes)) != length(del.events$just.genes)) { 10 | dups <- del.events %>% group_by(just.genes) %>% filter(n() > 1) 11 | to.keep <- dups %>% filter(tags == "D+M") 12 | del.events <- del.events %>% filter(!just.genes %in% dups$just.genes) %>% 13 | bind_rows(to.keep) 14 | } 15 | del.matrix <- del.mat[unique(del.events$just.genes), intersecting.samples , drop = F] 16 | # only get focal 17 | del.matrix[del.matrix > -2] <- NA 18 | del.matrix[del.matrix == -2] <- "D" 19 | del.matrix <- del.matrix %>% as_tibble(rownames = NA) %>% 20 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 21 | inner_join(del.events) %>% select(just.genes, tags, coseg.names, everything()) 22 | # fusions (if they exist) 23 | if(isFALSE(all(is.na(all.final.mat.event.types.df$fus)))) { 24 | fus.events <- all.final.mat.event.types.df %>% filter(fus == "F") %>% 25 | select(coseg.names, tags, just.genes) 26 | fus.mat <- read.table(paste0(fusion.data.dir, tumAcro, '.txt'), sep='\t', header=T, row.names=1, check.names=F) 27 | fus.matrix <- fus.mat[unique(fus.events$just.genes), intersect(colnames(fus.mat), intersecting.samples), drop = F] 28 | 
fus.matrix[fus.matrix == 1] <- "F" 29 | fus.matrix[fus.matrix == 0] <- NA 30 | fus.matrix <- fus.matrix %>% as_tibble(rownames = NA) %>% 31 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 32 | inner_join(fus.events) %>% select(just.genes, tags, coseg.names, everything()) 33 | } else { 34 | fus.matrix <- NULL 35 | } 36 | ## bind them all together 37 | tumor.events.mat <- bind_rows(mut.matrix, amp.matrix, del.matrix, fus.matrix) 38 | # combine rows of duplicates so they represent only one event 39 | dups <- tumor.events.mat %>% group_by(just.genes) %>% filter(n() > 1) 40 | if(nrow(dups) > 0) { 41 | # keep events that are just A vs D on separate lines 42 | to.keep <- dups %>% filter(tags %in% c("A", "D", "M")) 43 | remaining <- dups %>% filter(tags %in% c("A+M", "D+M")) 44 | for (gene in unique(remaining$just.genes)) { 45 | sub <- remaining[remaining$just.genes == gene,] 46 | # combine lines with same tags to merge M and A/D 47 | for (tag in unique(sub$tags)){ 48 | to.merge <- sub %>% filter(tags == tag) 49 | new.line <- apply(to.merge, 2, paste0, collapse = ";") 50 | new.line[1:3] <- unlist(to.merge[1,1:3]) 51 | to.keep <- bind_rows(to.keep, new.line) 52 | } 53 | } 54 | to.keep[to.keep == "NA;NA"] <- NA 55 | tumor.events.mat <- tumor.events.mat %>% filter(!just.genes %in% dups$just.genes) %>% 56 | bind_rows(to.keep) 57 | } 58 | trans.tumor.events.mat <- tumor.events.mat %>% column_to_rownames("coseg.names") %>% 59 | select(-just.genes, -tags) %>% t() %>% as_tibble(rownames = NA) %>% rownames_to_column("sample") 60 | # add clusters to trans.tumor.events.mat 61 | sample.clustering <- pancan.clusters[[tumAcro]] %>% enframe("sample", "cluster") %>% 62 | filter(sample %in% intersecting.samples) 63 | trans.tumor.events.mat <- inner_join(sample.clustering, trans.tumor.events.mat) %>% 64 | select(-sample) 65 | # get rid of gbm subtype 2 here before continuing 66 | if(tumAcro == "gbm"){ 67 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, cluster 
!= 2) 68 | } 69 | # check if any subtype had no events (was saved as a logical in the final events plots) 70 | no.events <- which(sapply(final.plots[["matrices"]][[tumAcro]], is.logical)) 71 | if(length(no.events > 0)) { 72 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, !cluster %in% no.events) 73 | } 74 | get.percent <- function(x){ 75 | round(sum(!is.na(x))/length(x)*100) 76 | } 77 | # matrix with all the percentages for the events 78 | summary.df <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise_all(get.percent) %>% 79 | column_to_rownames("cluster") %>% t() 80 | to.plot.df <- summary.df %>% as_tibble(rownames = NA) %>% 81 | rownames_to_column("event.names") 82 | # matrix with just the percentages for the events that are plotted 83 | # replace all the cells that are not MOMA events with 0s 84 | for (clus in colnames(to.plot.df)[2:ncol(to.plot.df)]) { 85 | events.this.clus <- all.final.mat.event.types[[as.numeric(clus)]]$coseg.names 86 | #to.plot.df.test <- to.plot.df %>% mutate(test1 = if_else(event.names %in% events.this.clus, to.plot.df[,clus], 0)) 87 | to.replace <- to.plot.df %>% as_tibble(rownames = NA) %>% 88 | select(event.names, all_of(clus)) %>% deframe() 89 | to.replace <- if_else(names(to.replace) %in% events.this.clus, to.replace, 90 | if_else(str_sub(names(to.replace), end = -3) %in% events.this.clus, to.replace, 0)) 91 | to.plot.df[,clus] <- to.replace 92 | } 93 | ### 94 | # do row-wise proportion test of percentages 95 | ### 96 | # first get number of samples per cluster 97 | cluster.sums <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise(total = n()) 98 | # initialize final dataframe for events and p.values 99 | event.prop.df <- tibble(event = colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)]), 100 | pval = 1) 101 | # create a contingency table for each event given it's occurence/non-occurence in each cluster 102 | # do a chisq test to determine if the proportions are or aren't the same 103 | 
for(ge in colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)])) { 104 | get.non.na <- function(x) sum(!is.na(x)) 105 | sub.df <- trans.tumor.events.mat %>% select(cluster, all_of(ge)) %>% 106 | group_by(cluster) %>% summarise_all(get.non.na) %>% right_join(cluster.sums) %>% 107 | transmute(suc = get(ge), failures = total - get(ge)) %>% as.matrix() 108 | res <- chisq.test(sub.df, simulate.p.value = TRUE)$p.value 109 | event.prop.df[event.prop.df$event == ge, 2] <- res 110 | } 111 | # adjust pvals for multi hypothesis correction 112 | event.prop.df$adj.pval <- event.prop.df$stars <- p.adjust(event.prop.df$pval, method = "BH") 113 | #event.prop.df$stars[event.prop.df$stars > 0.05] <- "" 114 | event.prop.df <- mutate(event.prop.df, stars = ifelse(event.prop.df$stars < 0.0001, "****", 115 | ifelse(event.prop.df$stars < 0.001, "***", 116 | ifelse(event.prop.df$stars < 0.01, "**", 117 | ifelse(event.prop.df$stars < 0.05, "*", " "))))) 118 | # make the plot 119 | to.plot.df <- to.plot.df %>% column_to_rownames("event.names") %>% as.matrix() 120 | col_fun = colorRamp2(c(0, 1, 100), c("grey", "palegreen", "darkgreen")) 121 | if(nrow(to.plot.df) >= 60) { 122 | label.size <- 5 123 | } else if (nrow(to.plot.df) >= 50 ) { 124 | label.size <- 6 125 | } else if (nrow(to.plot.df) >= 40 ) { 126 | label.size <- 7 127 | } else if (nrow(to.plot.df) >= 30 ) { 128 | label.size <- 8 129 | } else { 130 | label.size <- 9 131 | } 132 | # make row annotation with astericks for chi.sq test 133 | ha <- rowAnnotation(signif = anno_text(event.prop.df$stars, gp = gpar(fontsize = label.size))) 134 | # make rowlabels that say cluster 135 | col.labels <- structure(paste0("Cluster ", colnames(to.plot.df)), names = colnames(to.plot.df)) 136 | # make the title and subtitle with p values 137 | title <- paste0("Events in ", toupper(tumAcro), "", 138 | "
p values: *** < 0.001 | ** < 0.01 | * < 0.05 " ) 139 | ht <- Heatmap(to.plot.df, col = col_fun, rect_gp = gpar(col = "white", lwd = 1), 140 | column_names_rot = 45, column_labels = col.labels, 141 | cell_fun = function(j, i, x, y, width, height, fill) { 142 | if(to.plot.df[i, j] > 0) { 143 | grid.text(sprintf("%.0f", to.plot.df[i, j]), x, y, gp = gpar(fontsize = label.size)) 144 | } else { 145 | grid.text(sprintf("%.0f", summary.df[i, j]), x, y, gp = gpar(fontsize = label.size)) 146 | } 147 | }, 148 | heatmap_legend_param = list(title = "% Samples \nin Cluster"), 149 | row_names_gp = gpar(fontsize = label.size), 150 | column_title = gt_render(title), 151 | show_row_dend = F, show_column_dend = F, 152 | right_annotation = ha) 153 | p <- grid::grid.grabExpr(draw(ht, padding = unit(c(2, 10, 2, .5), "mm"))) 154 | cosegregation.plots[[tumAcro]] <- p 155 | } 156 | m1 <- marrangeGrob(cosegregation.plots, ncol = 1, nrow = 1) 157 | ggsave(filename = paste0(output.folder, "cosegregation.plots.focal.only.pdf"), m1, 158 | width = 8.5, height = 11, units = c("in"), 159 | dpi = 300) 160 | ggsave(filename = "~/Desktop/cosegregation.plots.focal.only.pdf", m1, 161 | width = 8.5, height = 11, units = c("in"), 162 | dpi = 300) 163 | View(cosegregation.plots) 164 | cosegregation.plots <- list() 165 | for (tumAcro in tumor.types) { 166 | print(paste("Making plots for:", tumAcro)) 167 | # First clean up current event names for each subtype 168 | # Make table with event names and type 169 | all.final.mat.event.types <- list() 170 | for (clus in seq_along(final.plots[["matrices"]][[tumAcro]])) { 171 | final.mat <- final.plots[["matrices"]][[tumAcro]][[clus]] 172 | if (is.null(final.mat)) { 173 | all.final.mat.event.types[[clus]] <- NULL 174 | next 175 | } 176 | final.plot.names <- str_split_fixed(rownames(final.mat), pattern = "::", n = 2) 177 | # already added this to the original plot script 178 | # if(length(unique(final.plot.names[,1])) == nrow(final.mat)) { 179 | # 
rownames(final.mat) <- final.plot.names[,1] 180 | # } else { 181 | # # replace duplicate names with gene::type, otherwise just the name 182 | # dups <- final.plot.names[duplicated(final.plot.names[,1]),1] 183 | # new.names <- c() 184 | # 185 | # for (idx in seq_along(final.plot.names[,1])) { 186 | # ge <- final.plot.names[idx,1] 187 | # if (ge %in% dups) { 188 | # new.names <- c(new.names, rownames(final.mat)[idx]) 189 | # } else { 190 | # new.names <- c(new.names, ge) 191 | # } 192 | # } 193 | # rownames(final.mat) <- new.names 194 | # } 195 | # 196 | # resave new final mat to object for plotting heatmaps later 197 | # final.plots[["matrices"]][[tumAcro]][[clus]] <- final.mat 198 | # scrape final events matrix for types to create tags 199 | final.mat.event.types <- apply(final.mat, 1, function(x){paste0(unique(x, na.rm = T), collapse = "_")}) %>% 200 | enframe() 201 | final.mat.event.types <- final.mat.event.types %>% 202 | mutate(dels = if_else(str_detect(final.mat.event.types$value, "highdel"), "D", NA_character_)) %>% 203 | mutate(amps = if_else(str_detect(final.mat.event.types$value, "highamp"), "A", NA_character_)) %>% 204 | mutate(muts = if_else(str_detect(final.mat.event.types$value, "mut"), "M", NA_character_)) %>% 205 | mutate(fus = if_else(str_detect(final.mat.event.types$value, "fus"), "F", NA_character_)) 206 | tags <- final.mat.event.types %>% 207 | column_to_rownames("name") %>% select(-value) %>% 208 | as.matrix() %>% 209 | apply(X = ., MARGIN = 1, FUN = function(x){paste0(na.omit(x), collapse = "+")}) 210 | # make sure these names don't have the :: so they can be unified later 211 | cosegregation.names <- paste(final.plot.names[,1], tags, sep = " - ") 212 | final.mat.event.types$coseg.names <- cosegregation.names 213 | final.mat.event.types$tags <- tags 214 | final.mat.event.types$just.genes <- final.plot.names[,1] 215 | # replace any "multi" regions with the gene names 216 | multi.indices <- which(grepl("multi",final.mat.event.types$coseg.names)) 
217 | if(length(multi.indices) > 0){ 218 | for (idx in multi.indices) { 219 | type <- if_else(grepl("D", final.mat.event.types$coseg.names[idx]), "del", "amp") 220 | region.name <- str_sub(final.plot.names[idx,1], end = -7) 221 | cluster <- paste0("cluster", clus) 222 | region.genes <- pluck(multi.gene.list, tumAcro, cluster, type, region.name) 223 | final.mat.event.types$just.genes[idx] <- region.genes[1] 224 | } 225 | } 226 | all.final.mat.event.types[[clus]] <- final.mat.event.types 227 | } 228 | all.final.mat.event.types.df <- all.final.mat.event.types %>% discard(is.null) %>% map(select, -value, -name) %>% 229 | reduce(full_join) %>% mutate(just.genes = str_split_fixed(just.genes, ";", n = 2)[,1]) 230 | #### get rid of duplicates from having A/D + M and just M alone 231 | # dups <- all.final.mat.event.types.df %>% group_by(just.genes) %>% filter(n() > 1) 232 | # all.final.mat.event.types.remove.dups <- filter(all.final.mat.event.types.df, !just.genes %in% dups$just.genes) 233 | # 234 | # for (event in unique(dups$just.genes)) { 235 | # sub.df <- dups %>% filter(just.genes == event) 236 | # name.split <- str_split_fixed(sub.df$coseg.names, " - ", 2)[,2] 237 | # 238 | # ## combine based on all different combos of duplicates 239 | # if(all(c("A+M", "D+M") %in% name.split) & nrow(sub.df) == 2) { 240 | # # just has two tags A+M and D+M 241 | # # don't merge! 
242 | # all.final.mat.event.types.remove.dups <- bind_rows(all.final.mat.event.types.remove.dups, sub.df) 243 | # } else if (all(c("A+M", "D+M") %in% name.split) & nrow(sub.df) > 2) { 244 | # # has A+M, D+M and A/D/M alone 245 | # # only keep the A+M and D+M 246 | # 247 | # } 248 | # } 249 | ### load required event matrices 250 | ### Collect percentages for occurence per subtype 251 | # load all in first to get intersecting samples 252 | mut.mat <- get(load(paste0(snp.dir,"hugo-ids/", tumAcro, "-rawsnp.HUGO.rda"))) 253 | # cnvs, have to amps and dels separately 254 | # thresh.by.gene <- read.table(paste0(gistic.dir, tumAcro, '/all_thresholded.by_genes.txt'), header=T, sep='\t', row.names=1, check.names=F) 255 | thresh.by.gene <- vroom(paste0(gistic.dir, tumAcro, '/all_thresholded.by_genes.txt')) %>% column_to_rownames("Gene Symbol") 256 | # first two columns are metadata and sample names are in long format 257 | short.sample.ids <- sapply(colnames(thresh.by.gene)[3:ncol(thresh.by.gene)], function(x) substr(x,1,15), USE.NAMES = F) 258 | colnames(thresh.by.gene) <- c(colnames(thresh.by.gene)[1:2], short.sample.ids) 259 | cnv <- thresh.by.gene 260 | amp.mat <- del.mat <- cnv <- cnv[,3:ncol(cnv)] 261 | # get sample intersection 262 | intersecting.samples <- intersect(colnames(mut.mat), colnames(cnv)) %>% 263 | intersect(viper.names) 264 | # mutations 265 | mut.events <- all.final.mat.event.types.df %>% filter(muts == "M") %>% 266 | select(coseg.names, tags, just.genes) 267 | mut.matrix <- mut.mat[unique(mut.events$just.genes), intersecting.samples, drop = F] 268 | mut.matrix[mut.matrix == 1] <- "M" 269 | mut.matrix[mut.matrix == 0] <- NA 270 | mut.matrix <- mut.matrix %>% as_tibble(rownames = NA) %>% 271 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 272 | inner_join(mut.events) %>% select(just.genes, tags, coseg.names, everything()) 273 | #amps 274 | amp.events <- all.final.mat.event.types.df %>% filter(amps == "A") %>% 275 | select(coseg.names, 
tags, just.genes) 276 | # remove duplicates of both A and A+M 277 | if(length(unique(amp.events$just.genes)) != length(amp.events$just.genes)) { 278 | dups <- amp.events %>% group_by(just.genes) %>% filter(n() > 1) 279 | to.keep <- dups %>% filter(tags == "A+M") 280 | amp.events <- amp.events %>% filter(!just.genes %in% dups$just.genes) %>% 281 | bind_rows(to.keep) 282 | } 283 | amp.matrix <- amp.mat[unique(amp.events$just.genes), intersecting.samples, drop = F] 284 | # only get focal 285 | amp.matrix[amp.matrix < 2] <- NA 286 | amp.matrix[amp.matrix == 2] <- "A" 287 | amp.matrix <- amp.matrix %>% as_tibble(rownames = NA) %>% 288 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 289 | inner_join(amp.events) %>% select(just.genes, tags, coseg.names, everything()) 290 | #dels 291 | del.events <- all.final.mat.event.types.df %>% filter(dels == "D") %>% 292 | select(coseg.names, tags, just.genes) 293 | # remove duplicates of both D and D+M 294 | if(length(unique(del.events$just.genes)) != length(del.events$just.genes)) { 295 | dups <- del.events %>% group_by(just.genes) %>% filter(n() > 1) 296 | to.keep <- dups %>% filter(tags == "D+M") 297 | del.events <- del.events %>% filter(!just.genes %in% dups$just.genes) %>% 298 | bind_rows(to.keep) 299 | } 300 | del.matrix <- del.mat[unique(del.events$just.genes), intersecting.samples , drop = F] 301 | # only get focal 302 | del.matrix[del.matrix > -2] <- NA 303 | del.matrix[del.matrix == -2] <- "D" 304 | del.matrix <- del.matrix %>% as_tibble(rownames = NA) %>% 305 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 306 | inner_join(del.events) %>% select(just.genes, tags, coseg.names, everything()) 307 | # fusions (if they exist) 308 | if(isFALSE(all(is.na(all.final.mat.event.types.df$fus)))) { 309 | fus.events <- all.final.mat.event.types.df %>% filter(fus == "F") %>% 310 | select(coseg.names, tags, just.genes) 311 | fus.mat <- read.table(paste0(fusion.data.dir, tumAcro, '.txt'), sep='\t', 
header=T, row.names=1, check.names=F) 312 | fus.matrix <- fus.mat[unique(fus.events$just.genes), intersect(colnames(fus.mat), intersecting.samples), drop = F] 313 | fus.matrix[fus.matrix == 1] <- "F" 314 | fus.matrix[fus.matrix == 0] <- NA 315 | fus.matrix <- fus.matrix %>% as_tibble(rownames = NA) %>% 316 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>% 317 | inner_join(fus.events) %>% select(just.genes, tags, coseg.names, everything()) 318 | } else { 319 | fus.matrix <- NULL 320 | } 321 | ## bind them all together 322 | tumor.events.mat <- bind_rows(mut.matrix, amp.matrix, del.matrix, fus.matrix) 323 | # combine rows of duplicates so they represent only one event 324 | dups <- tumor.events.mat %>% group_by(just.genes) %>% filter(n() > 1) 325 | if(nrow(dups) > 0) { 326 | # keep events that are just A vs D on separate lines 327 | to.keep <- dups %>% filter(tags %in% c("A", "D", "M")) 328 | remaining <- dups %>% filter(tags %in% c("A+M", "D+M")) 329 | for (gene in unique(remaining$just.genes)) { 330 | sub <- remaining[remaining$just.genes == gene,] 331 | # combine lines with same tags to merge M and A/D 332 | for (tag in unique(sub$tags)){ 333 | to.merge <- sub %>% filter(tags == tag) 334 | new.line <- apply(to.merge, 2, paste0, collapse = ";") 335 | new.line[1:3] <- unlist(to.merge[1,1:3]) 336 | to.keep <- bind_rows(to.keep, new.line) 337 | } 338 | } 339 | to.keep[to.keep == "NA;NA"] <- NA 340 | tumor.events.mat <- tumor.events.mat %>% filter(!just.genes %in% dups$just.genes) %>% 341 | bind_rows(to.keep) 342 | } 343 | trans.tumor.events.mat <- tumor.events.mat %>% column_to_rownames("coseg.names") %>% 344 | select(-just.genes, -tags) %>% t() %>% as_tibble(rownames = NA) %>% rownames_to_column("sample") 345 | # add clusters to trans.tumor.events.mat 346 | sample.clustering <- pancan.clusters[[tumAcro]] %>% enframe("sample", "cluster") %>% 347 | filter(sample %in% intersecting.samples) 348 | trans.tumor.events.mat <- inner_join(sample.clustering, 
trans.tumor.events.mat) %>% 349 | select(-sample) 350 | # get rid of gbm subtype 2 here before continuing 351 | if(tumAcro == "gbm"){ 352 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, cluster != 2) 353 | } 354 | # check if any subtype had no events (was saved as a logical in the final events plots) 355 | no.events <- which(sapply(final.plots[["matrices"]][[tumAcro]], is.logical)) 356 | if(length(no.events > 0)) { 357 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, !cluster %in% no.events) 358 | } 359 | get.percent <- function(x){ 360 | round(sum(!is.na(x))/length(x)*100) 361 | } 362 | # matrix with all the percentages for the events 363 | summary.df <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise_all(get.percent) %>% 364 | column_to_rownames("cluster") %>% t() 365 | to.plot.df <- summary.df %>% as_tibble(rownames = NA) %>% 366 | rownames_to_column("event.names") 367 | # matrix with just the percentages for the events that are plotted 368 | # replace all the cells that are not MOMA events with 0s 369 | for (clus in colnames(to.plot.df)[2:ncol(to.plot.df)]) { 370 | events.this.clus <- all.final.mat.event.types[[as.numeric(clus)]]$coseg.names 371 | #to.plot.df.test <- to.plot.df %>% mutate(test1 = if_else(event.names %in% events.this.clus, to.plot.df[,clus], 0)) 372 | to.replace <- to.plot.df %>% as_tibble(rownames = NA) %>% 373 | select(event.names, all_of(clus)) %>% deframe() 374 | to.replace <- if_else(names(to.replace) %in% events.this.clus, to.replace, 375 | if_else(str_sub(names(to.replace), end = -3) %in% events.this.clus, to.replace, 0)) 376 | to.plot.df[,clus] <- to.replace 377 | } 378 | ### 379 | # do row-wise proportion test of percentages 380 | ### 381 | # first get number of samples per cluster 382 | cluster.sums <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise(total = n()) 383 | # initialize final dataframe for events and p.values 384 | event.prop.df <- tibble(event = 
colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)]), 385 | pval = 1) 386 | # create a contingency table for each event given it's occurence/non-occurence in each cluster 387 | # do a chisq test to determine if the proportions are or aren't the same 388 | for(ge in colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)])) { 389 | get.non.na <- function(x) sum(!is.na(x)) 390 | sub.df <- trans.tumor.events.mat %>% select(cluster, all_of(ge)) %>% 391 | group_by(cluster) %>% summarise_all(get.non.na) %>% right_join(cluster.sums) %>% 392 | transmute(suc = get(ge), failures = total - get(ge)) %>% as.matrix() 393 | res <- chisq.test(sub.df, simulate.p.value = TRUE)$p.value 394 | event.prop.df[event.prop.df$event == ge, 2] <- res 395 | } 396 | # adjust pvals for multi hypothesis correction 397 | event.prop.df$adj.pval <- event.prop.df$stars <- p.adjust(event.prop.df$pval, method = "BH") 398 | #event.prop.df$stars[event.prop.df$stars > 0.05] <- "" 399 | event.prop.df <- mutate(event.prop.df, stars = ifelse(event.prop.df$stars < 0.0001, "****", 400 | ifelse(event.prop.df$stars < 0.001, "***", 401 | ifelse(event.prop.df$stars < 0.01, "**", 402 | ifelse(event.prop.df$stars < 0.05, "*", " "))))) 403 | # make the plot 404 | to.plot.df <- to.plot.df %>% column_to_rownames("event.names") %>% as.matrix() 405 | col_fun = colorRamp2(c(0, 1, 100), c("grey", "palegreen", "darkgreen")) 406 | if(nrow(to.plot.df) >= 60) { 407 | label.size <- 5 408 | } else if (nrow(to.plot.df) >= 50 ) { 409 | label.size <- 6 410 | } else if (nrow(to.plot.df) >= 40 ) { 411 | label.size <- 7 412 | } else if (nrow(to.plot.df) >= 30 ) { 413 | label.size <- 8 414 | } else { 415 | label.size <- 9 416 | } 417 | # make row annotation with astericks for chi.sq test 418 | ha <- rowAnnotation(signif = anno_text(event.prop.df$stars, gp = gpar(fontsize = label.size))) 419 | # make rowlabels that say cluster 420 | col.labels <- structure(paste0("Cluster ", colnames(to.plot.df)), names = 
colnames(to.plot.df)) 421 | # make the title and subtitle with p values 422 | title <- paste0("Events in ", toupper(tumAcro), "", 423 | "
p values: *** < 0.001 | ** < 0.01 | * < 0.05 " ) 424 | ht <- Heatmap(to.plot.df, col = col_fun, rect_gp = gpar(col = "white", lwd = 1), 425 | column_names_rot = 45, column_labels = col.labels, 426 | cell_fun = function(j, i, x, y, width, height, fill) { 427 | if(to.plot.df[i, j] > 0) { 428 | grid.text(sprintf("%.0f", to.plot.df[i, j]), x, y, gp = gpar(fontsize = label.size)) 429 | } else { 430 | grid.text(sprintf("%.0f", summary.df[i, j]), x, y, gp = gpar(fontsize = label.size)) 431 | } 432 | }, 433 | heatmap_legend_param = list(title = "% Samples \nin Cluster"), 434 | row_names_gp = gpar(fontsize = label.size), 435 | column_title = gt_render(title), 436 | show_row_dend = T, show_column_dend = T, 437 | right_annotation = ha) 438 | p <- grid::grid.grabExpr(draw(ht, padding = unit(c(2, 10, 2, .5), "mm"))) 439 | cosegregation.plots[[tumAcro]] <- p 440 | } 441 | m1 <- marrangeGrob(cosegregation.plots, ncol = 1, nrow = 1) 442 | ggsave(filename = paste0(output.folder, "cosegregation.plots.focal.only.pdf"), m1, 443 | width = 8.5, height = 11, units = c("in"), 444 | dpi = 300) 445 | ggsave(filename = "~/Desktop/cosegregation.plots.focal.only.pdf", m1, 446 | width = 8.5, height = 11, units = c("in"), 447 | dpi = 300) 448 | setwd("~/Documents/Github/OncoSig") 449 | library(OncoSig) 450 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 451 | library(randomForest) 452 | Network_location="./Input_data_files/COAD/original_network.txt" 453 | Network=read.delim(Network_location,header=F) 454 | Network$V1=as.character(Network$V1) 455 | Network$V2=as.character(Network$V2) 456 | Network$V3=as.numeric(Network$V3) 457 | Network=as.matrix(Network) 458 | Network[,3]=as.numeric(Network[,3]) 459 | Network_matrix=listToMatrix(Network) 460 | Gold_Standard_location= "./Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt" 461 | Gold_Standard=read.delim(Gold_Standard_location,header=F) 462 | Gold_Standard$V1=as.character(Gold_Standard$V1) 463 | 
remove(Network_matrix) 464 | Query_output_results=OncoSigRF(Network_matrix_df, Gold_Standard_in_Network_names, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1) 465 | Query_output_results_scores=as.data.frame(Query_output_results[[1]]) 466 | Query_output_results=OncoSigRF(Network_matrix_df, Gold_Standard, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1) 467 | Query_output_results=OncoSigRF(Network_matrix, Gold_Standard, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1) 468 | Query_output_results=OncoSigRF(Network, Gold_Standard, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1) 469 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 470 | df_1=read.delim("./Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 471 | df_1=read.delim("./Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 472 | df_2=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 473 | df_2=read.delim("./Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 474 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6), 475 | c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999), 476 | c(0,0.01,0.05)) 477 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2, 478 | the_bins=the_bins,correlated_features =list()) 479 | predictions[1:5] 480 | Network_location="./Input_data_files/COAD/original_network.txt" 481 | Network=read.delim(Network_location,header=F) 482 | Network$V1=as.character(Network$V1) 483 | Network$V2=as.character(Network$V2) 484 | Network$V3=as.numeric(Network$V3) 485 | Network=as.matrix(Network) 486 | Network[,3]=as.numeric(Network[,3]) 487 | Network_matrix=listToMatrix(Network) 488 | # part 2 489 | Network_location="./Input_data_files/LUAD/original_network_sample.txt" 490 | Network=read.delim(Network_location,header=F) 491 | 
Network$V1=as.character(Network$V1) 492 | Network$V2=as.character(Network$V2) 493 | Network$V3=as.numeric(Network$V3) 494 | Network=as.matrix(Network) 495 | Network[,3]=as.numeric(Network[,3]) 496 | Network[1:5m] 497 | Network[1:5,] 498 | Network_matrix=listToMatrix(Network) 499 | Gold_Standard_location= "./Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt" 500 | Gold_Standard=read.delim(Gold_Standard_location,header=F) 501 | Gold_Standard$V1=as.character(Gold_Standard$V1) 502 | Network_matrix_df=as.data.frame(Network_matrix) 503 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df), Gold_Standard$V1) 504 | Negative_Set_names=setdiff(rownames(Network_matrix_df), Gold_Standard_in_Network_names) 505 | remove(Network_matrix) 506 | Query_output_results=OncoSigRF(Network_matrix_df, Gold_Standard_in_Network_names, max_iterations=5) 507 | Query_output_results_scores=as.data.frame(Query_output_results[[1]]) 508 | View(Query_output_results_scores) 509 | KRAS_features= "./Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt" 510 | EGFR_forest= "./Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r" 511 | results=OncoSigUnsup(KRAS_features,EGFR_forest) 512 | View(results) 513 | -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/build_options: -------------------------------------------------------------------------------- 1 | auto_roxygenize_for_build_and_reload="1" 2 | auto_roxygenize_for_build_package="1" 3 | auto_roxygenize_for_check="1" 4 | live_preview_website="1" 5 | makefile_args="" 6 | preview_website="1" 7 | website_output_format="all" 8 | -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/cpp-definition-cache: -------------------------------------------------------------------------------- 1 | [ 2 | ] -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/pcs/files-pane.pper: 
-------------------------------------------------------------------------------- 1 | { 2 | "path" : "~/Documents/Github/OncoSig_main/OncoSig/Input_data_files/LUAD/OncoSigUnsup", 3 | "sortOrder" : [ 4 | { 5 | "ascending" : true, 6 | "columnIndex" : 2 7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/pcs/source-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "activeTab" : -1 3 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/pcs/windowlayoutstate.pper: -------------------------------------------------------------------------------- 1 | { 2 | "left" : { 3 | "panelheight" : 554, 4 | "splitterpos" : 181, 5 | "topwindowstate" : "HIDE", 6 | "windowheight" : 592 7 | }, 8 | "right" : { 9 | "panelheight" : 554, 10 | "splitterpos" : 355, 11 | "topwindowstate" : "NORMAL", 12 | "windowheight" : 592 13 | } 14 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/pcs/workbench-pane.pper: -------------------------------------------------------------------------------- 1 | { 2 | "TabSet1" : 0, 3 | "TabSet2" : 0, 4 | "TabZoom" : { 5 | } 6 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/persistent-state: -------------------------------------------------------------------------------- 1 | build-last-errors="[]" 2 | build-last-errors-base-dir="~/Documents/Github/OncoSig_main/OncoSig/" 3 | build-last-outputs="[{\"output\":\"==> devtools::document(roclets = c('rd', 'collate', 'namespace'))\\n\\n\",\"type\":0},{\"output\":\"Updating OncoSig documentation\\n\",\"type\":2},{\"output\":\"First time using roxygen2. 
Upgrading automatically...\\n\",\"type\":2},{\"output\":\"Loading OncoSig\\n\",\"type\":2},{\"output\":\"Warning: The existing 'NAMESPACE' file was not generated by roxygen2, and will not be overwritten.\\nWarning message:\\nroxygen2 requires Encoding: UTF-8 \\n\",\"type\":2},{\"output\":\"Documentation completed\\n\\n\",\"type\":1},{\"output\":\"==> R CMD INSTALL --no-multiarch --with-keep.source OncoSig\\n\\n\",\"type\":0},{\"output\":\"* installing to library ‘/Library/Frameworks/R.framework/Versions/4.0/Resources/library’\\n\",\"type\":1},{\"output\":\"* installing *source* package ‘OncoSig’ ...\\n\",\"type\":1},{\"output\":\"** using staged installation\\n\",\"type\":1},{\"output\":\"** R\\n\",\"type\":1},{\"output\":\"** byte-compile and prepare package for lazy loading\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\n\",\"type\":1},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:88: unexpected section header '\\\\keyword'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:89: unexpected section header '\\\\keyword'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:90: unexpected section header '\\\\keyword'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:91: unexpected END_OF_INPUT '\\n\",\"type\":2},{\"output\":\"'\\n\",\"type\":1},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd:10: unexpected UNKNOWN '\\\\warning'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd:13: unexpected '}'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd:11: All text must be in a 
section\\n\",\"type\":2},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\n\",\"type\":1},{\"output\":\"** building package indices\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** installing vignettes\\n\",\"type\":1},{\"output\":\"** testing if installed package can be loaded from temporary location\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package can be loaded from final location\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package keeps a record of temporary installation path\\n\",\"type\":1},{\"output\":\"* DONE (OncoSig)\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]" 4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}" 5 | files.monitored-path="" 6 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}" 7 | imageDirtyState="0" 8 | saveActionState="0" 9 | -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/rmd-outputs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/saved_source_markers: -------------------------------------------------------------------------------- 1 | {"active_set":"","sets":[]} -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/1B64C678: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "35,21", 3 | "scrollLine" : "32" 4 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/1CC714F7: 
-------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/27C419B: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/27EB650E: -------------------------------------------------------------------------------- 1 | { 2 | "cursorPosition" : "24,64", 3 | "scrollLine" : "16" 4 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/2EE4D19B: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/453A360A: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/4668FDBC: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/4EC81EA2: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/6CEE29D: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/6E2F621B: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- 
/.Rproj.user/F983FB94/sources/prop/71CD7210: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/78F86D91: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/908CD31C: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/A67B5CF0: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/C0746A86: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/D6FEEFCC: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/D71BEEF9: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/F8F7728B: -------------------------------------------------------------------------------- 1 | { 2 | } -------------------------------------------------------------------------------- /.Rproj.user/F983FB94/sources/prop/INDEX: -------------------------------------------------------------------------------- 1 | ~%2FDocuments%2FGithub%2FOncoSig%2FNAMESPACE="D71BEEF9" 2 | 
~%2FDocuments%2FGithub%2FOncoSig%2FRead-and-delete-me="D6FEEFCC" 3 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FDESCRIPTION="A67B5CF0" 4 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FNAMESPACE="78F86D91" 5 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FR%2FOncoSig.R="6E2F621B" 6 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FR%2FOncoSigNB.Rd="C0746A86" 7 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FRead-and-delete-me="2EE4D19B" 8 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FcomputeLRsgivenBins.Rd="453A360A" 9 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FgetFinalLR.Rd="1CC714F7" 10 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FgetLRsgivenBin_info.Rd="908CD31C" 11 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FgetMaxLR.Rd="4668FDBC" 12 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FlistToMatrix.Rd="4EC81EA2" 13 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FDiana_Dec4_Run_RF_COAD_KRAS.txt="27EB650E" 14 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2FInput_data_files%2FLUAD%2FREADME.txt="F8F7728B" 15 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2FREADME.md="27C419B" 16 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2Fvignettes%2FOncoSig-concordance.tex="71CD7210" 17 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2Fvignettes%2FOncoSig.log="6CEE29D" 18 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2Fvignettes%2FOncoSig.tex="1B64C678" 19 | -------------------------------------------------------------------------------- /.Rproj.user/shared/notebooks/patch-chunk-names: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/.Rproj.user/shared/notebooks/patch-chunk-names -------------------------------------------------------------------------------- /.Rproj.user/shared/notebooks/paths: -------------------------------------------------------------------------------- 1 | /Users/sunnyjones/Documents/Github/OncoSig_main/Diana_Dec4_Run_RF_COAD_KRAS.txt="87298F43" 2 | 
/Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/Input_data_files/LUAD/README.txt="2A438251" 3 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/README.md="A09906A" 4 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/R/OncoSigNB.Rd="DBC78E44" 5 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/computeLRsgivenBins.Rd="D96AD446" 6 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/getFinalLR.Rd="422015D3" 7 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/getLRsgivenBin_info.Rd="2B1FC101" 8 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/getMaxLR.Rd="B669C0BF" 9 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd="96DBECF" 10 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/vignettes/OncoSig-concordance.tex="E951F25E" 11 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/vignettes/OncoSig.log="8523CE22" 12 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/vignettes/OncoSig.tex="1D4E4DDC" 13 | -------------------------------------------------------------------------------- /All_forests.r: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/All_forests.r -------------------------------------------------------------------------------- /All_forests_KRAS_Ingenuity.r: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/All_forests_KRAS_Ingenuity.r -------------------------------------------------------------------------------- /All_forests_KRAS_MSigDB.r: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/All_forests_KRAS_MSigDB.r 
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: OncoSig 2 | Type: Package 3 | Title: What the package does (short line) 4 | Version: 1.0 5 | Date: 2018-10-14 6 | Author: Who wrote it 7 | Maintainer: Who to complain to 8 | Description: More about what it does (maybe more than one line) 9 | License: What license is it under? 10 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /OncoSig.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /R/NBfunctions.R: -------------------------------------------------------------------------------- 1 | source("R/rFunctions.R") 2 | #These sets of functions allow the creation of a binary Naive Bayes classifer, given two sets of data. 3 | #1. The feature matrix 4 | #2. The bin parameters. 5 | 6 | #1.Feature Matrix 7 | #The first column in the feature matrix is the name of the instance (e.g. 
#the name of a protein)
#The second column is the label (either a 1 or 0)
#All later columns are feature values, which can be either discrete or continuous numeric values, or "NA"
#For example:
#  V1      df_labels  feature_1    feature_2
#1 Q16539  1          6.76840e+02  1.000e+00
#2 P78383  1          NA           NA
#3 P30281  1          NA           1.000e+00

#2. Bin Parameters
#Bin parameters are passed to the function as a list, with one numeric vector of break points per feature. For example
#the_bins=list(c(0,40,200,1200),c(0,.1))
#will bin the first feature into bins corresponding to: 0-40,40-200,200-1200,1200-Inf. "NA's" (i.e. no feature present) are given a separate bin for each feature.
#NOTE: Due to the way that "NA"'s are imputed, features should not have any values less than -99999999999999999999999, or greater than 9999999999999999999999999999999999999999.
#NOTE: These functions assume that Naive Bayes is used as a binary classifier, where there are only two labels in the response vector:1 or 0.

# Shared pre-processing for NaiveBayesBin() and computeLRsgivenBins().
#   df_1:     data frame; column 1 = entry name, column 2 = label,
#             columns 3.. = feature values
#   the_bins: list with one vector of break points per feature column
# Returns df_1 with NAs imputed and every feature column replaced by the
# integer bin codes produced by .bincode().
.binFeatureColumns <- function(df_1, the_bins) {
  # Outer breaks that enclose every admissible feature value.
  the_min <- -999999999999999999999999999999999999999999999
  the_max <- 9999999999999999999999999999999999999999999
  # Impute all NAs with a very small value temporarily, so that .bincode()
  # assigns them a bin code instead of NA (they fall below the first
  # user-supplied break as long as that break is larger than the sentinel).
  df_1[is.na(df_1)] <- -99999999999999999999999
  # The bins of feature column i are stored at position i - 2 of the_bins.
  # seq_len() keeps the loop empty when there are no feature columns;
  # 3:ncol(df_1) would count backwards in that case.
  for (i in seq_len(max(ncol(df_1) - 2L, 0L)) + 2L) {
    breaks <- c(the_min, the_bins[i - 2][[1]], the_max)
    # right = FALSE: intervals are closed on the left, [a, b).
    df_1[, i] <- .bincode(df_1[, i], breaks, right = FALSE)
  }
  df_1
}

#This function takes in a dataframe and the bins for each feature and returns, for each bin, the corresponding likelihood Ratios (LR)
#as computed by getLRsgivenBin_info(), i.e. LR = [n(1|bin)/n(0|bin)] / [n(1)/n(0)]
#   df_1:     training data frame (name, label, features...)
#   the_bins: list with one vector of break points per feature
# Returns an unnamed list with one element per feature column (in column
# order); each element is the named LR vector of that feature's bins.
NaiveBayesBin <- function(df_1, the_bins) {
  binned <- .binFeatureColumns(df_1, the_bins)
  feature_cols <- seq_len(max(ncol(binned) - 2L, 0L)) + 2L
  lapply(feature_cols, function(i) {
    getLRsgivenBin_info(binned[, i], the_bins[i - 2][[1]], binned[, 2])
  })
}

#After training on a training set, this function computes LRs on a new testing set. Note that labels must be provided for the testing set as well.
#   df_1:          testing data frame (name, label, features...)
#   the_bins:      the same break points that were used for training
#   the_bins_info: the list returned by NaiveBayesBin() on the training set
# Returns df_1 with every feature value replaced by the LR of its bin.
computeLRsgivenBins <- function(df_1, the_bins, the_bins_info) {
  binned <- .binFeatureColumns(df_1, the_bins)
  # Replace the bin codes with the LR learned for that bin.
  for (i in seq_len(max(ncol(binned) - 2L, 0L)) + 2L) {
    binned[, i] <- replaceBinswithLR(binned[, i], the_bins_info[i - 2][[1]])
  }
  binned
}

#given a bin vector and the gold standard vector (i.e.
#the two vectors of the same length), return the Likelihood Ratio vector
#   bin_vector:   bin code of every entry for one feature
#   the_bin:      break points of that feature (currently unused; kept so
#                 that existing callers keep working)
#   label_vector: label (1 or 0) of every entry, same length as bin_vector
# Returns a numeric vector with one LR per distinct bin code found in
# bin_vector, named by bin code, where
#   LR = (n_1 / n_0 within the bin) / (n_1 / n_0 overall).
# The counts are read positionally from table(), so a bin (or a label
# vector) in which only one label value occurs yields NA.
getLRsgivenBin_info <- function(bin_vector, the_bin, label_vector) {
  # Ratio of the second to the first entry of table(labels), i.e. the odds
  # of label 1 versus label 0 when both labels are present.
  label_odds <- function(labels) {
    counts <- as.vector(table(labels))
    counts[2] / counts[1]
  }
  prior <- label_odds(label_vector)
  bin_ids <- unique(sort(bin_vector))
  bin_LRs <- rep(0, length(bin_ids))
  for (i in seq_along(bin_ids)) {
    # Select the entries by bin code, not by loop position: the two differ
    # as soon as some bin code never occurs in bin_vector.
    the_num <- bin_ids[i]
    bin_LRs[i] <- label_odds(label_vector[bin_vector == the_num]) / prior
  }
  names(bin_LRs) <- bin_ids
  bin_LRs
}

#get the final LR given the dataframe. Columns starting at 3 are feature values
# Returns a list with one element per row of df_1: the product of that
# row's values in columns 3..ncol(df_1).
getFinalLR <- function(df_1) {
  feature_cols <- 3:ncol(df_1)
  lapply(seq_len(nrow(df_1)), function(x) {
    prod(df_1[x, feature_cols])
  })
}

#given the specified columns, return the maximum LR for each case
# df_1 must contain only the columns to compare. Returns a vector with the
# row-wise maximum of df_1.
getMaxLR <- function(df_1) {
  unlist(lapply(seq_len(nrow(df_1)), function(x) {
    max(df_1[x, ])
  }))
}
#Given the bin info, and binned data, replace each bin with the corresponding Likelihood ratio.
# Map every bin code to the likelihood ratio stored for it.
#   bin_vector:   vector of bin codes
#   the_bin_info: named vector of LRs whose names are bin codes (one element
#                 of the list returned by NaiveBayesBin())
# Returns a vector as long as bin_vector; a code that has no entry in
# the_bin_info yields NA. Uses base indexing only, so it does not rely on a
# len() helper being defined elsewhere.
replaceBinswithLR <- function(bin_vector, the_bin_info) {
  unlist(the_bin_info[as.character(bin_vector)])
}
# --------------------------------------------------------------------------------
# /R/OncoSigNB.R:
# --------------------------------------------------------------------------------
# Naive Bayes scoring: learn per-bin LRs on training_set and return the
# product of LRs for every row of testing_set.
#   training_set, testing_set: data frames (name in V1, label, features...)
#   the_bins:            list with one vector of break points per feature
#   correlated_features: columns of testing_set that are treated as one
#                        feature (only their row-wise maximum LR is used);
#                        pass a zero-length vector to skip this step
# Returns a numeric vector of final LRs named by testing_set$V1.
OncoSigNB <- function(training_set, testing_set, the_bins, correlated_features) {
  message("\tBinning features based on holdout set\n")
  the_bins_info <- NaiveBayesBin(training_set, the_bins)
  testing_set <- computeLRsgivenBins(testing_set, the_bins, the_bins_info)
  #Get the maximum of correlated features if they were passed
  if (length(correlated_features) > 0) {
    message("\tCorrecting for correlated features \n")
    ms_max <- getMaxLR(testing_set[correlated_features])
    # Neutralise the individual columns (an LR of 1 does not change the
    # product) and add their maximum as a single extra feature.
    testing_set[correlated_features] <- 1
    testing_set$MS_max <- ms_max
  }
  message("\tExtracting predicted LR_posterior holdout set\n")
  the_results <- unlist(getFinalLR(testing_set))
  names(the_results) <- testing_set$V1

  the_results
}
# --------------------------------------------------------------------------------
# /R/OncoSigRF.R:
# --------------------------------------------------------------------------------

#This function runs the Random Forest Learner using the Network Provided and the gold standard provided
#   Network_matrix_df:    feature data frame, one row per protein (row names
#                         are the protein identifiers)
#   Gold_Standard_in_Network_names: row names that form the positive set
#   Fraction_Gold_sample: fraction of the gold standard to train on in each
#                         iteration (default .5)
#   ntrees:               trees per forest (default 50)
#   max_iterations:       number of forests to train (default 20)
#   balance:              negatives sampled per sampled positive (default 1)
#   to_save:              1 = keep every forest and save them to
#                         "All_forests.r" (default 0)
#   Negative_Set_names:   row names to sample negatives from; defaults to all
#                         rows that are not in the gold standard. It used to
#                         be read from a global variable of the same name.
# Returns a list: [[1]] data frame with one column "Score" (mean score per
# row over the iterations in which the row was not used for training),
# [[2]] the per-iteration scores, [[3]] the per-iteration
# MeanDecreaseAccuracy importance of every feature.
OncoSigRF <- function(Network_matrix_df, Gold_Standard_in_Network_names,
                      Fraction_Gold_sample = NULL, ntrees = NULL,
                      max_iterations = NULL, balance = NULL, to_save = NULL,
                      Negative_Set_names = NULL) {
  if (is.null(Fraction_Gold_sample)) Fraction_Gold_sample <- .5
  if (is.null(ntrees)) ntrees <- 50
  if (is.null(max_iterations)) max_iterations <- 20
  if (is.null(balance)) balance <- 1
  if (is.null(to_save)) to_save <- 0
  if (is.null(Negative_Set_names)) {
    Negative_Set_names <- setdiff(rownames(Network_matrix_df),
                                  Gold_Standard_in_Network_names)
  }
  message("Running OncoSig")
  message("Fraction of Gold Standard to train on for each Random Forest: ", Fraction_Gold_sample)
  message("Number of Trees per Iteration: ", ntrees)
  message("Number of Iterations: ", max_iterations)
  message("Balance: ", balance)
  #Number to sample from the sets:
  Num_to_sample <- floor(Fraction_Gold_sample * length(Gold_Standard_in_Network_names))
  message("Number of positive results to sample: ", Num_to_sample)
  #balance = 1 means a balanced classifier; larger values sample more negatives
  Num_to_sample_negative <- Num_to_sample * balance
  message("Number of negative results to sample: ", Num_to_sample_negative)

  QueryResults_scores <- data.frame(row.names = rownames(Network_matrix_df))
  importance_df <- data.frame()
  all_forests <- list()
  # Number of features tried at each split; does not change between iterations.
  mtry <- floor(ncol(Network_matrix_df)^.5)

  for (i in seq_len(max_iterations)) {
    Gold_sample <- sample(Gold_Standard_in_Network_names, Num_to_sample)
    Negative_sample <- sample(Negative_Set_names, Num_to_sample_negative)
    training_rows <- c(Gold_sample, Negative_sample)
    label_vector <- as.factor(c(rep(1, Num_to_sample), rep(0, Num_to_sample_negative)))
    # Rows that were not used for training in this iteration are the only
    # ones that receive a score from this forest.
    Not_in_Gold_or_Negative_Sample <- setdiff(rownames(Network_matrix_df), training_rows)

    message("Performing Random Forest")
    message("mtry equals ", mtry)
    #Set do.trace=FALSE if you do not want to see the trace of the random Forests
    result <- randomForest::randomForest(Network_matrix_df[training_rows, ],
                                         label_vector,
                                         ntree = ntrees,
                                         importance = TRUE,
                                         do.trace = TRUE,
                                         mtry = mtry)
    if (to_save == 1) {
      all_forests[[i]] <- result
    }
    Query_results <- predict(result, type = "prob",
                             newdata = Network_matrix_df[Not_in_Gold_or_Negative_Sample, ])
    # Column 2 is the class probability of the second factor level ("1").
    QueryResults_scores[Not_in_Gold_or_Negative_Sample, i] <-
      Query_results[Not_in_Gold_or_Negative_Sample, 2]

    #Get the Importance, using mean decrease accuracy
    importance <- as.data.frame(result$importance)
    importance <- importance[order(importance$MeanDecreaseAccuracy, decreasing = TRUE), ]
    importance_vector <- importance$MeanDecreaseAccuracy
    names(importance_vector) <- rownames(importance)
    importance_df[names(importance_vector), i] <- importance_vector

    # The convergence check (Spearman correlation between successive running
    # means) is disabled; only progress is reported.
    if (i > 2) {
      message("At iteration ", i)
    }
  }
  QueryResults_scores_average <- as.data.frame(rowMeans(QueryResults_scores, na.rm = TRUE))
  colnames(QueryResults_scores_average) <- c("Score")
  if (to_save == 1) {
    save(all_forests, file = "All_forests.r")
  }
  list(QueryResults_scores_average, QueryResults_scores, importance_df)
}
# --------------------------------------------------------------------------------
# /R/OncoSigUnsup.R:
# --------------------------------------------------------------------------------
source("./R/rFunctions.R")
# Score a network with previously trained forests.
#   Network_location: tab-delimited file without header: node A, node B,
#                     interaction strength
#   forest_location:  file written by save() that contains the object
#                     all_forests (e.g. the file OncoSigRF() writes when
#                     to_save = 1)
# Returns a data frame with one column, the_means: for every row of the
# network matrix, the mean over all forests of the predicted probability in
# column 2 of predict(..., type = "prob").
OncoSigUnsup <- function(Network_location, forest_location) {
  # load() places the saved objects in this function's environment.
  load(forest_location, verbose = TRUE)
  if (!exists("all_forests", inherits = FALSE)) {
    stop("'", forest_location, "' does not contain an object named all_forests",
         call. = FALSE)
  }

  Network <- read.delim(Network_location, header = FALSE)
  Network$V1 <- as.character(Network$V1)
  Network$V2 <- as.character(Network$V2)
  Network$V3 <- as.numeric(Network$V3)
  Network <- as.matrix(Network)

  #Convert to Matrix. Imputes missing values as 0, so make sure your scores range from greater than zero to higher!
  # NOTE(review): Network is a character matrix here, so this assignment only
  # re-writes column 3 as text; it is kept because listToMatrix() is not
  # visible from this file.
  Network[, 3] <- as.numeric(Network[, 3])
  Network_matrix <- listToMatrix(Network)

  result_matrix <- matrix(nrow = nrow(Network_matrix), ncol = length(all_forests))
  rownames(result_matrix) <- rownames(Network_matrix)
  for (i in seq_along(all_forests)) {
    Query_results <- predict(all_forests[[i]], newdata = Network_matrix, type = "prob")
    result_matrix[, i] <- Query_results[, 2]
  }
  # The variable name becomes the column name of the returned data frame.
  the_means <- rowMeans(result_matrix)
  as.data.frame(the_means)
}
# --------------------------------------------------------------------------------
# /R/Oncosig-RF/OncoSig/.functions.R.swo:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/.functions.R.swo
# --------------------------------------------------------------------------------
# /R/Oncosig-RF/OncoSig/.functions.R.swp:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/.functions.R.swp
# --------------------------------------------------------------------------------
# /R/Oncosig-RF/OncoSig/ONCOSIG_README.md:
# --------------------------------------------------------------------------------
#
# # OncoSig-RF Overview
# Written by: Joshua Broyde (2/23/2018)
#
# OncoSig-RF is an algorithm for determining novel sets of proteins that support the activity of an oncogene or tumor suppressor (i.e. Oncoprotein-Centric Map, or OC-map). Given a gold standard and an input molecular interaction network, the algorithm uses a random forest classifier to discover novel members of the OC-Map.
#
# # OncoSig-RF Input and Options
#
# OncoSig-RF Requires 2 input files,
10 | 1. A network of interactions, and
11 | 2. A gold standard of pathway members (i.e. members of the OC-Map) to train on. 12 | 13 | The network is a tab delimited dataframe of 3 columns, Gene A, Gene B and the strength of the interaction. This can range from 0 to infinity, but should not be a negative number. Note that the dataframe must be redundant, so that the interaction is represented twice, between A and B and B and A. 14 | 15 | The other required inputs are as follows:
16 | The maximum number of iterations for monte-carlo cross validation. (Default=50)
17 | The fraction of the gold standard to train on at each iteration. (Default=.5)
18 | The number of trees to create in each forest in the random forest classifier. (Default=50)
19 | The number of proteins in the negative standard for each training in the random forest (default=same number as picked for the gold standard). This is the balance option. (Default=1) 20 | Note that the "balance" option picks the same *number* of proteins, not percentage, so if there are 200 proteins in the gold standard and by default 50% (100 proteins) are picked at each iteration, then 100 negative proteins are picked at each iteration as well.
21 | 22 | To run OncoSig-RF is from the bash command line: 23 | 24 | source location/to/OncoSig-RF/runOncoSig-RFscript_wrapper.sh location/input_network.txt 25 | 26 | # OncoSig-RF Example 27 | 28 | An example script for running OncoSig-RF is included in the "runOncoSig-RFscript.R" script. We will now run OncoSig-RF step-by step from that script to discover novel members of Kras-regulated pathways. 29 | We will run OncoSig-RF using `test_network.txt` in the `test` directory. 30 | 31 | In this example `test_network.txt` is the network file, `gold_standard.txt` is the gold standard. There are 50 iterations, .5 of the gold standard is used for training for each iteration, there are 75 trees in each iteration, and the balance equals 1, which means that the same number of negative proteins are taken at each iteration for training. If balance were equal to 2, twice as many negative proteins would be samples at each iteration. 32 | 33 | OncoSig-RF uses the randomForest and MASS package as well as internal functions: 34 | 35 | library(randomForest) 36 | library(MASS) 37 | library(Matrix) 38 | library('getopt') 39 | #Change this depending on where the functions are located 40 | source("location/of/OncoSig-RF/functions.R") 41 | 42 | If you do not have randomForest of MASS, then first install them: 43 | 44 | install.packages("randomForest") 45 | install.packages("MASS") 46 | 47 | Get the location the network and Gold Standard: 48 | 49 | Network_location="Test/test_network.txt" 50 | Gold_Standard_location="Test/gold_standard.txt" 51 | 52 | Read in the network and Gold Standard and the other paramters.The network must be tab delimited. 
The first two columns are the names, and third column is the strength of the interaction: 53 | 54 | arg <- commandArgs(trailingOnly = TRUE) 55 | args=as.vector(arg); 56 | Network_location=args[[1]] 57 | message("Network location: ", Network_location, sep="" ) 58 | Gold_Standard_location=args[[2]] 59 | message("Gold_Standard_location: ",Gold_Standard_location,sep="") 60 | max_iterations=args[[3]] 61 | max_iterations=as.numeric(max_iterations) 62 | Fraction_Gold_sample=args[[4]] 63 | Fraction_Gold_sample=as.numeric(Fraction_Gold_sample) 64 | ntrees=args[[5]] 65 | ntrees=as.numeric(ntrees) 66 | balance=as.numeric(args[7]) 67 | 68 | 69 | The Network matrix looks like this. Note that it is symmetric: 70 | 71 | Q13131_PREPPI P14625 1.111887e+03 72 | P14625_PREPPI Q13131 1.111887e+03 73 | P37058_PREPPI P15428 1.502400e+03 74 | P15428_PREPPI P37058 1.502400e+03 75 | Q8IY84_PREPPI Q9Y3S1 7.255526e+02 76 | Q9Y3S1_PREPPI Q8IY84 7.255526e+02 77 | Q13315_PREPPI Q96T68 2.535267e+03 78 | Q96T68_PREPPI Q13315 2.535267e+03 79 | P27348_PREPPI O75385 1.084084e+04 80 | O75385_PREPPI P27348 1.084084e+04 81 | 82 | In this particular example, only PREPPI protein-protein interactions are represented. However, other interaction types may be included as well. 83 | 84 | Next, we will convert the network list (e.g. an adjacency list) to an adjacency matrix. Note that this may take a few minutes if the network is very large. 85 | 86 | 87 | Network=read.delim(Network_location,header=F) 88 | Network$V1=as.character(Network$V1) 89 | Network$V2=as.character(Network$V2) 90 | Network$V3=as.numeric(Network$V3) 91 | Network=as.matrix(Network) 92 | Gold_Standard=read.delim(Gold_Standard_location,header=F) 93 | Gold_Standard$V1=as.character(Gold_Standard$V1) 94 | 95 | This converts the network to a matrix. Inputes non-interactions as 0. 96 | Network[,3]=as.numeric(Network[,3]) 97 | Network_matrix=listToMatrix(Network) 98 | 99 | The Network_matrix looks like this. 
Zero indicates no edge between nodes: 100 | 101 | Q13131_PREPPI P14625_PREPPI P37058_PREPPI P15428_PREPPI 102 | P14625 1111.887 25258.640 0.000 0.0 103 | Q13131 8911.691 1111.887 0.000 0.0 104 | P15428 0.000 0.000 1502.400 0.0 105 | P37058 0.000 0.000 2079.157 1502.4 106 | 107 | Convert Matrix to Dataframe for future steps 108 | Network_matrix_df=as.data.frame(Network_matrix) 109 | 110 | Remove members of the gold standard that are not present in the network. 111 | 112 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df),Gold_Standard$V1) 113 | 114 | Retrieve the negative set (i.e. all proteins not in the gold standard): 115 | Negative_Set_names=setdiff(rownames(Network_matrix_df),Gold_Standard_in_Network_names) 116 | 117 | Next, run the random forest classifier. The Random forest classifier will train on a portion of the gold standard and a sample of negative standard of the same size. A variant of repeated random sub-sampling validation is used to train the classifier. To do this, a fraction of the gold standard is randomly sampled from the data, and a random sample of the negative set of the same size is also sampled. A random forest is created with a number of trees. To generate new predictions, the score of each protein is predicted only with the random forests that were not used to train it. In the example script, the Kras gold standard has 250 members, so each random forest will be trained on 250 (125 + 125) proteins total. 118 | 119 | If the set that you are using has a very small number of proteins in in (e.g. 3-30), I recommend using a larger fraction of the gold standard and 120 | more iterations. 
121 | 122 | 123 | Query_output_results=runOncoSig-RF(Network_matrix_df,Gold_Standard_in_Network_names,max_iterations = max_iterations, Fraction_Gold_sample =Fraction_Gold_sample,ntrees = ntrees, balance = balance) 124 | Query_output_results_scores=Query_output_results[[1]] 125 | write.table(Query_output_results_scores,file="OncoSig-RF_results.txt",row.names = TRUE, col.names=FALSE,quote = FALSE,sep="\t") 126 | save(Query_output_results,Query_output_results_scores,Gold_Standard_location,Network_location,Gold_Standard_in_Network_names, file="OncoSig-RF_objects.R") 127 | 128 | Now evaluate performance using a ROC Curve: 129 | 130 | library(ROCR) 131 | Query_output_results_scores=Query_output_results[[1]] 132 | #See how good the performance is: 133 | Query_output_results_scores$label=0 134 | Query_output_results_scores[Gold_Standard_in_Network_names,2]=1 135 | pred=prediction(Query_output_results_scores$Score,Query_output_results_scores$label) 136 | pdf("Performance.pdf",height=5,width=5) 137 | perf=performance(pred,measure = "tpr", x.measure = "fpr") 138 | plot(perf,col='red') #Plot the ROC curve 139 | abline(a=0,b=1); 140 | -------------------------------------------------------------------------------- /R/Oncosig-RF/OncoSig/ONCOSIG_README.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/ONCOSIG_README.pdf -------------------------------------------------------------------------------- /R/Oncosig-RF/OncoSig/Test/OncoSig_objects.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/Test/OncoSig_objects.R -------------------------------------------------------------------------------- /R/Oncosig-RF/OncoSig/Test/Performance.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/Test/Performance.pdf -------------------------------------------------------------------------------- /R/Oncosig-RF/OncoSig/Test/gold_standard.txt: -------------------------------------------------------------------------------- 1 | C9J798 2 | O00329 3 | O00459 4 | O14610 5 | O14775 6 | O14807 7 | O14827 8 | O14920 9 | O14939 10 | O15111 11 | O15123 12 | O15211 13 | O15264 14 | O15399 15 | O15496 16 | O15520 17 | O15530 18 | O43320 19 | O43374 20 | O43561 21 | O43915 22 | O43921 23 | O60258 24 | O60262 25 | O60733 26 | O75914 27 | O76093 28 | O95267 29 | O95294 30 | O95750 31 | O96013 32 | P00519 33 | P00533 34 | P01111 35 | P01112 36 | P01116 37 | P01127 38 | P01133 39 | P01138 40 | P01308 41 | P04049 42 | P04054 43 | P04085 44 | P04629 45 | P05019 46 | P05129 47 | P05230 48 | P05771 49 | P06213 50 | P06493 51 | P07196 52 | P07333 53 | P08069 54 | P08138 55 | P08581 56 | P08620 57 | P09038 58 | P09603 59 | P09619 60 | P0C869 61 | P10301 62 | P10721 63 | P10767 64 | P11233 65 | P11234 66 | P11362 67 | P11487 68 | P12034 69 | P12931 70 | P14210 71 | P14555 72 | P14921 73 | P15036 74 | P15056 75 | P15153 76 | P15692 77 | P16220 78 | P16234 79 | P16520 80 | P16885 81 | P17252 82 | P17612 83 | P17948 84 | P19174 85 | P19419 86 | P19838 87 | P20339 88 | P20827 89 | P20936 90 | P21359 91 | P21583 92 | P21781 93 | P21802 94 | P22455 95 | P22607 96 | P22612 97 | P22694 98 | P27361 99 | P27482 100 | P27986 101 | P28482 102 | P29317 103 | P29353 104 | P31371 105 | P31749 106 | P31751 107 | P31946 108 | P34969 109 | P35609 110 | P35916 111 | P35968 112 | P36507 113 | P39877 114 | P42336 115 | P42338 116 | P42684 117 | P43403 118 | P45983 119 | P45984 120 | P47712 121 | P48023 122 | P48736 123 | P49137 124 | P49763 125 | P49765 126 | P49767 127 | P50150 128 | P50151 129 | P51148 130 
| P51812 131 | P51817 132 | P52797 133 | P52798 134 | P52803 135 | P53778 136 | P53779 137 | P53816 138 | P55075 139 | P55196 140 | P55211 141 | P59768 142 | P60763 143 | P60953 144 | P61020 145 | P61224 146 | P61328 147 | P61586 148 | P61952 149 | P62070 150 | P62158 151 | P62330 152 | P62834 153 | P62873 154 | P62879 155 | P62993 156 | P63000 157 | P63211 158 | P63215 159 | P63218 160 | P98077 161 | P98177 162 | Q02750 163 | Q02763 164 | Q04206 165 | Q04864 166 | Q05586 167 | Q06124 168 | Q07817 169 | Q07889 170 | Q07890 171 | Q12879 172 | Q12967 173 | Q13009 174 | Q13043 175 | Q13153 176 | Q13177 177 | Q13224 178 | Q13393 179 | Q13480 180 | Q13554 181 | Q13557 182 | Q13671 183 | Q13972 184 | Q14644 185 | Q14957 186 | Q15283 187 | Q15311 188 | Q15349 189 | Q15389 190 | Q15418 191 | Q15759 192 | Q16539 193 | Q16644 194 | Q3MJ16 195 | Q53H76 196 | Q5R387 197 | Q68DD2 198 | Q6S5L8 199 | Q6VAB6 200 | Q7LDG7 201 | Q7Z569 202 | Q86XP0 203 | Q86YV0 204 | Q8IV61 205 | Q8IVT5 206 | Q8TD86 207 | Q8TDF6 208 | Q8WWW0 209 | Q8WYR1 210 | Q92529 211 | Q92565 212 | Q92569 213 | Q92913 214 | Q92914 215 | Q92915 216 | Q92934 217 | Q96KP1 218 | Q96PV0 219 | Q99996 220 | Q9BX93 221 | Q9BZM1 222 | Q9BZM2 223 | Q9GZP0 224 | Q9GZV9 225 | Q9HAV0 226 | Q9HCT0 227 | Q9NP95 228 | Q9NQU5 229 | Q9NRA1 230 | Q9NS23 231 | Q9NSA1 232 | Q9NZ20 233 | Q9NZK7 234 | Q9NZL6 235 | Q9NZT1 236 | Q9P212 237 | Q9P286 238 | Q9P2W3 239 | Q9UBI6 240 | Q9UHD2 241 | Q9UJF2 242 | Q9UK08 243 | Q9UK32 244 | Q9UNK4 245 | Q9UP65 246 | Q9UQC2 247 | Q9UQM7 248 | Q9Y243 249 | Q9Y264 250 | Q9Y6K9 251 | -------------------------------------------------------------------------------- /R/analysisFunctions.R: -------------------------------------------------------------------------------- 1 | #This function runs the Naive Bayes OncoSig classifier to replicate the results presented in 2 | #the accompanying paper 3 | source("R/rFunctions.R") 4 | #Takes no arguments; reads the two bundled evidence files, trains on one set and scores the other in both directions, and returns a data frame with columns LR_post and Rank ordered by Rank. 5 | runNaiveBayesClassifier <- function(){ 6 | #read in training and
testing set 7 | df_1=read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 8 | df_2=read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 9 | 10 | #set binning parameters (one vector of cut points per feature) 11 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),c(0,0.01,0.05)) 12 | correlated_features=grep("MS_",colnames(df_1),value = TRUE) 13 | 14 | #perform two fold cross validation 15 | message("Calculating LR_posterior for fold two holdout set\n") 16 | the_results_set_1=OncoSigNB(df_1,df_2,the_bins,correlated_features) 17 | message("Calculating LR_posterior for fold one holdout set\n") 18 | the_results_set_2=OncoSigNB(df_2,df_1,the_bins,correlated_features) 19 | 20 | #rank the results (rank 1 = highest LR_post; ranks are computed separately within each holdout fold) 21 | the_results_set_2_rank=cbind(the_results_set_2,rank(-the_results_set_2)) 22 | the_results_set_1_rank=cbind(the_results_set_1,rank(-the_results_set_1)) 23 | temp=rbind(the_results_set_1_rank,the_results_set_2_rank) 24 | temp=as.data.frame(temp) 25 | colnames(temp)=c("LR_post","Rank") 26 | cross_validated_predictions=temp[order(temp$Rank),] 27 | return(cross_validated_predictions) 28 | #function for performing Naive Bayes Classification 29 | 30 | } 31 | 32 | 33 | #This function calls an external script (scripts/quickROC.pl, run from inside Output_files) that generates a ROC Curve 34 | generateROCcurve<- function(object_to_create,column_to_use,predictions_file,gold_standard,pdf_outfile){ 35 | old_wd=setwd("Output_files"); on.exit(setwd(old_wd),add=TRUE) #safety net: restore the caller's working directory if paste/system fails or is interrupted 36 | cmd=paste("../scripts/quickROC.pl -s",object_to_create,"-c",column_to_use,predictions_file,gold_standard,pdf_outfile,sep=" ") 37 | system(cmd) 38 | setwd("..") 39 | #print(cmd) 40 | 41 | } 42 | 43 | #For a dataframe containing geneids and Log fold change values (column 2), this function finds the p-value of each Log Fold Change and writes it to column 3. An empty dataframe is returned unchanged. 44 | #This loop assigns a p-value to each individual shRNA (note that there are multiple shRNas targeting each gene) 45 | getPvalueofLogFC <- function (df_1,density_null){ 46 | if (nrow(df_1)==0) return(df_1); for (i in seq_len(nrow(df_1))) { 47 | number=df_1[i,2]
48 | Avg.pos <- number; 49 | xt <- diff(density_null$x[density_null$x < Avg.pos]); 50 | #integrate over the density to the left of the observed value (rollmean is not defined in this file - presumably zoo::rollmean, TODO confirm) 51 | yt <- rollmean(density_null$y[density_null$x < Avg.pos ],2); 52 | pvalue=sum(xt*yt) 53 | df_1[i,3]=pvalue 54 | #print(i) 55 | } 56 | #Due to errors in integration rounding, some p-values may be above 1, set those to 1. 57 | above_1=which((df_1[,3]) > 1) 58 | df_1[above_1,3]=1 59 | return(df_1) 60 | } 61 | 62 | #This function Integrates pvalues of the same genes using fisher integration: 63 | #Set maximum pvalues to 1, there are some above 1 due to rounding errors in the integration. The input is a dataframe of genes (column Gene) and raw p-values (column 2); the result has one row per gene, sorted by integrated p-value 64 | integratePvaluesbyGene <- function(df_1){ 65 | gene_ids=unique(sort(df_1$Gene)) 66 | Integrated_pvalues=data.frame(row.names = gene_ids) 67 | #Integrate the values using fisher integration 68 | for (i in rownames(Integrated_pvalues)) { 69 | nums=df_1[which(df_1$Gene==i),2] 70 | Integrated_pvalue=fisherIntegration(nums) 71 | Integrated_pvalues[i,1]=Integrated_pvalue 72 | #print(Integrated_pvalue) 73 | } 74 | Integrated_pvalues_2=Integrated_pvalues[order(Integrated_pvalues$V1),,drop=FALSE] 75 | return(Integrated_pvalues_2) 76 | 77 | } 78 | #Runs the shell script scripts/generateROC_curves_OncosigRF.sh (path is relative to the current working directory) 79 | generateROCcurve10OncogenePathways <- function(){ 80 | system("scripts/generateROC_curves_OncosigRF.sh") 81 | } 82 | 83 | #this function gets all pairwise pearson correlations between the columns of two dataframes, and returns them as a vector (df_1 columns vary slowest) 84 | 85 | getPairwiseCordataframes <- function(df_1,df_2){ 86 | to_return=list() 87 | for (i in colnames(df_1)) { 88 | for (j in colnames(df_2)){ 89 | z=cor.test(df_1[,i],df_2[,j]) 90 | #print(c(i,j,z)) 91 | to_return=append(to_return,z$estimate) 92 | } 93 | } 94 | return(unlist(to_return)) 95 | } 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /R/functionsRF.R: -------------------------------------------------------------------------------- 1 | listToMatrix <- function(df){ 2 |
message("Converting Network to Adjacency Matrix...") 3 | na_impute=0; row_ids=as.character(unlist(df[,2])); col_ids=as.character(unlist(df[,1])) #na_impute fills node pairs with no listed edge; endpoint ids are taken as plain character vectors so matrix and data-frame input both work 4 | mat <- matrix(na_impute, length(unique(row_ids)), length(unique(col_ids))) 5 | #mat <- Matrix(0, nrow = length(unique(unlist(df[,2]))), ncol = length(unique(unlist(df[,1]))),sparse=TRUE) 6 | rownames(mat)=sort(unique(row_ids)) 7 | colnames(mat)=sort(unique(col_ids)) 8 | #mat[]=na_impute 9 | #z=nrow(df) 10 | #lapply(1:nrow(df),function (x){mat[df[x,2],df[x,1]]=df[x,3];y=x/z;message (y)}) 11 | #for (x in 1:nrow(df)){mat[df[x,2],df[x,1]]=df[x,3];y=x/z;message (y)} 12 | mat[cbind(row_ids,col_ids)] <- as.numeric(unlist(df[,3])) #two-column character index matrix = (row name, column name) pairs; cbind keeps it a matrix even for a single edge 13 | message("Done.") 14 | return(mat) 15 | #for (x in 1:nrow(df)){ mat[as.character(df[x,2]),as.character(df[x,1])]=df[x,3];message (x)} 16 | } 17 | -------------------------------------------------------------------------------- /R/rFunctions.R: -------------------------------------------------------------------------------- 1 | #For a ROC Curve (blah), given a False positive threshold (num) and the number of positives (num_pos), tell me on the that ROC curve the correspondoing true positive rate 2 | #example: Roc_FPR(.01,perf_ELRON_no_interactions_KRB.R,250) 3 | #This function reports back the FPR threshold used (which will be close to the threshold you input), the Trupe postive rate, the number of true positive found at this threshold and the number of false pistives at this threshold 4 | Roc_FPR <- function(num,blah,num_pos) 5 | { 6 | closest=1 7 | guess=abs(blah@x.values[[1]][1] -num) 8 | for (i in 2:length(blah@x.values[[1]])){ 9 | guess2=abs(blah@x.values[[1]][[i]]-num) 10 | if (guess2 Define data, use random, 51 | ##-- or do help(data=index) for the standard data sets.
52 | 53 | ## The function is currently defined as 54 | function (df_1, the_bins) 55 | { 56 | the_min = -1e+45 57 | the_max = 1e+43 58 | df_1[is.na(df_1)] <- -1e+23 59 | df_1_copy = df_1 60 | for (i in 3:ncol(df_1)) { 61 | df_1_copy[, i] = .bincode(df_1_copy[, i], c(the_min, 62 | the_bins[i - 2][[1]], the_max), right = FALSE) 63 | } 64 | new_bin_info = list() 65 | for (i in 3:ncol(df_1)) { 66 | new_bin_info = append(new_bin_info, list(getLRsgivenBin_info(df_1_copy[, 67 | i], the_bins[i - 2][[1]], df_1_copy[, 2]))) 68 | } 69 | return(new_bin_info) 70 | } 71 | } 72 | % Add one or more standard keywords, see file 'KEYWORDS' in the 73 | % R documentation directory. 74 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 75 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 76 | -------------------------------------------------------------------------------- /man/OncoSig-package.Rd: -------------------------------------------------------------------------------- 1 | \name{OncoSig-package} 2 | \alias{OncoSig-package} 3 | \alias{OncoSig} 4 | \docType{package} 5 | \title{ 6 | \packageTitle{OncoSig} 7 | } 8 | \description{ 9 | \packageDescription{OncoSig} 10 | } 11 | \details{ 12 | 13 | The DESCRIPTION file: 14 | \packageDESCRIPTION{OncoSig} 15 | \packageIndices{OncoSig} 16 | ~~ An overview of how to use the package, including the most important functions ~~ 17 | } 18 | \author{ 19 | \packageAuthor{OncoSig} 20 | 21 | Maintainer: \packageMaintainer{OncoSig} 22 | } 23 | \references{ 24 | ~~ Literature or other references for background information ~~ 25 | } 26 | \keyword{ package } 27 | \seealso{ 28 | ~~ Optional links to other man pages, e.g. 
~~ 29 | ~~ \code{\link[:-package]{}} ~~ 30 | } 31 | \examples{ 32 | ~~ simple examples of the most important functions ~~ 33 | } 34 | -------------------------------------------------------------------------------- /man/OncoSig.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/OncoSigNB.Rd: -------------------------------------------------------------------------------- 1 | \name{OncoSigNB} 2 | \alias{OncoSigNB} 3 | \title{ 4 | Run the OncoSig Naive Bayes Classifier 5 | } 6 | \description{ 7 | This function runs the OncoSig Naive Bayes Classifier, utilizing the user-provided binning parameters. Optionally, it allows the user to provide a list of features that are statistically dependent (and thus violate the independence assumption of Naive Bayes). The output of this function is a dataframe of predictions in the testing set with scores based on training the classifier on the training set with corresponding likelihood ratios. Higher scores correspond to higher confidence predictions to be part of the oncogene-centric map. 8 | } 9 | \usage{ 10 | OncoSigNB(training_set, testing_set, the_bins, correlated_features) 11 | } 12 | \arguments{ 13 | \item{training_set}{ 14 | \code{a dataframe containing the training set} 15 | } 16 | \item{testing_set}{ 17 | \code{a dataframe containing the testing set} 18 | } 19 | \item{the_bins}{ 20 | \code{a list of list of the binning parameters.
This list of list must be in the same order of the features/columns in the training and testing dataframes} 21 | } 22 | \item{correlated_features}{ 23 | \code{a list of correlated features that are statistically dependent. Pass empty list if none} 24 | } 25 | } 26 | \details{ 27 | In both the training and testing set, the first column should be a unique string identifying the datapoint (e.g. a protein id), and the second column is the label (0 or 1). 28 | } 29 | \value{ 30 | returns a dataframe that is the predictions of classifier on the testing set 31 | 32 | } 33 | \references{ 34 | } 35 | \author{ 36 | 37 | } 38 | \note{ 39 | 40 | 41 | } 42 | 43 | \seealso{ 44 | 45 | } 46 | \examples{ 47 | #set bins 48 | df_1=read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 49 | df_2=read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 50 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),c(0,0.01,0.05)) 51 | #specify correlated features 52 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2,the_bins=the_bins,correlated_features =list(correlated_features)) 53 | } 54 | \keyword{ ~Naive Bayes } 55 | -------------------------------------------------------------------------------- /man/OncoSigRF.Rd: -------------------------------------------------------------------------------- 1 | \name{OncoSigRF} 2 | \alias{OncoSigRF} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Create and Generate Predictions Using the OncoSig Random Forest Classifier 6 | } 7 | \description{ 8 | This function creates the OncoSig Random Forest classifer, and returns predictions generated using Monte-Carlo cross validation. Optionally, the forests generated may be saved, which can be used to generate further predictions. 
9 | } 10 | \usage{ 11 | OncoSigRF(Network_matrix_df, Gold_Standard_in_Network_names, Fraction_Gold_sample = NULL, ntrees = NULL, max_iterations = NULL, balance = NULL, to_save = NULL) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{Network_matrix_df}{ 16 | A network feature matrix, where the rownames are the points (e.g. protein or gene names) to build the classifier on, and the columns are features. 17 | } 18 | \item{Gold_Standard_in_Network_names}{ 19 | A list of proteins/genes in the gold standard. 20 | } 21 | \item{Fraction_Gold_sample}{ 22 | Fraction of gold standard to sample in each Monte-Carlo run. (Default: 0.5) 23 | } 24 | \item{ntrees}{ 25 | Number of trees to create in each Random Forest. (Default: 50) 26 | } 27 | \item{max_iterations}{ 28 | Number of iterations of Monte-Carlo samplings to run (i.e. number of forests to create). (Default: 20) 29 | } 30 | \item{balance}{ 31 | Ratio of proteins not in the gold standard to proteins in the gold standard sampled in each run. A balance of 1 corresponds to an equal number of proteins in the gold sample and not. (Default: 1) 32 | } 33 | \item{to_save}{ 34 | Whether to save the forests created (in a file called "All_forests.R"). This argument must be set to 1 if unsupervised OncoSig is to be used. 35 | } 36 | } 37 | \details{ 38 | 39 | } 40 | \value{ 41 | Returns a dataframe corresponding to predictions from the Monte-Carlo cross-validation. Higher scores in the first column correspond to higher confidence predictions to be part of the oncogene-centric map. 42 | } 43 | \references{ 44 | %% ~put references to the literature/web site here ~ 45 | } 46 | \author{ 47 | %% ~~who you are~~ 48 | } 49 | \note{ 50 | %% ~~further notes~~ 51 | } 52 | 53 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 54 | 55 | \seealso{ 56 | 57 | } 58 | \examples{ 59 | % Add one or more standard keywords, see file 'KEYWORDS' in the 60 | % R documentation directory.
61 | library (randomForest) 62 | #Process the network 63 | Network_location="Input_data_files/LUAD/original_network.txt" 64 | Network=read.delim(Network_location,header=F) 65 | Network$V1=as.character(Network$V1) 66 | Network$V2=as.character(Network$V2) 67 | Network$V3=as.numeric(Network$V3) 68 | Network=as.matrix(Network) 69 | Gold_Standard_location="Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt" 70 | Gold_Standard=read.delim(Gold_Standard_location,header=F) 71 | Gold_Standard$V1=as.character(Gold_Standard$V1) 72 | 73 | Network[,3]=as.numeric(Network[,3]) 74 | Network_matrix=listToMatrix(Network) 75 | 76 | 77 | #Convert Matrix to Dataframe for future steps 78 | Network_matrix_df=as.data.frame(Network_matrix) 79 | #Remove Members of Gold Standard Not in the Network: 80 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df),Gold_Standard$V1) 81 | Negative_Set_names=setdiff(rownames(Network_matrix_df),Gold_Standard_in_Network_names) 82 | #Create Negative_standard 83 | #save(Network_matrix,file="Network_matrix.Rda") 84 | remove(Network_matrix) 85 | 86 | Query_output_results=OncoSigRF(Network_matrix_df,Gold_Standard_in_Network_names,max_iterations = 5) 87 | Query_output_results_scores=Query_output_results[[1]] 88 | } 89 | \keyword{ Random Forest }% use one of RShowDoc("KEYWORDS") 90 | \keyword{ OncoSig }% __ONLY ONE__ keyword per line 91 | \keyword{ Monte Carlo }% __ONLY ONE__ keyword per line 92 | -------------------------------------------------------------------------------- /man/OncoSigUnsup.Rd: -------------------------------------------------------------------------------- 1 | \name{OncoSigUnsup} 2 | \alias{OncoSigUnsup} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Generatres Predictions given a precomputed Oncosig Classifier. 
6 | } 7 | \description{ 8 | Given an OncoSig Random Forest classifier that is generated for an arbitrary Oncogene-Centric Map, this function generates predictions for a new Oncogene/Tumor Suppressor Y, whose features are specified in the Network location. This function should be used for Oncogenes/Tumor Suppressors that do not have a gold standard for supervised learning. 9 | } 10 | \usage{ 11 | OncoSigUnsup(Network_location, forest_location) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{Network_location}{ 16 | This is the file that specifies the features for the Oncogene/Tumor Suppressor. The feature names must be the same as the features used to generate the classifier, or an error will be reported. The format of the file is the same as that passed to the function listToMatrix(). 17 | } 18 | \item{forest_location}{ 19 | The location of the classifier generated by OncoSigRF. 20 | } 21 | } 22 | \details{ 23 | This function takes in one network location and one forest location. 24 | } 25 | \value{ 26 | %% ~Describe the value returned 27 | %% If it is a LIST, use 28 | %% \item{comp1 }{Description of 'comp1'} 29 | %% \item{comp2 }{Description of 'comp2'} 30 | %% ... 31 | } 32 | \references{ 33 | %% ~put references to the literature/web site here ~ 34 | } 35 | \author{ 36 | %% ~~who you are~~ 37 | } 38 | \note{ 39 | %% ~~further notes~~ 40 | } 41 | 42 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 43 | 44 | \seealso{ 45 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 46 | } 47 | \examples{ 48 | #Predict KRAS Oncogene-centric map based on EGFR forest 49 | KRAS_features="Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt" 50 | EGFR_forest="Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r" 51 | results=OncoSigUnsup(KRAS_features,EGFR_forest) 52 | } 53 | % Add one or more standard keywords, see file 'KEYWORDS' in the 54 | % R documentation directory.
55 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 56 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 57 | -------------------------------------------------------------------------------- /man/R/OncoSig.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/man/R/OncoSig.R -------------------------------------------------------------------------------- /man/R/OncoSigNB.Rd: -------------------------------------------------------------------------------- 1 | un the OncoSig Naive Bayes Classifier 2 | \title{OncoSig Naive Bayes} 3 | \description{ 4 | Run the OncoSig Naive Bayes Classifier 5 | \code{save}. 6 | } 7 | \usage{ 8 | load(file, envir = parent.frame()) 9 | } 10 | \arguments{ 11 | \item{training_set}{a dataframe containing the training set} 12 | \item{testing_set}{a dataframe of the testing set} 13 | \item{the_bins}{a list of list of the binning parameters} 14 | \item{correlated_features}{a list of correlated features that are statistically dependent. Pass empty list if none} 15 | } 16 | \seealso{ 17 | \code{\link{save}}. 
18 | } 19 | \examples{ 20 | ## set bins;get correlated features 21 | df_1=read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 22 | df_2=read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 23 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),c(0,0.01,0.05)) 24 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2,the_bins=the_bins,correlated_features =list()) 25 | 26 | \keyword{file} 27 | -------------------------------------------------------------------------------- /man/computeLRsgivenBins.Rd: -------------------------------------------------------------------------------- 1 | \name{computeLRsgivenBins} 2 | \alias{computeLRsgivenBins} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Compute Likelihood Ratios (LR) for testing set 6 | } 7 | \description{ 8 | This is a helper function for OncoSigNB. After training on a training set, this function computes LRs on a new testing set. Note that labels must be provided for the testing set as well. 9 | } 10 | \usage{ 11 | computeLRsgivenBins(df_1, the_bins, the_bins_info) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{df_1}{ 16 | %% ~~Describe \code{df_1} here~~ 17 | } 18 | \item{the_bins}{ 19 | %% ~~Describe \code{the_bins} here~~ 20 | } 21 | \item{the_bins_info}{ 22 | %% ~~Describe \code{the_bins_info} here~~ 23 | } 24 | } 25 | \details{ 26 | %% ~~ If necessary, more details than the description above ~~ 27 | } 28 | \value{ 29 | %% ~Describe the value returned 30 | %% If it is a LIST, use 31 | %% \item{comp1 }{Description of 'comp1'} 32 | %% \item{comp2 }{Description of 'comp2'} 33 | %% ... 
34 | } 35 | \references{ 36 | %% ~put references to the literature/web site here ~ 37 | } 38 | \author{ 39 | %% ~~who you are~~ 40 | } 41 | \note{ 42 | %% ~~further notes~~ 43 | } 44 | 45 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 46 | 47 | \seealso{ 48 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 49 | } 50 | \examples{ 51 | ##---- Should be DIRECTLY executable !! ---- 52 | ##-- ==> Define data, use random, 53 | ##-- or do help(data=index) for the standard data sets. 54 | 55 | ## The function is currently defined as 56 | function (df_1, the_bins, the_bins_info) 57 | { 58 | the_min = -1e+45 59 | the_max = 1e+43 60 | df_1[is.na(df_1)] <- -1e+23 61 | df_1_copy = df_1 62 | for (i in 3:ncol(df_1)) { 63 | df_1_copy[, i] = .bincode(df_1_copy[, i], c(the_min, 64 | the_bins[i - 2][[1]], the_max), right = FALSE) 65 | } 66 | for (i in 3:ncol(df_1)) { 67 | df_1_copy[, i] = replaceBinswithLR(df_1_copy[, i], the_bins_info[i - 68 | 2][[1]]) 69 | } 70 | return(df_1_copy) 71 | } 72 | } 73 | % Add one or more standard keywords, see file 'KEYWORDS' in the 74 | % R documentation directory. 75 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 76 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 77 | -------------------------------------------------------------------------------- /man/getFinalLR.Rd: -------------------------------------------------------------------------------- 1 | \name{getFinalLR} 2 | \alias{getFinalLR} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Multipy LRs to get final LR 6 | } 7 | \description{ 8 | This is a helper function for OncoSigNB(). This function gets the final LR given the input dataframe. The first two columns are the name and the response variable, columns 3 and up feature values. 9 | } 10 | \usage{ 11 | getFinalLR(df_1) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 
14 | \arguments{ 15 | \item{df_1}{ 16 | %% ~~Describe \code{df_1} here~~ 17 | } 18 | } 19 | \details{ 20 | %% ~~ If necessary, more details than the description above ~~ 21 | } 22 | \value{ 23 | %% ~Describe the value returned 24 | %% If it is a LIST, use 25 | %% \item{comp1 }{Description of 'comp1'} 26 | %% \item{comp2 }{Description of 'comp2'} 27 | %% ... 28 | } 29 | \references{ 30 | %% ~put references to the literature/web site here ~ 31 | } 32 | \author{ 33 | %% ~~who you are~~ 34 | } 35 | \note{ 36 | %% ~~further notes~~ 37 | } 38 | 39 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 40 | 41 | \seealso{ 42 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 43 | } 44 | \examples{ 45 | ##---- Should be DIRECTLY executable !! ---- 46 | ##-- ==> Define data, use random, 47 | ##-- or do help(data=index) for the standard data sets. 48 | 49 | ## The function is currently defined as 50 | function (df_1) 51 | { 52 | to_return = lapply(1:nrow(df_1), function(x) { 53 | prod(df_1[x, 3:ncol(df_1)]) 54 | }) 55 | return(to_return) 56 | } 57 | } 58 | % Add one or more standard keywords, see file 'KEYWORDS' in the 59 | % R documentation directory. 60 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 61 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 62 | -------------------------------------------------------------------------------- /man/getLRsgivenBin_info.Rd: -------------------------------------------------------------------------------- 1 | \name{getLRsgivenBin_info} 2 | \alias{getLRsgivenBin_info} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Return vector of Likelihood Ratios for each bin. 6 | } 7 | \description{ 8 | This is a helper function for OncoSigNB. Given a bin vector and the gold standard vector (i.e. the two vectors of the same length), return the Likelihood Ratio vector. 
9 | } 10 | \usage{ 11 | getLRsgivenBin_info(bin_vector, the_bin, label_vector) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{bin_vector}{ 16 | %% ~~Describe \code{bin_vector} here~~ 17 | } 18 | \item{the_bin}{ 19 | %% ~~Describe \code{the_bin} here~~ 20 | } 21 | \item{label_vector}{ 22 | %% ~~Describe \code{label_vector} here~~ 23 | } 24 | } 25 | \details{ 26 | %% ~~ If necessary, more details than the description above ~~ 27 | } 28 | \value{ 29 | %% ~Describe the value returned 30 | %% If it is a LIST, use 31 | %% \item{comp1 }{Description of 'comp1'} 32 | %% \item{comp2 }{Description of 'comp2'} 33 | %% ... 34 | } 35 | \references{ 36 | %% ~put references to the literature/web site here ~ 37 | } 38 | \author{ 39 | %% ~~who you are~~ 40 | } 41 | \note{ 42 | %% ~~further notes~~ 43 | } 44 | 45 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 46 | 47 | \seealso{ 48 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 49 | } 50 | \examples{ 51 | ##---- Should be DIRECTLY executable !! ---- 52 | ##-- ==> Define data, use random, 53 | ##-- or do help(data=index) for the standard data sets. 54 | 55 | ## The function is currently defined as 56 | function (bin_vector, the_bin, label_vector) 57 | { 58 | the_bins_new = the_bin 59 | bin_vector_new = bin_vector 60 | prior = table(label_vector)[2]/table(label_vector)[1] 61 | bin_vector_2 = unique(sort(bin_vector)) 62 | bin_vector_3 = rep(0, length(bin_vector_2)) 63 | for (i in 1:length(bin_vector_2)) { 64 | the_num = bin_vector_2[i] 65 | ratio_1 = table(label_vector[bin_vector == i])[2]/table(label_vector[bin_vector == 66 | i])[1] 67 | LR = ratio_1/prior 68 | bin_vector_3[i] = LR 69 | } 70 | names(bin_vector_3) = bin_vector_2 71 | return(bin_vector_3) 72 | } 73 | } 74 | % Add one or more standard keywords, see file 'KEYWORDS' in the 75 | % R documentation directory. 
76 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 77 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 78 | -------------------------------------------------------------------------------- /man/getMaxLR.Rd: -------------------------------------------------------------------------------- 1 | \name{getMaxLR} 2 | \alias{getMaxLR} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Get maximum LR for correlated features 6 | } 7 | \description{ 8 | This is a helper function for OncosigNB;Given the specified columns as a dataframe, return the maximum LR for each case. Note that these columns should be highly correlated. 9 | } 10 | \usage{ 11 | getMaxLR(df_1) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{df_1}{ 16 | %% ~~Describe \code{df_1} here~~ 17 | } 18 | } 19 | \details{ 20 | %% ~~ If necessary, more details than the description above ~~ 21 | } 22 | \value{ 23 | %% ~Describe the value returned 24 | %% If it is a LIST, use 25 | %% \item{comp1 }{Description of 'comp1'} 26 | %% \item{comp2 }{Description of 'comp2'} 27 | %% ... 28 | } 29 | \references{ 30 | %% ~put references to the literature/web site here ~ 31 | } 32 | \author{ 33 | %% ~~who you are~~ 34 | } 35 | \note{ 36 | %% ~~further notes~~ 37 | } 38 | 39 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 40 | 41 | \seealso{ 42 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 43 | } 44 | \examples{ 45 | ##---- Should be DIRECTLY executable !! ---- 46 | ##-- ==> Define data, use random, 47 | ##-- or do help(data=index) for the standard data sets. 48 | 49 | ## The function is currently defined as 50 | function (df_1) 51 | { 52 | the_max_results = lapply(1:nrow(df_1), function(x) { 53 | max(df_1[x, ]) 54 | }) 55 | the_max_results = unlist(the_max_results) 56 | return(the_max_results) 57 | } 58 | } 59 | % Add one or more standard keywords, see file 'KEYWORDS' in the 60 | % R documentation directory. 
61 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 62 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 63 | -------------------------------------------------------------------------------- /man/listToMatrix.Rd: -------------------------------------------------------------------------------- 1 | \name{listToMatrix} 2 | \alias{listToMatrix} 3 | \title{ 4 | Convert adjacency list to adjacency matrix 5 | } 6 | \description{ 7 | Convert an adjacency list (as a data frame) to an adjacency matrix. The first column will become the columns, 8 | the second column the rows, and the third column the weights of the edges. 9 | } 10 | 11 | \usage{ 12 | 13 | listToMatrix(df) 14 | } 15 | \arguments{ 16 | \item{df}{ 17 | 18 | } 19 | } 20 | \details{ 21 | } 22 | \value{ 23 | Returns an adjacency matrix as a matrix object. 24 | } 25 | \references{ 26 | %% ~put references to the literature/web site here ~ 27 | } 28 | \author{ 29 | %% ~~who you are~~ 30 | } 31 | \note{ 32 | %% ~~further notes~~ 33 | } 34 | 35 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 36 | 37 | \seealso{ 38 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 39 | } 40 | \examples{ 41 | Network_location="Input_data_files/LUAD/original_network.txt" 42 | Network=read.delim(Network_location,header=F) 43 | Network$V1=as.character(Network$V1) 44 | Network$V2=as.character(Network$V2) 45 | Network$V3=as.numeric(Network$V3) 46 | Network=as.matrix(Network) 47 | Gold_Standard_location="Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt" 48 | Gold_Standard=read.delim(Gold_Standard_location,header=F) 49 | Gold_Standard$V1=as.character(Gold_Standard$V1) 50 | 51 | Network[,3]=as.numeric(Network[,3]) 52 | Network_matrix=listToMatrix(Network) 53 | } 54 | % Add one or more standard keywords, see file 'KEYWORDS' in the 55 | % R documentation directory.
56 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 57 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 58 | -------------------------------------------------------------------------------- /man/replaceBinswithLR.Rd: -------------------------------------------------------------------------------- 1 | \name{replaceBinswithLR} 2 | \alias{replaceBinswithLR} 3 | %- Also NEED an '\alias' for EACH other topic documented here. 4 | \title{ 5 | Replace bins with LRs 6 | } 7 | \description{ 8 | This is a helper function for OncoSigNB. Given the bin info and the binned data, replace each bin with the corresponding likelihood ratio. 9 | } 10 | \usage{ 11 | replaceBinswithLR(bin_vector, the_bin_info) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{bin_vector}{ 16 | %% ~~Describe \code{bin_vector} here~~ 17 | } 18 | \item{the_bin_info}{ 19 | %% ~~Describe \code{the_bin_info} here~~ 20 | } 21 | } 22 | \details{ 23 | %% ~~ If necessary, more details than the description above ~~ 24 | } 25 | \value{ 26 | %% ~Describe the value returned 27 | %% If it is a LIST, use 28 | %% \item{comp1 }{Description of 'comp1'} 29 | %% \item{comp2 }{Description of 'comp2'} 30 | %% ... 31 | } 32 | \references{ 33 | %% ~put references to the literature/web site here ~ 34 | } 35 | \author{ 36 | %% ~~who you are~~ 37 | } 38 | \note{ 39 | %% ~~further notes~~ 40 | } 41 | 42 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 43 | 44 | \seealso{ 45 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 46 | } 47 | \examples{ 48 | ##---- Should be DIRECTLY executable !! ---- 49 | ##-- ==> Define data, use random, 50 | ##-- or do help(data=index) for the standard data sets.
51 | 52 | ## The function is currently defined as 53 | function (bin_vector, the_bin_info) 54 | { 55 | new_bin_vector = lapply(1:len(bin_vector), function(x) { 56 | the_bin = as.character(bin_vector[x]) 57 | bin_value = the_bin_info[the_bin] 58 | bin_value 59 | }) 60 | new_bin_vector = unlist(new_bin_vector) 61 | return(new_bin_vector) 62 | } 63 | } 64 | % Add one or more standard keywords, see file 'KEYWORDS' in the 65 | % R documentation directory. 66 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 67 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 68 | -------------------------------------------------------------------------------- /man/runNaiveBayesClassifier.Rd: -------------------------------------------------------------------------------- 1 | \name{runNaiveBayesClassifier} 2 | \alias{runNaiveBayesClassifier} 3 | \title{ 4 | Run the OncoSig Naive Bayes Classifier as used in the accompanying paper. 5 | } 6 | \description{ 7 | } 8 | \usage{ 9 | runNaiveBayesClassifier() 10 | } 11 | \details{ 12 | 13 | } 14 | \value{ 15 | } 16 | \references{ 17 | } 18 | \author{ 19 | } 20 | \note{ 21 | } 22 | 23 | 24 | \seealso{ 25 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 26 | } 27 | \examples{ 28 | runNaiveBayesClassifier() 29 | ## The function is currently defined as 30 | function () 31 | { 32 | df_1 = read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt", 33 | header = TRUE) 34 | df_2 = read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt", 35 | header = TRUE) 36 | the_bins = list(c(0, 40, 200, 1200), c(0, 0.1), c(-2, -0.15, 37 | -0.02, 0.0925), c(1, 2, 6), c(0, 0.25), c(1, 3, 20), 38 | c(1, 4, 20), c(1, 4, 20), c(0, 1e-04, 0.9999), c(0, 0.01, 39 | 0.05)) 40 | correlated_features = grep("MS_", colnames(df_1), value = TRUE) 41 | message("Calculating LR_posterior for fold two holdout set\n") 42 | the_results_set_1 = OncoSigNB(df_1, df_2, the_bins, correlated_features) 43 | message("Calculating LR_posterior for fold one holdout set\n") 44 |
the_results_set_2 = OncoSigNB(df_2, df_1, the_bins, correlated_features) 45 | the_results_set_2_rank = cbind(the_results_set_2, rank(-the_results_set_2)) 46 | the_results_set_1_rank = cbind(the_results_set_1, rank(-the_results_set_1)) 47 | temp = rbind(the_results_set_1_rank, the_results_set_2_rank) 48 | temp = as.data.frame(temp) 49 | colnames(temp) = c("LR_post", "Rank") 50 | cross_validated_predictions = temp[order(temp$Rank), ] 51 | return(cross_validated_predictions) 52 | } 53 | } 54 | % Add one or more standard keywords, see file 'KEYWORDS' in the 55 | % R documentation directory. 56 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 57 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 58 | -------------------------------------------------------------------------------- /vignettes/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/vignettes/.DS_Store -------------------------------------------------------------------------------- /vignettes/OncoSig-concordance.tex: -------------------------------------------------------------------------------- 1 | \Sconcordance{concordance:OncoSig.tex:OncoSig.Rnw:% 2 | 1 6 1 1 0 3 1 1 4 11 1 1 2 4 0 1 2 4 1 1 2 1 0 1 1 1 3 2 0 1 2 % 3 | 4 0 1 2 3 1 1 2 1 0 1 1 17 0 1 2 4 1 1 2 4 0 2 2 1 0 7 1 3 0 2 % 4 | 2 12 0 1 2 1 1 1 3 2 0 2 1 3 0 1 2 1 1 1 3 2 0 1 3 2 0 1 2 1 0 % 5 | 1 1 3 0 1 2 1 3 2 0 1 1 3 0 1 2 2 1 1 3 2 0 1 2 1 0 1 1 3 0 1 % 6 | 2 2 1} 7 | -------------------------------------------------------------------------------- /vignettes/OncoSig.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{hyperref} 3 | \usepackage{hyperref} 4 | %\VignetteIndexEntry{Using OncoSig} 5 | \title{Using the OncoSig Classifiers to Discover Novel Oncoprotein network Dependencies} 6 | \date{\today} 7 | \begin{document} 8 | 
\author{Joshua Broyde, Diana Murray, Barry Honig, Andrea Califano\\Columbia University, New York, USA} 9 | \SweaveOpts{concordance=TRUE} 10 | \maketitle 11 | <>= 12 | options(width=70) 13 | @ 14 | 15 | \section*{Introduction} 16 | 17 | OncoSig comprises a set of machine learning approaches for determining novel sets of gene products (i.e. genes or proteins) that support 18 | the activity of an oncogene or tumor suppressor (i.e. Oncoprotein-Centric Map, 19 | or OC-map). This is relevant for determining which genes/proteins are involved in an oncoprotein's functional network. OncoSig queries a molecular interaction network or other features regarding a protein's function to predict novel members of the OC-map. This molecular interaction network could contain features such as protein-protein interactions, or gene regulatory networks. 20 | OncoSig can be used primarily in two ways, in a supervised or unsupervised fashion. In the supervised fashion OncoSig uses either a Naive Bayes or Random Forest classifier to train on the molecular interaction network and a gold standard of known members of a particular OC-Map (for example, the members of the KRAS signaling pathway). This approach is appropriate for cases where some members of an OC-Map are known and one wants to leverage the known ones to predict other OC-Map members. 21 | 22 | In cases where a gold standard is not known, OncoSig can be used in an unsupervised fashion, where an OC-Map trained on a well characterized Oncoprotein is applied to one that is poorly characterized. This usage is appropriate where there is no gold standard for a particular Oncoprotein. 23 | 24 | \section*{Installation and loading} 25 | After first installing R (\href{http://www.r-project.org}{http://www.r-project.org}) and the OncoSig library, load OncoSig.
26 | <>= 27 | library("OncoSig") 28 | @ 29 | 30 | \section*{OncoSig Naive Bayes Classifier} 31 | The OncoSig Naive Bayes (OncoSigNB) Classifier is a supervised learning approach that is well suited to discovering OC-Map members when there are a small number of features describing each gene product and when the features have no or low statistical dependence. To run OncoSigNB, we create dataframes that correspond to the training and testing sets (which are labeled as 1 or 0, if they are in the gold standard OC-Map or not, respectively). 32 | For example: 33 | 34 | <<>>= 35 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 36 | df_2=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 37 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6), 38 | c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999), 39 | c(0,0.01,0.05)) 40 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2, 41 | the_bins=the_bins,correlated_features =list()) 42 | @ 43 | 44 | In this example, we specified the training and testing sets, how to bin the data for each feature, and passed an empty list to indicate that there are no correlated features. 45 | 46 | The input training and testing sets should be formatted like so. 47 | <<>>= 48 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 49 | df_1[1:5,] 50 | @ 51 | Note that when a feature is missing, it is simply coded as NA. The first column is the name (e.g. a gene name), the second column is the label, and all later columns are features. 52 | 53 | \section*{OncoSigRF} 54 | Supervised classification can also be run using the OncoSig Random Forest (OncoSig RF) classifier. Random Forests can be used when the features are statistically dependent, and can more easily be used when the features number in the thousands or more.
We recommend OncoSigRF when integrating a molecular interaction network that may contain many tens of thousands (note that when using a network of more than a few hundred thousand interactions, OncoSig RF will require a few hours and a prohibitive amount of memory to run). 55 | 56 | <<>>= 57 | library (randomForest) 58 | @ 59 | First we read in the molecular interaction network and convert it to a matrix 60 | <<>>= 61 | Network_location="~/OncoSig/Input_data_files/LUAD/original_network_sample.txt" 62 | Network=read.delim(Network_location,header=F) 63 | Network$V1=as.character(Network$V1) 64 | Network$V2=as.character(Network$V2) 65 | Network$V3=as.numeric(Network$V3) 66 | Network=as.matrix(Network) 67 | Network[,3]=as.numeric(Network[,3]) 68 | Network_matrix=listToMatrix(Network) 69 | @ 70 | Note the format of the input network. The first column is the feature name, the second column is the gene product (i.e. the row of data), and the third column is the score. 71 | <<>>= 72 | Network[1:5,] 73 | @ 74 | 75 | Now read in and process the gold standard 76 | <<>>= 77 | Gold_Standard_location= 78 | "~/OncoSig/Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt" 79 | Gold_Standard=read.delim(Gold_Standard_location,header=F) 80 | Gold_Standard$V1=as.character(Gold_Standard$V1) 81 | @ 82 | 83 | preprocess the data 84 | <<>>= 85 | #Convert Matrix to Dataframe for future steps 86 | Network_matrix_df=as.data.frame(Network_matrix) 87 | #Remove Members of Gold Standard Not in the Network: 88 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df), 89 | Gold_Standard$V1) 90 | Negative_Set_names=setdiff(rownames(Network_matrix_df), 91 | Gold_Standard_in_Network_names) 92 | remove(Network_matrix) #delete Matrix. 
93 | @ 94 | Run the OncoSigRF Classifier 95 | <>= 96 | Query_output_results=OncoSigRF(Network_matrix_df, 97 | Gold_Standard_in_Network_names, max_iterations=5) 98 | Query_output_results_scores=as.data.frame(Query_output_results[[1]]) 99 | @ 100 | 101 | \section*{Unsupervised OncoSig} 102 | Supervised classification is only applicable when a gold standard suitable for training can be found. However, some Oncogenes/Tumor Suppressors may have no known gene product dependencies. In this case, we can apply a forest created specifically using one Oncogene/Tumor Suppressor and apply it to another. See the documentation for the OncoSigUnsup function for further details. In this example, we read in a random forest created using features for the EGFR oncogene and apply it to the KRAS oncogene. 103 | <>= 104 | KRAS_features= 105 | "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt" 106 | EGFR_forest= 107 | "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r" 108 | results=OncoSigUnsup(KRAS_features,EGFR_forest) 109 | @ 110 | 111 | 112 | \end{document} -------------------------------------------------------------------------------- /vignettes/OncoSig.log: -------------------------------------------------------------------------------- 1 | This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) (preloaded format=pdflatex 2016.12.27) 13 DEC 2018 04:52 2 | entering extended mode 3 | restricted \write18 enabled. 4 | %&-line parsing enabled. 5 | **OncoSig.tex 6 | (./OncoSig.tex 7 | LaTeX2e <2016/03/31> 8 | Babel <3.9r> and hyphenation patterns for 22 language(s) loaded. 
9 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/article.cls 10 | Document Class: article 2014/09/29 v1.4h Standard LaTeX document class 11 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/size10.clo 12 | File: size10.clo 2014/09/29 v1.4h Standard LaTeX file (size option) 13 | ) 14 | \c@part=\count79 15 | \c@section=\count80 16 | \c@subsection=\count81 17 | \c@subsubsection=\count82 18 | \c@paragraph=\count83 19 | \c@subparagraph=\count84 20 | \c@figure=\count85 21 | \c@table=\count86 22 | \abovecaptionskip=\skip41 23 | \belowcaptionskip=\skip42 24 | \bibindent=\dimen102 25 | ) 26 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/hyperref.sty 27 | Package: hyperref 2016/05/05 v6.83n Hypertext links for LaTeX 28 | 29 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.s 30 | ty 31 | Package: hobsub-hyperref 2016/05/16 v1.14 Bundle oberdiek, subset hyperref (HO) 32 | 33 | 34 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/oberdiek/hobsub-generic.st 35 | y 36 | Package: hobsub-generic 2016/05/16 v1.14 Bundle oberdiek, subset generic (HO) 37 | Package: hobsub 2016/05/16 v1.14 Construct package bundles (HO) 38 | Package: infwarerr 2016/05/16 v1.4 Providing info/warning/error messages (HO) 39 | Package: ltxcmds 2016/05/16 v1.23 LaTeX kernel commands for general use (HO) 40 | Package: ifluatex 2016/05/16 v1.4 Provides the ifluatex switch (HO) 41 | Package ifluatex Info: LuaTeX not detected. 42 | Package: ifvtex 2016/05/16 v1.6 Detect VTeX and its facilities (HO) 43 | Package ifvtex Info: VTeX not detected. 44 | Package: intcalc 2016/05/16 v1.2 Expandable calculations with integers (HO) 45 | Package: ifpdf 2016/05/14 v3.1 Provides the ifpdf switch 46 | Package: etexcmds 2016/05/16 v1.6 Avoid name clashes with e-TeX commands (HO) 47 | Package etexcmds Info: Could not find \expanded. 48 | (etexcmds) That can mean that you are not using pdfTeX 1.50 or 49 | (etexcmds) that some package has redefined \expanded. 
50 | (etexcmds) In the latter case, load this package earlier. 51 | Package: kvsetkeys 2016/05/16 v1.17 Key value parser (HO) 52 | Package: kvdefinekeys 2016/05/16 v1.4 Define keys (HO) 53 | Package: pdftexcmds 2016/05/10 v0.21 Utility functions of pdfTeX for LuaTeX (HO 54 | ) 55 | Package pdftexcmds Info: LuaTeX not detected. 56 | Package pdftexcmds Info: \pdf@primitive is available. 57 | Package pdftexcmds Info: \pdf@ifprimitive is available. 58 | Package pdftexcmds Info: \pdfdraftmode found. 59 | Package: pdfescape 2016/05/16 v1.14 Implements pdfTeX's escape features (HO) 60 | Package: bigintcalc 2016/05/16 v1.4 Expandable calculations on big integers (HO 61 | ) 62 | Package: bitset 2016/05/16 v1.2 Handle bit-vector datatype (HO) 63 | Package: uniquecounter 2016/05/16 v1.3 Provide unlimited unique counter (HO) 64 | ) 65 | Package hobsub Info: Skipping package `hobsub' (already loaded). 66 | Package: letltxmacro 2016/05/16 v1.5 Let assignment for LaTeX macros (HO) 67 | Package: hopatch 2016/05/16 v1.3 Wrapper for package hooks (HO) 68 | Package: xcolor-patch 2016/05/16 xcolor patch 69 | Package: atveryend 2016/05/16 v1.9 Hooks at the very end of document (HO) 70 | Package atveryend Info: \enddocument detected (standard20110627). 
71 | Package: atbegshi 2016/05/16 v1.17 At begin shipout hook (HO) 72 | Package: refcount 2016/05/16 v3.5 Data extraction from label references (HO) 73 | Package: hycolor 2016/05/16 v1.8 Color options for hyperref/bookmark (HO) 74 | ) (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/keyval.sty 75 | Package: keyval 2014/10/28 v1.15 key=value parser (DPC) 76 | \KV@toks@=\toks14 77 | ) 78 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/ifxetex/ifxetex.sty 79 | Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional 80 | ) 81 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/auxhook.sty 82 | Package: auxhook 2016/05/16 v1.4 Hooks for auxiliary files (HO) 83 | ) 84 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/kvoptions.sty 85 | Package: kvoptions 2016/05/16 v3.12 Key value format for package options (HO) 86 | ) 87 | \@linkdim=\dimen103 88 | \Hy@linkcounter=\count87 89 | \Hy@pagecounter=\count88 90 | 91 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/pd1enc.def 92 | File: pd1enc.def 2016/05/05 v6.83n Hyperref: PDFDocEncoding definition (HO) 93 | ) 94 | \Hy@SavedSpaceFactor=\count89 95 | 96 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/latexconfig/hyperref.cfg 97 | File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive 98 | ) 99 | Package hyperref Info: Hyper figures OFF on input line 4446. 100 | Package hyperref Info: Link nesting OFF on input line 4451. 101 | Package hyperref Info: Hyper index ON on input line 4454. 102 | Package hyperref Info: Plain pages OFF on input line 4461. 103 | Package hyperref Info: Backreferencing OFF on input line 4466. 104 | Package hyperref Info: Implicit mode ON; LaTeX internals redefined. 105 | Package hyperref Info: Bookmarks ON on input line 4691. 106 | \c@Hy@tempcnt=\count90 107 | 108 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/url/url.sty 109 | \Urlmuskip=\muskip10 110 | Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc. 
111 | ) 112 | LaTeX Info: Redefining \url on input line 5044. 113 | \XeTeXLinkMargin=\dimen104 114 | \Fld@menulength=\count91 115 | \Field@Width=\dimen105 116 | \Fld@charsize=\dimen106 117 | Package hyperref Info: Hyper figures OFF on input line 6298. 118 | Package hyperref Info: Link nesting OFF on input line 6303. 119 | Package hyperref Info: Hyper index ON on input line 6306. 120 | Package hyperref Info: backreferencing OFF on input line 6313. 121 | Package hyperref Info: Link coloring OFF on input line 6318. 122 | Package hyperref Info: Link coloring with OCG OFF on input line 6323. 123 | Package hyperref Info: PDF/A mode OFF on input line 6328. 124 | LaTeX Info: Redefining \ref on input line 6368. 125 | LaTeX Info: Redefining \pageref on input line 6372. 126 | \Hy@abspage=\count92 127 | \c@Item=\count93 128 | \c@Hfootnote=\count94 129 | ) 130 | 131 | Package hyperref Message: Driver (autodetected): hpdftex. 132 | 133 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/hpdftex.def 134 | File: hpdftex.def 2016/05/05 v6.83n Hyperref driver for pdfTeX 135 | \Fld@listcount=\count95 136 | \c@bookmark@seq@number=\count96 137 | 138 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/rerunfilecheck.sty 139 | Package: rerunfilecheck 2016/05/16 v1.8 Rerun checks for auxiliary files (HO) 140 | Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2 141 | 82. 
142 | ) 143 | \Hy@SectionHShift=\skip43 144 | ) 145 | (/usr/local/Cellar/r/3.4.0_1/R.framework/Resources/share/texmf/tex/latex/Sweave 146 | .sty 147 | Package: Sweave 148 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/ifthen.sty 149 | Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC) 150 | ) 151 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/graphicx.sty 152 | Package: graphicx 2014/10/28 v1.0g Enhanced LaTeX Graphics (DPC,SPQR) 153 | 154 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/graphics.sty 155 | Package: graphics 2016/05/09 v1.0r Standard LaTeX Graphics (DPC,SPQR) 156 | 157 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/trig.sty 158 | Package: trig 2016/01/03 v1.10 sin cos tan (DPC) 159 | ) 160 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics-cfg/graphics.cfg 161 | File: graphics.cfg 2016/01/02 v1.10 sample graphics configuration 162 | ) 163 | Package graphics Info: Driver file: pdftex.def on input line 96. 
164 | 165 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/pdftex-def/pdftex.def 166 | File: pdftex.def 2011/05/27 v0.06d Graphics/color for pdfTeX 167 | \Gread@gobject=\count97 168 | )) 169 | \Gin@req@height=\dimen107 170 | \Gin@req@width=\dimen108 171 | ) 172 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/fancyvrb/fancyvrb.sty 173 | Package: fancyvrb 2008/02/07 174 | 175 | Style option: `fancyvrb' v2.7a, with DG/SPQR fixes, and firstline=lastline fix 176 | <2008/02/07> (tvz) 177 | \FV@CodeLineNo=\count98 178 | \FV@InFile=\read1 179 | \FV@TabBox=\box26 180 | \c@FancyVerbLine=\count99 181 | \FV@StepNumber=\count100 182 | \FV@OutFile=\write3 183 | ) 184 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/textcomp.sty 185 | Package: textcomp 2005/09/27 v1.99g Standard LaTeX package 186 | Package textcomp Info: Sub-encoding information: 187 | (textcomp) 5 = only ISO-Adobe without \textcurrency 188 | (textcomp) 4 = 5 + \texteuro 189 | (textcomp) 3 = 4 + \textohm 190 | (textcomp) 2 = 3 + \textestimated + \textcurrency 191 | (textcomp) 1 = TS1 - \textcircled - \t 192 | (textcomp) 0 = TS1 (full) 193 | (textcomp) Font families with sub-encoding setting implement 194 | (textcomp) only a restricted character set as indicated. 195 | (textcomp) Family '?' is the default used for unknown fonts. 196 | (textcomp) See the documentation for details. 197 | Package textcomp Info: Setting ? sub-encoding to TS1/1 on input line 79. 198 | 199 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/ts1enc.def 200 | File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file 201 | ) 202 | LaTeX Info: Redefining \oldstylenums on input line 334. 203 | Package textcomp Info: Setting cmr sub-encoding to TS1/0 on input line 349. 204 | Package textcomp Info: Setting cmss sub-encoding to TS1/0 on input line 350. 205 | Package textcomp Info: Setting cmtt sub-encoding to TS1/0 on input line 351. 
206 | Package textcomp Info: Setting cmvtt sub-encoding to TS1/0 on input line 352. 207 | Package textcomp Info: Setting cmbr sub-encoding to TS1/0 on input line 353. 208 | Package textcomp Info: Setting cmtl sub-encoding to TS1/0 on input line 354. 209 | Package textcomp Info: Setting ccr sub-encoding to TS1/0 on input line 355. 210 | Package textcomp Info: Setting ptm sub-encoding to TS1/4 on input line 356. 211 | Package textcomp Info: Setting pcr sub-encoding to TS1/4 on input line 357. 212 | Package textcomp Info: Setting phv sub-encoding to TS1/4 on input line 358. 213 | Package textcomp Info: Setting ppl sub-encoding to TS1/3 on input line 359. 214 | Package textcomp Info: Setting pag sub-encoding to TS1/4 on input line 360. 215 | Package textcomp Info: Setting pbk sub-encoding to TS1/4 on input line 361. 216 | Package textcomp Info: Setting pnc sub-encoding to TS1/4 on input line 362. 217 | Package textcomp Info: Setting pzc sub-encoding to TS1/4 on input line 363. 218 | Package textcomp Info: Setting bch sub-encoding to TS1/4 on input line 364. 219 | Package textcomp Info: Setting put sub-encoding to TS1/5 on input line 365. 220 | Package textcomp Info: Setting uag sub-encoding to TS1/5 on input line 366. 221 | Package textcomp Info: Setting ugq sub-encoding to TS1/5 on input line 367. 222 | Package textcomp Info: Setting ul8 sub-encoding to TS1/4 on input line 368. 223 | Package textcomp Info: Setting ul9 sub-encoding to TS1/4 on input line 369. 224 | Package textcomp Info: Setting augie sub-encoding to TS1/5 on input line 370. 225 | Package textcomp Info: Setting dayrom sub-encoding to TS1/3 on input line 371. 226 | Package textcomp Info: Setting dayroms sub-encoding to TS1/3 on input line 372. 227 | 228 | Package textcomp Info: Setting pxr sub-encoding to TS1/0 on input line 373. 229 | Package textcomp Info: Setting pxss sub-encoding to TS1/0 on input line 374. 230 | Package textcomp Info: Setting pxtt sub-encoding to TS1/0 on input line 375. 
231 | Package textcomp Info: Setting txr sub-encoding to TS1/0 on input line 376. 232 | Package textcomp Info: Setting txss sub-encoding to TS1/0 on input line 377. 233 | Package textcomp Info: Setting txtt sub-encoding to TS1/0 on input line 378. 234 | Package textcomp Info: Setting lmr sub-encoding to TS1/0 on input line 379. 235 | Package textcomp Info: Setting lmdh sub-encoding to TS1/0 on input line 380. 236 | Package textcomp Info: Setting lmss sub-encoding to TS1/0 on input line 381. 237 | Package textcomp Info: Setting lmssq sub-encoding to TS1/0 on input line 382. 238 | Package textcomp Info: Setting lmvtt sub-encoding to TS1/0 on input line 383. 239 | Package textcomp Info: Setting lmtt sub-encoding to TS1/0 on input line 384. 240 | Package textcomp Info: Setting qhv sub-encoding to TS1/0 on input line 385. 241 | Package textcomp Info: Setting qag sub-encoding to TS1/0 on input line 386. 242 | Package textcomp Info: Setting qbk sub-encoding to TS1/0 on input line 387. 243 | Package textcomp Info: Setting qcr sub-encoding to TS1/0 on input line 388. 244 | Package textcomp Info: Setting qcs sub-encoding to TS1/0 on input line 389. 245 | Package textcomp Info: Setting qpl sub-encoding to TS1/0 on input line 390. 246 | Package textcomp Info: Setting qtm sub-encoding to TS1/0 on input line 391. 247 | Package textcomp Info: Setting qzc sub-encoding to TS1/0 on input line 392. 248 | Package textcomp Info: Setting qhvc sub-encoding to TS1/0 on input line 393. 249 | Package textcomp Info: Setting futs sub-encoding to TS1/4 on input line 394. 250 | Package textcomp Info: Setting futx sub-encoding to TS1/4 on input line 395. 251 | Package textcomp Info: Setting futj sub-encoding to TS1/4 on input line 396. 252 | Package textcomp Info: Setting hlh sub-encoding to TS1/3 on input line 397. 253 | Package textcomp Info: Setting hls sub-encoding to TS1/3 on input line 398. 254 | Package textcomp Info: Setting hlst sub-encoding to TS1/3 on input line 399. 
255 | Package textcomp Info: Setting hlct sub-encoding to TS1/5 on input line 400. 256 | Package textcomp Info: Setting hlx sub-encoding to TS1/5 on input line 401. 257 | Package textcomp Info: Setting hlce sub-encoding to TS1/5 on input line 402. 258 | Package textcomp Info: Setting hlcn sub-encoding to TS1/5 on input line 403. 259 | Package textcomp Info: Setting hlcw sub-encoding to TS1/5 on input line 404. 260 | Package textcomp Info: Setting hlcf sub-encoding to TS1/5 on input line 405. 261 | Package textcomp Info: Setting pplx sub-encoding to TS1/3 on input line 406. 262 | Package textcomp Info: Setting pplj sub-encoding to TS1/3 on input line 407. 263 | Package textcomp Info: Setting ptmx sub-encoding to TS1/4 on input line 408. 264 | Package textcomp Info: Setting ptmj sub-encoding to TS1/4 on input line 409. 265 | ) 266 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/fontenc.sty 267 | Package: fontenc 2005/09/27 v1.99g Standard LaTeX package 268 | 269 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/t1enc.def 270 | File: t1enc.def 2005/09/27 v1.99g Standard LaTeX file 271 | LaTeX Font Info: Redeclaring font encoding T1 on input line 48. 272 | )) 273 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/ae/ae.sty 274 | Package: ae 2001/02/12 1.3 Almost European Computer Modern 275 | 276 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/fontenc.sty 277 | Package: fontenc 2005/09/27 v1.99g Standard LaTeX package 278 | 279 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/t1enc.def 280 | File: t1enc.def 2005/09/27 v1.99g Standard LaTeX file 281 | LaTeX Font Info: Redeclaring font encoding T1 on input line 48. 282 | ) 283 | LaTeX Font Info: Try loading font information for T1+aer on input line 105. 284 | 285 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/ae/t1aer.fd 286 | File: t1aer.fd 1997/11/16 Font definitions for T1/aer. 287 | )))) 288 | (./OncoSig.aux) 289 | \openout1 = `OncoSig.aux'. 
290 | 291 | LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 8. 292 | LaTeX Font Info: ... okay on input line 8. 293 | LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 8. 294 | LaTeX Font Info: ... okay on input line 8. 295 | LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 8. 296 | LaTeX Font Info: ... okay on input line 8. 297 | LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 8. 298 | LaTeX Font Info: ... okay on input line 8. 299 | LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 8. 300 | LaTeX Font Info: ... okay on input line 8. 301 | LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 8. 302 | LaTeX Font Info: ... okay on input line 8. 303 | LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 8. 304 | LaTeX Font Info: ... okay on input line 8. 305 | LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 8. 306 | LaTeX Font Info: Try loading font information for TS1+cmr on input line 8. 307 | 308 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/ts1cmr.fd 309 | File: ts1cmr.fd 2014/09/29 v2.5h Standard LaTeX font definitions 310 | ) 311 | LaTeX Font Info: ... okay on input line 8. 312 | \AtBeginShipoutBox=\box27 313 | Package hyperref Info: Link coloring OFF on input line 8. 314 | 315 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/nameref.sty 316 | Package: nameref 2012/10/27 v2.43 Cross-referencing by name of section 317 | 318 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/oberdiek/gettitlestring.st 319 | y 320 | Package: gettitlestring 2016/05/16 v1.5 Cleanup title references (HO) 321 | ) 322 | \c@section@level=\count101 323 | ) 324 | LaTeX Info: Redefining \ref on input line 8. 325 | LaTeX Info: Redefining \pageref on input line 8. 326 | LaTeX Info: Redefining \nameref on input line 8. 327 | (./OncoSig.out) (./OncoSig.out) 328 | \@outlinefile=\write4 329 | \openout4 = `OncoSig.out'. 
330 | 331 | 332 | (/usr/local/texlive/2016basic/texmf-dist/tex/context/base/mkii/supp-pdf.mkii 333 | [Loading MPS to PDF converter (version 2006.09.02).] 334 | \scratchcounter=\count102 335 | \scratchdimen=\dimen109 336 | \scratchbox=\box28 337 | \nofMPsegments=\count103 338 | \nofMParguments=\count104 339 | \everyMPshowfont=\toks15 340 | \MPscratchCnt=\count105 341 | \MPscratchDim=\dimen110 342 | \MPnumerator=\count106 343 | \makeMPintoPDFobject=\count107 344 | \everyMPtoPDFconversion=\toks16 345 | ) (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/epstopdf-base.sty 346 | Package: epstopdf-base 2016/05/15 v2.6 Base part for package epstopdf 347 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/grfext.sty 348 | Package: grfext 2016/05/16 v1.2 Manage graphics extensions (HO) 349 | ) 350 | Package grfext Info: Graphics extension search list: 351 | (grfext) [.png,.pdf,.jpg,.mps,.jpeg,.jbig2,.jb2,.PNG,.PDF,.JPG,.JPE 352 | G,.JBIG2,.JB2,.eps] 353 | (grfext) \AppendGraphicsExtensions on input line 456. 354 | 355 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg 356 | File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv 357 | e 358 | )) (./OncoSig-concordance.tex) 359 | LaTeX Font Info: External font `cmex10' loaded for size 360 | (Font) <12> on input line 12. 361 | LaTeX Font Info: External font `cmex10' loaded for size 362 | (Font) <8> on input line 12. 363 | LaTeX Font Info: External font `cmex10' loaded for size 364 | (Font) <6> on input line 12. 365 | LaTeX Font Info: Try loading font information for T1+aett on input line 25. 366 | 367 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/ae/t1aett.fd 368 | File: t1aett.fd 1997/11/16 Font definitions for T1/aett. 369 | ) [1 370 | 371 | {/usr/local/texlive/2016basic/texmf-var/fonts/map/pdftex/updmap/pdftex.map}] [2 372 | ] [3] 373 | Package atveryend Info: Empty hook `BeforeClearDocument' on input line 152. 
374 | 375 | [4] 376 | Package atveryend Info: Empty hook `AfterLastShipout' on input line 152. 377 | (./OncoSig.aux) 378 | Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 152. 379 | Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 152. 380 | Package rerunfilecheck Info: File `OncoSig.out' has not changed. 381 | (rerunfilecheck) Checksum: D41D8CD98F00B204E9800998ECF8427E;0. 382 | Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 152. 383 | ) 384 | Here is how much of TeX's memory you used: 385 | 6210 strings out of 494447 386 | 92542 string characters out of 6166765 387 | 166594 words of memory out of 5000000 388 | 9465 multiletter control sequences out of 15000+600000 389 | 20787 words of font info for 49 fonts, out of 8000000 for 9000 390 | 319 hyphenation exceptions out of 8191 391 | 35i,6n,23p,549b,472s stack positions out of 5000i,500n,10000p,200000b,80000s 392 | 399 | Output written on OncoSig.pdf (4 pages, 103165 bytes). 400 | PDF statistics: 401 | 58 PDF objects out of 1000 (max. 8388607) 402 | 44 compressed objects within 1 object stream 403 | 10 named destinations out of 1000 (max. 500000) 404 | 5 words of extra memory for PDF output out of 10000 (max. 
10000000) 405 | 406 | -------------------------------------------------------------------------------- /vignettes/OncoSig.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/vignettes/OncoSig.pdf -------------------------------------------------------------------------------- /vignettes/OncoSig.synctex.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/vignettes/OncoSig.synctex.gz -------------------------------------------------------------------------------- /vignettes/OncoSig.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{hyperref} 3 | \usepackage{hyperref} 4 | %\VignetteIndexEntry{Using OncoSig} 5 | \title{Using the OncoSig Classifiers to Discover Novel Oncoprotein network Dependencies} 6 | \date{\today} 7 | \usepackage{Sweave} 8 | \begin{document} 9 | \author{Joshua Broyde, Diana Murray, Barry Honig, Andrea Califano\\Columbia University, New York, USA} 10 | \input{OncoSig-concordance} 11 | \maketitle 12 | 13 | \section*{Introduction} 14 | 15 | OncoSig comprises a set of machine learning approaches for determining novel sets of gene products (i.e. genes or proteins) that support 16 | the activity of an oncogene or tumor suppressor (i.e. Oncoprotein-Centric Map, 17 | or OC-map). This is relevant for determining which genes/proteins are involved in an oncoprotein's functional network. OncoSig queries a molecular interaction network or other features regarding a protein's function to predict novel members of the OC-map. This molecular interaction network could contain features such as protein-protein interactions, or gene regulatory networks.
18 | OncoSig can be used primarily in two ways, in a supervised or unsupervised fashion. In the supervised fashion OncoSig uses either a Naive Bayes or Random Forest classifier to train on the molecular interaction network and a gold standard of known members of a particular OC-Map (for example, the members of the KRAS signaling pathway). This approach is appropriate for cases where some members of an OC-Map are known and one wants to leverage the known ones to predict other OC-Map members. 19 | 20 | In cases where a gold standard is not known, OncoSig can be used in an unsupervised fashion, where an OC-Map trained on a well characterized Oncoprotein is applied to one that is poorly characterized. This usage is appropriate where there is no gold standard for a particular Oncoprotein. 21 | 22 | \section*{Installation and loading} 23 | After first installing R (\href{http://www.r-project.org}{http://www.r-project.org}) and the OncoSig library, load OncoSig. 24 | \begin{Schunk} 25 | \begin{Sinput} 26 | > library("OncoSig") 27 | \end{Sinput} 28 | \end{Schunk} 29 | 30 | \section*{OncoSig Naive Bayes Classifier} 31 | The OncoSig Naive Bayes (OncoSigNB) Classifier is a supervised learning approach that is well suited to discovering OC-Map members when there are a small number of features describing each gene product and when the features have no or low statistical dependence. To run OncoSigNB, we create dataframes that correspond to the training and testing sets (which are labeled as 1 or 0, if they are in the gold standard OC-Map or not, respectively).
32 | For example: 33 | 34 | \begin{Schunk} 35 | \begin{Sinput} 36 | > df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 37 | > df_2=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE) 38 | > the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6), 39 | + c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999), 40 | + c(0,0.01,0.05)) 41 | > predictions=OncoSigNB(training_set = df_1,testing_set = df_2, 42 | + the_bins=the_bins,correlated_features =list()) 43 | \end{Sinput} 44 | \end{Schunk} 45 | 46 | In this example, we specified the training and testing sets, how to bin the data for each feature, and passed an empty list to indicate that there are no correlated features. 47 | 48 | The input training and testing sets should be formatted like so. 49 | \begin{Schunk} 50 | \begin{Sinput} 51 | > df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE) 52 | > df_1[1:5,] 53 | \end{Sinput} 54 | \begin{Soutput} 55 | V1 df_labels PREP_LR luad_vip_pv luad_lincs MS_RALGDS 56 | 1 Q16539 1 676.84 1.00e+00 0.13985800 NA 57 | 2 P78383 1 NA NA NA NA 58 | 3 P30281 1 NA 1.00e+00 -0.02819036 NA 59 | 4 P28799 1 30.83 7.97e-05 0.34413929 NA 60 | 5 Q96HE7 1 NA 1.00e+00 -0.33786435 NA 61 | Luad_vip_up MS_TBK1 MS_RALA MS_RALB Demand_pv mindy_overlap_lung 62 | 1 NA NA NA NA 1 1.0000000000 63 | 2 NA NA NA NA 1 NA 64 | 3 NA NA NA NA 1 0.0682243033 65 | 4 NA NA NA NA 1 0.0002509032 66 | 5 NA NA NA NA 1 0.0771524132 67 | \end{Soutput} 68 | \end{Schunk} 69 | Note that when a feature is missing, it is simply coded as NA. The first column is the name (e.g. a gene name), the second column is the label, and all later columns are features. 70 | 71 | \section*{OncoSigRF} 72 | Supervised classification can also be run using the OncoSig Random Forest (OncoSig RF) classifier.
Random Forests can be used when the features are statistically dependent, and can more easily be used when the features number in the thousands or more. We recommend OncoSigRF when integrating a molecular interaction network that may contain many tens of thousands of interactions (note that when using a network of more than a few hundred thousand interactions, OncoSig RF will require a few hours and a prohibitive amount of memory to run). 73 | 74 | \begin{Schunk} 75 | \begin{Sinput} 76 | > library (randomForest) 77 | \end{Sinput} 78 | \end{Schunk} 79 | First, we read in the molecular interaction network and convert it to a matrix: 80 | \begin{Schunk} 81 | \begin{Sinput} 82 | > Network_location="~/OncoSig/Input_data_files/LUAD/original_network_sample.txt" 83 | > Network=read.delim(Network_location,header=F) 84 | > Network$V1=as.character(Network$V1) 85 | > Network$V2=as.character(Network$V2) 86 | > Network$V3=as.numeric(Network$V3) 87 | > Network=as.matrix(Network) 88 | > Network[,3]=as.numeric(Network[,3]) 89 | > Network_matrix=listToMatrix(Network) 90 | \end{Sinput} 91 | \end{Schunk} 92 | Note the format of the input network. The first column is the feature name, the second column is the gene product (i.e. the row of data), and the third column is the score.
93 | \begin{Schunk} 94 | \begin{Sinput} 95 | > Network[1:5,] 96 | \end{Sinput} 97 | \begin{Soutput} 98 | V1 V2 V3 99 | [1,] "Q9Y5P4_CINDY_SIG" "Q8N653" "79" 100 | [2,] "A6NF89_PREPPI" "P47881" "2109.56" 101 | [3,] "P61586_PREPPI" "Q6ZUM4" "6546.191" 102 | [4,] "Q06124_CINDY_SIG" "O14647" "50" 103 | [5,] "Q9Y606_ARACNE" "Q00613" "0.3334542" 104 | \end{Soutput} 105 | \end{Schunk} 106 | 107 | Now read in and process the gold standard 108 | \begin{Schunk} 109 | \begin{Sinput} 110 | > Gold_Standard_location= 111 | + "~/OncoSig/Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt" 112 | > Gold_Standard=read.delim(Gold_Standard_location,header=F) 113 | > Gold_Standard$V1=as.character(Gold_Standard$V1) 114 | \end{Sinput} 115 | \end{Schunk} 116 | 117 | preprocess the data 118 | \begin{Schunk} 119 | \begin{Sinput} 120 | > #Convert Matrix to Dataframe for future steps 121 | > Network_matrix_df=as.data.frame(Network_matrix) 122 | > #Remove Members of Gold Standard Not in the Network: 123 | > Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df), 124 | + Gold_Standard$V1) 125 | > Negative_Set_names=setdiff(rownames(Network_matrix_df), 126 | + Gold_Standard_in_Network_names) 127 | > remove(Network_matrix) #delete Matrix. 128 | \end{Sinput} 129 | \end{Schunk} 130 | Run the OncoSigRF Classifier 131 | \begin{Schunk} 132 | \begin{Sinput} 133 | > Query_output_results=OncoSigRF(Network_matrix_df, 134 | + Gold_Standard_in_Network_names, max_iterations=5) 135 | > Query_output_results_scores=as.data.frame(Query_output_results[[1]]) 136 | \end{Sinput} 137 | \end{Schunk} 138 | 139 | \section*{Unsupervised OncoSig} 140 | Supervised classification is only applicable when a gold standard suitable for training can be found. However, some Oncogenes/Tumor Suppressors may have no known gene product dependencies. In this case, we can apply a forest created specifically using one Oncogene/Tumor Suppressor and apply it to another. 
See the documentation for the OncoSigUnsup function for further details. In this example, we read in a random forest created using features for the EGFR oncogene and apply it to the KRAS oncogene. 141 | \begin{Schunk} 142 | \begin{Sinput} 143 | > KRAS_features= 144 | + "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt" 145 | > EGFR_forest= 146 | + "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r" 147 | > results=OncoSigUnsup(KRAS_features,EGFR_forest) 148 | \end{Sinput} 149 | \end{Schunk} 150 | 151 | 152 | \end{document} 153 | --------------------------------------------------------------------------------