├── .DS_Store
├── .Rbuildignore
├── .Rhistory
├── .Rproj.user
├── F983FB94
│ ├── build_options
│ ├── cpp-definition-cache
│ ├── pcs
│ │ ├── files-pane.pper
│ │ ├── source-pane.pper
│ │ ├── windowlayoutstate.pper
│ │ └── workbench-pane.pper
│ ├── persistent-state
│ ├── rmd-outputs
│ ├── saved_source_markers
│ └── sources
│ │ └── prop
│ │ ├── 1B64C678
│ │ ├── 1CC714F7
│ │ ├── 27C419B
│ │ ├── 27EB650E
│ │ ├── 2EE4D19B
│ │ ├── 453A360A
│ │ ├── 4668FDBC
│ │ ├── 4EC81EA2
│ │ ├── 6CEE29D
│ │ ├── 6E2F621B
│ │ ├── 71CD7210
│ │ ├── 78F86D91
│ │ ├── 908CD31C
│ │ ├── A67B5CF0
│ │ ├── C0746A86
│ │ ├── D6FEEFCC
│ │ ├── D71BEEF9
│ │ ├── F8F7728B
│ │ └── INDEX
└── shared
│ └── notebooks
│ ├── patch-chunk-names
│ └── paths
├── All_forests.r
├── All_forests_KRAS_Ingenuity.r
├── All_forests_KRAS_MSigDB.r
├── DESCRIPTION
├── NAMESPACE
├── OncoSig.Rproj
├── R
├── NBfunctions.R
├── OncoSig.R
├── OncoSigNB.R
├── OncoSigRF.R
├── OncoSigUnsup.R
├── Oncosig-RF
│ └── OncoSig
│ │ ├── .functions.R.swo
│ │ ├── .functions.R.swp
│ │ ├── ONCOSIG_README.md
│ │ ├── ONCOSIG_README.pdf
│ │ └── Test
│ │ ├── OncoSig_objects.R
│ │ ├── OncoSig_results.txt
│ │ ├── Performance.pdf
│ │ ├── gold_standard.txt
│ │ └── test_network.txt
├── analysisFunctions.R
├── functionsRF.R
└── rFunctions.R
├── README.md
├── man
├── NaiveBayesBin.Rd
├── OncoSig-package.Rd
├── OncoSig.Rproj
├── OncoSigNB.Rd
├── OncoSigRF.Rd
├── OncoSigUnsup.Rd
├── R
│ ├── OncoSig.R
│ └── OncoSigNB.Rd
├── computeLRsgivenBins.Rd
├── getFinalLR.Rd
├── getLRsgivenBin_info.Rd
├── getMaxLR.Rd
├── listToMatrix.Rd
├── replaceBinswithLR.Rd
└── runNaiveBayesClassifier.Rd
└── vignettes
├── .DS_Store
├── OncoSig-concordance.tex
├── OncoSig.Rnw
├── OncoSig.log
├── OncoSig.pdf
├── OncoSig.synctex
├── OncoSig.synctex.gz
└── OncoSig.tex
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/.DS_Store
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 |
--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
1 | amp.matrix[amp.matrix == 2] <- "A"
2 | amp.matrix <- amp.matrix %>% as_tibble(rownames = NA) %>%
3 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
4 | inner_join(amp.events) %>% select(just.genes, tags, coseg.names, everything())
5 | #dels
6 | del.events <- all.final.mat.event.types.df %>% filter(dels == "D") %>%
7 | select(coseg.names, tags, just.genes)
8 | # remove duplicates of both D and D+M
9 | if(length(unique(del.events$just.genes)) != length(del.events$just.genes)) {
10 | dups <- del.events %>% group_by(just.genes) %>% filter(n() > 1)
11 | to.keep <- dups %>% filter(tags == "D+M")
12 | del.events <- del.events %>% filter(!just.genes %in% dups$just.genes) %>%
13 | bind_rows(to.keep)
14 | }
15 | del.matrix <- del.mat[unique(del.events$just.genes), intersecting.samples , drop = F]
16 | # only get focal
17 | del.matrix[del.matrix > -2] <- NA
18 | del.matrix[del.matrix == -2] <- "D"
19 | del.matrix <- del.matrix %>% as_tibble(rownames = NA) %>%
20 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
21 | inner_join(del.events) %>% select(just.genes, tags, coseg.names, everything())
22 | # fusions (if they exist)
23 | if(isFALSE(all(is.na(all.final.mat.event.types.df$fus)))) {
24 | fus.events <- all.final.mat.event.types.df %>% filter(fus == "F") %>%
25 | select(coseg.names, tags, just.genes)
26 | fus.mat <- read.table(paste0(fusion.data.dir, tumAcro, '.txt'), sep='\t', header=T, row.names=1, check.names=F)
27 | fus.matrix <- fus.mat[unique(fus.events$just.genes), intersect(colnames(fus.mat), intersecting.samples), drop = F]
28 | fus.matrix[fus.matrix == 1] <- "F"
29 | fus.matrix[fus.matrix == 0] <- NA
30 | fus.matrix <- fus.matrix %>% as_tibble(rownames = NA) %>%
31 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
32 | inner_join(fus.events) %>% select(just.genes, tags, coseg.names, everything())
33 | } else {
34 | fus.matrix <- NULL
35 | }
36 | ## bind them all together
37 | tumor.events.mat <- bind_rows(mut.matrix, amp.matrix, del.matrix, fus.matrix)
38 | # combine rows of duplicates so they represent only one event
39 | dups <- tumor.events.mat %>% group_by(just.genes) %>% filter(n() > 1)
40 | if(nrow(dups) > 0) {
41 | # keep events that are just A vs D on separate lines
42 | to.keep <- dups %>% filter(tags %in% c("A", "D", "M"))
43 | remaining <- dups %>% filter(tags %in% c("A+M", "D+M"))
44 | for (gene in unique(remaining$just.genes)) {
45 | sub <- remaining[remaining$just.genes == gene,]
46 | # combine lines with same tags to merge M and A/D
47 | for (tag in unique(sub$tags)){
48 | to.merge <- sub %>% filter(tags == tag)
49 | new.line <- apply(to.merge, 2, paste0, collapse = ";")
50 | new.line[1:3] <- unlist(to.merge[1,1:3])
51 | to.keep <- bind_rows(to.keep, new.line)
52 | }
53 | }
54 | to.keep[to.keep == "NA;NA"] <- NA
55 | tumor.events.mat <- tumor.events.mat %>% filter(!just.genes %in% dups$just.genes) %>%
56 | bind_rows(to.keep)
57 | }
58 | trans.tumor.events.mat <- tumor.events.mat %>% column_to_rownames("coseg.names") %>%
59 | select(-just.genes, -tags) %>% t() %>% as_tibble(rownames = NA) %>% rownames_to_column("sample")
60 | # add clusters to trans.tumor.events.mat
61 | sample.clustering <- pancan.clusters[[tumAcro]] %>% enframe("sample", "cluster") %>%
62 | filter(sample %in% intersecting.samples)
63 | trans.tumor.events.mat <- inner_join(sample.clustering, trans.tumor.events.mat) %>%
64 | select(-sample)
65 | # get rid of gbm subtype 2 here before continuing
66 | if(tumAcro == "gbm"){
67 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, cluster != 2)
68 | }
69 | # check if any subtype had no events (was saved as a logical in the final events plots)
70 | no.events <- which(sapply(final.plots[["matrices"]][[tumAcro]], is.logical))
71 | if(length(no.events > 0)) {
72 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, !cluster %in% no.events)
73 | }
74 | get.percent <- function(x){
75 | round(sum(!is.na(x))/length(x)*100)
76 | }
77 | # matrix with all the percentages for the events
78 | summary.df <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise_all(get.percent) %>%
79 | column_to_rownames("cluster") %>% t()
80 | to.plot.df <- summary.df %>% as_tibble(rownames = NA) %>%
81 | rownames_to_column("event.names")
82 | # matrix with just the percentages for the events that are plotted
83 | # replace all the cells that are not MOMA events with 0s
84 | for (clus in colnames(to.plot.df)[2:ncol(to.plot.df)]) {
85 | events.this.clus <- all.final.mat.event.types[[as.numeric(clus)]]$coseg.names
86 | #to.plot.df.test <- to.plot.df %>% mutate(test1 = if_else(event.names %in% events.this.clus, to.plot.df[,clus], 0))
87 | to.replace <- to.plot.df %>% as_tibble(rownames = NA) %>%
88 | select(event.names, all_of(clus)) %>% deframe()
89 | to.replace <- if_else(names(to.replace) %in% events.this.clus, to.replace,
90 | if_else(str_sub(names(to.replace), end = -3) %in% events.this.clus, to.replace, 0))
91 | to.plot.df[,clus] <- to.replace
92 | }
93 | ###
94 | # do row-wise proportion test of percentages
95 | ###
96 | # first get number of samples per cluster
97 | cluster.sums <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise(total = n())
98 | # initialize final dataframe for events and p.values
99 | event.prop.df <- tibble(event = colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)]),
100 | pval = 1)
101 | # create a contingency table for each event given its occurrence/non-occurrence in each cluster
102 | # do a chisq test to determine if the proportions are or aren't the same
103 | for(ge in colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)])) {
104 | get.non.na <- function(x) sum(!is.na(x))
105 | sub.df <- trans.tumor.events.mat %>% select(cluster, all_of(ge)) %>%
106 | group_by(cluster) %>% summarise_all(get.non.na) %>% right_join(cluster.sums) %>%
107 | transmute(suc = get(ge), failures = total - get(ge)) %>% as.matrix()
108 | res <- chisq.test(sub.df, simulate.p.value = TRUE)$p.value
109 | event.prop.df[event.prop.df$event == ge, 2] <- res
110 | }
111 | # adjust pvals for multi hypothesis correction
112 | event.prop.df$adj.pval <- event.prop.df$stars <- p.adjust(event.prop.df$pval, method = "BH")
113 | #event.prop.df$stars[event.prop.df$stars > 0.05] <- ""
114 | event.prop.df <- mutate(event.prop.df, stars = ifelse(event.prop.df$stars < 0.0001, "****",
115 | ifelse(event.prop.df$stars < 0.001, "***",
116 | ifelse(event.prop.df$stars < 0.01, "**",
117 | ifelse(event.prop.df$stars < 0.05, "*", " ")))))
118 | # make the plot
119 | to.plot.df <- to.plot.df %>% column_to_rownames("event.names") %>% as.matrix()
120 | col_fun = colorRamp2(c(0, 1, 100), c("grey", "palegreen", "darkgreen"))
121 | if(nrow(to.plot.df) >= 60) {
122 | label.size <- 5
123 | } else if (nrow(to.plot.df) >= 50 ) {
124 | label.size <- 6
125 | } else if (nrow(to.plot.df) >= 40 ) {
126 | label.size <- 7
127 | } else if (nrow(to.plot.df) >= 30 ) {
128 | label.size <- 8
129 | } else {
130 | label.size <- 9
131 | }
132 | # make row annotation with asterisks for the chi-squared test
133 | ha <- rowAnnotation(signif = anno_text(event.prop.df$stars, gp = gpar(fontsize = label.size)))
134 | # make column labels that say cluster
135 | col.labels <- structure(paste0("Cluster ", colnames(to.plot.df)), names = colnames(to.plot.df))
136 | # make the title and subtitle with p values
137 | title <- paste0("Events in ", toupper(tumAcro), "",
138 | "
p values: *** < 0.001 | ** < 0.01 | * < 0.05 " )
139 | ht <- Heatmap(to.plot.df, col = col_fun, rect_gp = gpar(col = "white", lwd = 1),
140 | column_names_rot = 45, column_labels = col.labels,
141 | cell_fun = function(j, i, x, y, width, height, fill) {
142 | if(to.plot.df[i, j] > 0) {
143 | grid.text(sprintf("%.0f", to.plot.df[i, j]), x, y, gp = gpar(fontsize = label.size))
144 | } else {
145 | grid.text(sprintf("%.0f", summary.df[i, j]), x, y, gp = gpar(fontsize = label.size))
146 | }
147 | },
148 | heatmap_legend_param = list(title = "% Samples \nin Cluster"),
149 | row_names_gp = gpar(fontsize = label.size),
150 | column_title = gt_render(title),
151 | show_row_dend = F, show_column_dend = F,
152 | right_annotation = ha)
153 | p <- grid::grid.grabExpr(draw(ht, padding = unit(c(2, 10, 2, .5), "mm")))
154 | cosegregation.plots[[tumAcro]] <- p
155 | }
156 | m1 <- marrangeGrob(cosegregation.plots, ncol = 1, nrow = 1)
157 | ggsave(filename = paste0(output.folder, "cosegregation.plots.focal.only.pdf"), m1,
158 | width = 8.5, height = 11, units = c("in"),
159 | dpi = 300)
160 | ggsave(filename = "~/Desktop/cosegregation.plots.focal.only.pdf", m1,
161 | width = 8.5, height = 11, units = c("in"),
162 | dpi = 300)
163 | View(cosegregation.plots)
164 | cosegregation.plots <- list()
165 | for (tumAcro in tumor.types) {
166 | print(paste("Making plots for:", tumAcro))
167 | # First clean up current event names for each subtype
168 | # Make table with event names and type
169 | all.final.mat.event.types <- list()
170 | for (clus in seq_along(final.plots[["matrices"]][[tumAcro]])) {
171 | final.mat <- final.plots[["matrices"]][[tumAcro]][[clus]]
172 | if (is.null(final.mat)) {
173 | all.final.mat.event.types[[clus]] <- NULL
174 | next
175 | }
176 | final.plot.names <- str_split_fixed(rownames(final.mat), pattern = "::", n = 2)
177 | # already added this to the original plot script
178 | # if(length(unique(final.plot.names[,1])) == nrow(final.mat)) {
179 | # rownames(final.mat) <- final.plot.names[,1]
180 | # } else {
181 | # # replace duplicate names with gene::type, otherwise just the name
182 | # dups <- final.plot.names[duplicated(final.plot.names[,1]),1]
183 | # new.names <- c()
184 | #
185 | # for (idx in seq_along(final.plot.names[,1])) {
186 | # ge <- final.plot.names[idx,1]
187 | # if (ge %in% dups) {
188 | # new.names <- c(new.names, rownames(final.mat)[idx])
189 | # } else {
190 | # new.names <- c(new.names, ge)
191 | # }
192 | # }
193 | # rownames(final.mat) <- new.names
194 | # }
195 | #
196 | # resave new final mat to object for plotting heatmaps later
197 | # final.plots[["matrices"]][[tumAcro]][[clus]] <- final.mat
198 | # scrape final events matrix for types to create tags
199 | final.mat.event.types <- apply(final.mat, 1, function(x){paste0(unique(x, na.rm = T), collapse = "_")}) %>%
200 | enframe()
201 | final.mat.event.types <- final.mat.event.types %>%
202 | mutate(dels = if_else(str_detect(final.mat.event.types$value, "highdel"), "D", NA_character_)) %>%
203 | mutate(amps = if_else(str_detect(final.mat.event.types$value, "highamp"), "A", NA_character_)) %>%
204 | mutate(muts = if_else(str_detect(final.mat.event.types$value, "mut"), "M", NA_character_)) %>%
205 | mutate(fus = if_else(str_detect(final.mat.event.types$value, "fus"), "F", NA_character_))
206 | tags <- final.mat.event.types %>%
207 | column_to_rownames("name") %>% select(-value) %>%
208 | as.matrix() %>%
209 | apply(X = ., MARGIN = 1, FUN = function(x){paste0(na.omit(x), collapse = "+")})
210 | # make sure these names don't have the :: so they can be unified later
211 | cosegregation.names <- paste(final.plot.names[,1], tags, sep = " - ")
212 | final.mat.event.types$coseg.names <- cosegregation.names
213 | final.mat.event.types$tags <- tags
214 | final.mat.event.types$just.genes <- final.plot.names[,1]
215 | # replace any "multi" regions with the gene names
216 | multi.indices <- which(grepl("multi",final.mat.event.types$coseg.names))
217 | if(length(multi.indices) > 0){
218 | for (idx in multi.indices) {
219 | type <- if_else(grepl("D", final.mat.event.types$coseg.names[idx]), "del", "amp")
220 | region.name <- str_sub(final.plot.names[idx,1], end = -7)
221 | cluster <- paste0("cluster", clus)
222 | region.genes <- pluck(multi.gene.list, tumAcro, cluster, type, region.name)
223 | final.mat.event.types$just.genes[idx] <- region.genes[1]
224 | }
225 | }
226 | all.final.mat.event.types[[clus]] <- final.mat.event.types
227 | }
228 | all.final.mat.event.types.df <- all.final.mat.event.types %>% discard(is.null) %>% map(select, -value, -name) %>%
229 | reduce(full_join) %>% mutate(just.genes = str_split_fixed(just.genes, ";", n = 2)[,1])
230 | #### get rid of duplicates from having A/D + M and just M alone
231 | # dups <- all.final.mat.event.types.df %>% group_by(just.genes) %>% filter(n() > 1)
232 | # all.final.mat.event.types.remove.dups <- filter(all.final.mat.event.types.df, !just.genes %in% dups$just.genes)
233 | #
234 | # for (event in unique(dups$just.genes)) {
235 | # sub.df <- dups %>% filter(just.genes == event)
236 | # name.split <- str_split_fixed(sub.df$coseg.names, " - ", 2)[,2]
237 | #
238 | # ## combine based on all different combos of duplicates
239 | # if(all(c("A+M", "D+M") %in% name.split) & nrow(sub.df) == 2) {
240 | # # just has two tags A+M and D+M
241 | # # don't merge!
242 | # all.final.mat.event.types.remove.dups <- bind_rows(all.final.mat.event.types.remove.dups, sub.df)
243 | # } else if (all(c("A+M", "D+M") %in% name.split) & nrow(sub.df) > 2) {
244 | # # has A+M, D+M and A/D/M alone
245 | # # only keep the A+M and D+M
246 | #
247 | # }
248 | # }
249 | ### load required event matrices
250 | ### Collect percentages for occurrence per subtype
251 | # load all in first to get intersecting samples
252 | mut.mat <- get(load(paste0(snp.dir,"hugo-ids/", tumAcro, "-rawsnp.HUGO.rda")))
253 | # cnvs, have to handle amps and dels separately
254 | # thresh.by.gene <- read.table(paste0(gistic.dir, tumAcro, '/all_thresholded.by_genes.txt'), header=T, sep='\t', row.names=1, check.names=F)
255 | thresh.by.gene <- vroom(paste0(gistic.dir, tumAcro, '/all_thresholded.by_genes.txt')) %>% column_to_rownames("Gene Symbol")
256 | # first two columns are metadata and sample names are in long format
257 | short.sample.ids <- sapply(colnames(thresh.by.gene)[3:ncol(thresh.by.gene)], function(x) substr(x,1,15), USE.NAMES = F)
258 | colnames(thresh.by.gene) <- c(colnames(thresh.by.gene)[1:2], short.sample.ids)
259 | cnv <- thresh.by.gene
260 | amp.mat <- del.mat <- cnv <- cnv[,3:ncol(cnv)]
261 | # get sample intersection
262 | intersecting.samples <- intersect(colnames(mut.mat), colnames(cnv)) %>%
263 | intersect(viper.names)
264 | # mutations
265 | mut.events <- all.final.mat.event.types.df %>% filter(muts == "M") %>%
266 | select(coseg.names, tags, just.genes)
267 | mut.matrix <- mut.mat[unique(mut.events$just.genes), intersecting.samples, drop = F]
268 | mut.matrix[mut.matrix == 1] <- "M"
269 | mut.matrix[mut.matrix == 0] <- NA
270 | mut.matrix <- mut.matrix %>% as_tibble(rownames = NA) %>%
271 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
272 | inner_join(mut.events) %>% select(just.genes, tags, coseg.names, everything())
273 | #amps
274 | amp.events <- all.final.mat.event.types.df %>% filter(amps == "A") %>%
275 | select(coseg.names, tags, just.genes)
276 | # remove duplicates of both A and A+M
277 | if(length(unique(amp.events$just.genes)) != length(amp.events$just.genes)) {
278 | dups <- amp.events %>% group_by(just.genes) %>% filter(n() > 1)
279 | to.keep <- dups %>% filter(tags == "A+M")
280 | amp.events <- amp.events %>% filter(!just.genes %in% dups$just.genes) %>%
281 | bind_rows(to.keep)
282 | }
283 | amp.matrix <- amp.mat[unique(amp.events$just.genes), intersecting.samples, drop = F]
284 | # only get focal
285 | amp.matrix[amp.matrix < 2] <- NA
286 | amp.matrix[amp.matrix == 2] <- "A"
287 | amp.matrix <- amp.matrix %>% as_tibble(rownames = NA) %>%
288 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
289 | inner_join(amp.events) %>% select(just.genes, tags, coseg.names, everything())
290 | #dels
291 | del.events <- all.final.mat.event.types.df %>% filter(dels == "D") %>%
292 | select(coseg.names, tags, just.genes)
293 | # remove duplicates of both D and D+M
294 | if(length(unique(del.events$just.genes)) != length(del.events$just.genes)) {
295 | dups <- del.events %>% group_by(just.genes) %>% filter(n() > 1)
296 | to.keep <- dups %>% filter(tags == "D+M")
297 | del.events <- del.events %>% filter(!just.genes %in% dups$just.genes) %>%
298 | bind_rows(to.keep)
299 | }
300 | del.matrix <- del.mat[unique(del.events$just.genes), intersecting.samples , drop = F]
301 | # only get focal
302 | del.matrix[del.matrix > -2] <- NA
303 | del.matrix[del.matrix == -2] <- "D"
304 | del.matrix <- del.matrix %>% as_tibble(rownames = NA) %>%
305 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
306 | inner_join(del.events) %>% select(just.genes, tags, coseg.names, everything())
307 | # fusions (if they exist)
308 | if(isFALSE(all(is.na(all.final.mat.event.types.df$fus)))) {
309 | fus.events <- all.final.mat.event.types.df %>% filter(fus == "F") %>%
310 | select(coseg.names, tags, just.genes)
311 | fus.mat <- read.table(paste0(fusion.data.dir, tumAcro, '.txt'), sep='\t', header=T, row.names=1, check.names=F)
312 | fus.matrix <- fus.mat[unique(fus.events$just.genes), intersect(colnames(fus.mat), intersecting.samples), drop = F]
313 | fus.matrix[fus.matrix == 1] <- "F"
314 | fus.matrix[fus.matrix == 0] <- NA
315 | fus.matrix <- fus.matrix %>% as_tibble(rownames = NA) %>%
316 | rownames_to_column("just.genes") %>% mutate_all(as.character) %>%
317 | inner_join(fus.events) %>% select(just.genes, tags, coseg.names, everything())
318 | } else {
319 | fus.matrix <- NULL
320 | }
321 | ## bind them all together
322 | tumor.events.mat <- bind_rows(mut.matrix, amp.matrix, del.matrix, fus.matrix)
323 | # combine rows of duplicates so they represent only one event
324 | dups <- tumor.events.mat %>% group_by(just.genes) %>% filter(n() > 1)
325 | if(nrow(dups) > 0) {
326 | # keep events that are just A vs D on separate lines
327 | to.keep <- dups %>% filter(tags %in% c("A", "D", "M"))
328 | remaining <- dups %>% filter(tags %in% c("A+M", "D+M"))
329 | for (gene in unique(remaining$just.genes)) {
330 | sub <- remaining[remaining$just.genes == gene,]
331 | # combine lines with same tags to merge M and A/D
332 | for (tag in unique(sub$tags)){
333 | to.merge <- sub %>% filter(tags == tag)
334 | new.line <- apply(to.merge, 2, paste0, collapse = ";")
335 | new.line[1:3] <- unlist(to.merge[1,1:3])
336 | to.keep <- bind_rows(to.keep, new.line)
337 | }
338 | }
339 | to.keep[to.keep == "NA;NA"] <- NA
340 | tumor.events.mat <- tumor.events.mat %>% filter(!just.genes %in% dups$just.genes) %>%
341 | bind_rows(to.keep)
342 | }
343 | trans.tumor.events.mat <- tumor.events.mat %>% column_to_rownames("coseg.names") %>%
344 | select(-just.genes, -tags) %>% t() %>% as_tibble(rownames = NA) %>% rownames_to_column("sample")
345 | # add clusters to trans.tumor.events.mat
346 | sample.clustering <- pancan.clusters[[tumAcro]] %>% enframe("sample", "cluster") %>%
347 | filter(sample %in% intersecting.samples)
348 | trans.tumor.events.mat <- inner_join(sample.clustering, trans.tumor.events.mat) %>%
349 | select(-sample)
350 | # get rid of gbm subtype 2 here before continuing
351 | if(tumAcro == "gbm"){
352 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, cluster != 2)
353 | }
354 | # check if any subtype had no events (was saved as a logical in the final events plots)
355 | no.events <- which(sapply(final.plots[["matrices"]][[tumAcro]], is.logical))
356 | if(length(no.events > 0)) {
357 | trans.tumor.events.mat <- filter(trans.tumor.events.mat, !cluster %in% no.events)
358 | }
359 | get.percent <- function(x){
360 | round(sum(!is.na(x))/length(x)*100)
361 | }
362 | # matrix with all the percentages for the events
363 | summary.df <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise_all(get.percent) %>%
364 | column_to_rownames("cluster") %>% t()
365 | to.plot.df <- summary.df %>% as_tibble(rownames = NA) %>%
366 | rownames_to_column("event.names")
367 | # matrix with just the percentages for the events that are plotted
368 | # replace all the cells that are not MOMA events with 0s
369 | for (clus in colnames(to.plot.df)[2:ncol(to.plot.df)]) {
370 | events.this.clus <- all.final.mat.event.types[[as.numeric(clus)]]$coseg.names
371 | #to.plot.df.test <- to.plot.df %>% mutate(test1 = if_else(event.names %in% events.this.clus, to.plot.df[,clus], 0))
372 | to.replace <- to.plot.df %>% as_tibble(rownames = NA) %>%
373 | select(event.names, all_of(clus)) %>% deframe()
374 | to.replace <- if_else(names(to.replace) %in% events.this.clus, to.replace,
375 | if_else(str_sub(names(to.replace), end = -3) %in% events.this.clus, to.replace, 0))
376 | to.plot.df[,clus] <- to.replace
377 | }
378 | ###
379 | # do row-wise proportion test of percentages
380 | ###
381 | # first get number of samples per cluster
382 | cluster.sums <- trans.tumor.events.mat %>% group_by(cluster) %>% summarise(total = n())
383 | # initialize final dataframe for events and p.values
384 | event.prop.df <- tibble(event = colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)]),
385 | pval = 1)
386 | # create a contingency table for each event given its occurrence/non-occurrence in each cluster
387 | # do a chisq test to determine if the proportions are or aren't the same
388 | for(ge in colnames(trans.tumor.events.mat[2:ncol(trans.tumor.events.mat)])) {
389 | get.non.na <- function(x) sum(!is.na(x))
390 | sub.df <- trans.tumor.events.mat %>% select(cluster, all_of(ge)) %>%
391 | group_by(cluster) %>% summarise_all(get.non.na) %>% right_join(cluster.sums) %>%
392 | transmute(suc = get(ge), failures = total - get(ge)) %>% as.matrix()
393 | res <- chisq.test(sub.df, simulate.p.value = TRUE)$p.value
394 | event.prop.df[event.prop.df$event == ge, 2] <- res
395 | }
396 | # adjust pvals for multi hypothesis correction
397 | event.prop.df$adj.pval <- event.prop.df$stars <- p.adjust(event.prop.df$pval, method = "BH")
398 | #event.prop.df$stars[event.prop.df$stars > 0.05] <- ""
399 | event.prop.df <- mutate(event.prop.df, stars = ifelse(event.prop.df$stars < 0.0001, "****",
400 | ifelse(event.prop.df$stars < 0.001, "***",
401 | ifelse(event.prop.df$stars < 0.01, "**",
402 | ifelse(event.prop.df$stars < 0.05, "*", " ")))))
403 | # make the plot
404 | to.plot.df <- to.plot.df %>% column_to_rownames("event.names") %>% as.matrix()
405 | col_fun = colorRamp2(c(0, 1, 100), c("grey", "palegreen", "darkgreen"))
406 | if(nrow(to.plot.df) >= 60) {
407 | label.size <- 5
408 | } else if (nrow(to.plot.df) >= 50 ) {
409 | label.size <- 6
410 | } else if (nrow(to.plot.df) >= 40 ) {
411 | label.size <- 7
412 | } else if (nrow(to.plot.df) >= 30 ) {
413 | label.size <- 8
414 | } else {
415 | label.size <- 9
416 | }
417 | # make row annotation with asterisks for the chi-squared test
418 | ha <- rowAnnotation(signif = anno_text(event.prop.df$stars, gp = gpar(fontsize = label.size)))
419 | # make column labels that say cluster
420 | col.labels <- structure(paste0("Cluster ", colnames(to.plot.df)), names = colnames(to.plot.df))
421 | # make the title and subtitle with p values
422 | title <- paste0("Events in ", toupper(tumAcro), "",
423 | "
p values: *** < 0.001 | ** < 0.01 | * < 0.05 " )
424 | ht <- Heatmap(to.plot.df, col = col_fun, rect_gp = gpar(col = "white", lwd = 1),
425 | column_names_rot = 45, column_labels = col.labels,
426 | cell_fun = function(j, i, x, y, width, height, fill) {
427 | if(to.plot.df[i, j] > 0) {
428 | grid.text(sprintf("%.0f", to.plot.df[i, j]), x, y, gp = gpar(fontsize = label.size))
429 | } else {
430 | grid.text(sprintf("%.0f", summary.df[i, j]), x, y, gp = gpar(fontsize = label.size))
431 | }
432 | },
433 | heatmap_legend_param = list(title = "% Samples \nin Cluster"),
434 | row_names_gp = gpar(fontsize = label.size),
435 | column_title = gt_render(title),
436 | show_row_dend = T, show_column_dend = T,
437 | right_annotation = ha)
438 | p <- grid::grid.grabExpr(draw(ht, padding = unit(c(2, 10, 2, .5), "mm")))
439 | cosegregation.plots[[tumAcro]] <- p
440 | }
441 | m1 <- marrangeGrob(cosegregation.plots, ncol = 1, nrow = 1)
442 | ggsave(filename = paste0(output.folder, "cosegregation.plots.focal.only.pdf"), m1,
443 | width = 8.5, height = 11, units = c("in"),
444 | dpi = 300)
445 | ggsave(filename = "~/Desktop/cosegregation.plots.focal.only.pdf", m1,
446 | width = 8.5, height = 11, units = c("in"),
447 | dpi = 300)
448 | setwd("~/Documents/Github/OncoSig")
449 | library(OncoSig)
450 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
451 | library(randomForest)
452 | Network_location="./Input_data_files/COAD/original_network.txt"
453 | Network=read.delim(Network_location,header=F)
454 | Network$V1=as.character(Network$V1)
455 | Network$V2=as.character(Network$V2)
456 | Network$V3=as.numeric(Network$V3)
457 | Network=as.matrix(Network)
458 | Network[,3]=as.numeric(Network[,3])
459 | Network_matrix=listToMatrix(Network)
460 | Gold_Standard_location= "./Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt"
461 | Gold_Standard=read.delim(Gold_Standard_location,header=F)
462 | Gold_Standard$V1=as.character(Gold_Standard$V1)
463 | remove(Network_matrix)
464 | Query_output_results=OncoSigRF(Network_matrix_df, Gold_Standard_in_Network_names, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1)
465 | Query_output_results_scores=as.data.frame(Query_output_results[[1]])
466 | Query_output_results=OncoSigRF(Network_matrix_df, Gold_Standard, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1)
467 | Query_output_results=OncoSigRF(Network_matrix, Gold_Standard, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1)
468 | Query_output_results=OncoSigRF(Network, Gold_Standard, Fraction_Gold_sample=0.5, ntrees=50, max_iterations=50, balance=1, to_save=1)
469 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
470 | df_1=read.delim("./Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
471 | df_1=read.delim("./Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
472 | df_2=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE)
473 | df_2=read.delim("./Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE)
474 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),
475 | c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),
476 | c(0,0.01,0.05))
477 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2,
478 | the_bins=the_bins,correlated_features =list())
479 | predictions[1:5]
480 | Network_location="./Input_data_files/COAD/original_network.txt"
481 | Network=read.delim(Network_location,header=F)
482 | Network$V1=as.character(Network$V1)
483 | Network$V2=as.character(Network$V2)
484 | Network$V3=as.numeric(Network$V3)
485 | Network=as.matrix(Network)
486 | Network[,3]=as.numeric(Network[,3])
487 | Network_matrix=listToMatrix(Network)
488 | # part 2
489 | Network_location="./Input_data_files/LUAD/original_network_sample.txt"
490 | Network=read.delim(Network_location,header=F)
491 | Network$V1=as.character(Network$V1)
492 | Network$V2=as.character(Network$V2)
493 | Network$V3=as.numeric(Network$V3)
494 | Network=as.matrix(Network)
495 | Network[,3]=as.numeric(Network[,3])
496 | Network[1:5m]
497 | Network[1:5,]
498 | Network_matrix=listToMatrix(Network)
499 | Gold_Standard_location= "./Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt"
500 | Gold_Standard=read.delim(Gold_Standard_location,header=F)
501 | Gold_Standard$V1=as.character(Gold_Standard$V1)
502 | Network_matrix_df=as.data.frame(Network_matrix)
503 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df), Gold_Standard$V1)
504 | Negative_Set_names=setdiff(rownames(Network_matrix_df), Gold_Standard_in_Network_names)
505 | remove(Network_matrix)
506 | Query_output_results=OncoSigRF(Network_matrix_df, Gold_Standard_in_Network_names, max_iterations=5)
507 | Query_output_results_scores=as.data.frame(Query_output_results[[1]])
508 | View(Query_output_results_scores)
509 | KRAS_features= "./Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt"
510 | EGFR_forest= "./Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r"
511 | results=OncoSigUnsup(KRAS_features,EGFR_forest)
512 | View(results)
513 |
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/build_options:
--------------------------------------------------------------------------------
1 | auto_roxygenize_for_build_and_reload="1"
2 | auto_roxygenize_for_build_package="1"
3 | auto_roxygenize_for_check="1"
4 | live_preview_website="1"
5 | makefile_args=""
6 | preview_website="1"
7 | website_output_format="all"
8 |
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/cpp-definition-cache:
--------------------------------------------------------------------------------
1 | [
2 | ]
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/pcs/files-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 | "path" : "~/Documents/Github/OncoSig_main/OncoSig/Input_data_files/LUAD/OncoSigUnsup",
3 | "sortOrder" : [
4 | {
5 | "ascending" : true,
6 | "columnIndex" : 2
7 | }
8 | ]
9 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/pcs/source-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 | "activeTab" : -1
3 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/pcs/windowlayoutstate.pper:
--------------------------------------------------------------------------------
1 | {
2 | "left" : {
3 | "panelheight" : 554,
4 | "splitterpos" : 181,
5 | "topwindowstate" : "HIDE",
6 | "windowheight" : 592
7 | },
8 | "right" : {
9 | "panelheight" : 554,
10 | "splitterpos" : 355,
11 | "topwindowstate" : "NORMAL",
12 | "windowheight" : 592
13 | }
14 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/pcs/workbench-pane.pper:
--------------------------------------------------------------------------------
1 | {
2 | "TabSet1" : 0,
3 | "TabSet2" : 0,
4 | "TabZoom" : {
5 | }
6 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/persistent-state:
--------------------------------------------------------------------------------
1 | build-last-errors="[]"
2 | build-last-errors-base-dir="~/Documents/Github/OncoSig_main/OncoSig/"
3 | build-last-outputs="[{\"output\":\"==> devtools::document(roclets = c('rd', 'collate', 'namespace'))\\n\\n\",\"type\":0},{\"output\":\"Updating OncoSig documentation\\n\",\"type\":2},{\"output\":\"First time using roxygen2. Upgrading automatically...\\n\",\"type\":2},{\"output\":\"Loading OncoSig\\n\",\"type\":2},{\"output\":\"Warning: The existing 'NAMESPACE' file was not generated by roxygen2, and will not be overwritten.\\nWarning message:\\nroxygen2 requires Encoding: UTF-8 \\n\",\"type\":2},{\"output\":\"Documentation completed\\n\\n\",\"type\":1},{\"output\":\"==> R CMD INSTALL --no-multiarch --with-keep.source OncoSig\\n\\n\",\"type\":0},{\"output\":\"* installing to library ‘/Library/Frameworks/R.framework/Versions/4.0/Resources/library’\\n\",\"type\":1},{\"output\":\"* installing *source* package ‘OncoSig’ ...\\n\",\"type\":1},{\"output\":\"** using staged installation\\n\",\"type\":1},{\"output\":\"** R\\n\",\"type\":1},{\"output\":\"** byte-compile and prepare package for lazy loading\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** help\\n\",\"type\":1},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:88: unexpected section header '\\\\keyword'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:89: unexpected section header '\\\\keyword'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:90: unexpected section header '\\\\keyword'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/OncoSigRF.Rd:91: unexpected END_OF_INPUT '\\n\",\"type\":2},{\"output\":\"'\\n\",\"type\":1},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd:10: unexpected UNKNOWN '\\\\warning'\\n\",\"type\":2},{\"output\":\"Warning: 
/Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd:13: unexpected '}'\\n\",\"type\":2},{\"output\":\"Warning: /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd:11: All text must be in a section\\n\",\"type\":2},{\"output\":\"\",\"type\":1},{\"output\":\"*** installing help indices\\n\",\"type\":1},{\"output\":\"** building package indices\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** installing vignettes\\n\",\"type\":1},{\"output\":\"** testing if installed package can be loaded from temporary location\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package can be loaded from final location\\n\",\"type\":1},{\"output\":\"\",\"type\":1},{\"output\":\"** testing if installed package keeps a record of temporary installation path\\n\",\"type\":1},{\"output\":\"* DONE (OncoSig)\\n\",\"type\":1},{\"output\":\"\",\"type\":1}]"
4 | compile_pdf_state="{\"errors\":[],\"output\":\"\",\"running\":false,\"tab_visible\":false,\"target_file\":\"\"}"
5 | files.monitored-path=""
6 | find-in-files-state="{\"handle\":\"\",\"input\":\"\",\"path\":\"\",\"regex\":false,\"results\":{\"file\":[],\"line\":[],\"lineValue\":[],\"matchOff\":[],\"matchOn\":[]},\"running\":false}"
7 | imageDirtyState="0"
8 | saveActionState="0"
9 |
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/rmd-outputs:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/saved_source_markers:
--------------------------------------------------------------------------------
1 | {"active_set":"","sets":[]}
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/1B64C678:
--------------------------------------------------------------------------------
1 | {
2 | "cursorPosition" : "35,21",
3 | "scrollLine" : "32"
4 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/1CC714F7:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/27C419B:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/27EB650E:
--------------------------------------------------------------------------------
1 | {
2 | "cursorPosition" : "24,64",
3 | "scrollLine" : "16"
4 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/2EE4D19B:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/453A360A:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/4668FDBC:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/4EC81EA2:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/6CEE29D:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/6E2F621B:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/71CD7210:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/78F86D91:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/908CD31C:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/A67B5CF0:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/C0746A86:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/D6FEEFCC:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/D71BEEF9:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/F8F7728B:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/.Rproj.user/F983FB94/sources/prop/INDEX:
--------------------------------------------------------------------------------
1 | ~%2FDocuments%2FGithub%2FOncoSig%2FNAMESPACE="D71BEEF9"
2 | ~%2FDocuments%2FGithub%2FOncoSig%2FRead-and-delete-me="D6FEEFCC"
3 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FDESCRIPTION="A67B5CF0"
4 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FNAMESPACE="78F86D91"
5 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FR%2FOncoSig.R="6E2F621B"
6 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FR%2FOncoSigNB.Rd="C0746A86"
7 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FRead-and-delete-me="2EE4D19B"
8 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FcomputeLRsgivenBins.Rd="453A360A"
9 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FgetFinalLR.Rd="1CC714F7"
10 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FgetLRsgivenBin_info.Rd="908CD31C"
11 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FgetMaxLR.Rd="4668FDBC"
12 | ~%2FDocuments%2FGithub%2FOncoSig%2Fman%2FlistToMatrix.Rd="4EC81EA2"
13 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FDiana_Dec4_Run_RF_COAD_KRAS.txt="27EB650E"
14 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2FInput_data_files%2FLUAD%2FREADME.txt="F8F7728B"
15 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2FREADME.md="27C419B"
16 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2Fvignettes%2FOncoSig-concordance.tex="71CD7210"
17 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2Fvignettes%2FOncoSig.log="6CEE29D"
18 | ~%2FDocuments%2FGithub%2FOncoSig_main%2FOncoSig%2Fvignettes%2FOncoSig.tex="1B64C678"
19 |
--------------------------------------------------------------------------------
/.Rproj.user/shared/notebooks/patch-chunk-names:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/.Rproj.user/shared/notebooks/patch-chunk-names
--------------------------------------------------------------------------------
/.Rproj.user/shared/notebooks/paths:
--------------------------------------------------------------------------------
1 | /Users/sunnyjones/Documents/Github/OncoSig_main/Diana_Dec4_Run_RF_COAD_KRAS.txt="87298F43"
2 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/Input_data_files/LUAD/README.txt="2A438251"
3 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/README.md="A09906A"
4 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/R/OncoSigNB.Rd="DBC78E44"
5 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/computeLRsgivenBins.Rd="D96AD446"
6 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/getFinalLR.Rd="422015D3"
7 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/getLRsgivenBin_info.Rd="2B1FC101"
8 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/getMaxLR.Rd="B669C0BF"
9 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/man/listToMatrix.Rd="96DBECF"
10 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/vignettes/OncoSig-concordance.tex="E951F25E"
11 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/vignettes/OncoSig.log="8523CE22"
12 | /Users/sunnyjones/Documents/Github/OncoSig_main/OncoSig/vignettes/OncoSig.tex="1D4E4DDC"
13 |
--------------------------------------------------------------------------------
/All_forests.r:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/All_forests.r
--------------------------------------------------------------------------------
/All_forests_KRAS_Ingenuity.r:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/All_forests_KRAS_Ingenuity.r
--------------------------------------------------------------------------------
/All_forests_KRAS_MSigDB.r:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/All_forests_KRAS_MSigDB.r
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: OncoSig
2 | Type: Package
3 | Title: What the package does (short line)
4 | Version: 1.0
5 | Date: 2018-10-14
6 | Author: Who wrote it
7 | Maintainer: Who to complain to
8 | Description: More about what it does (maybe more than one line)
9 | License: What license is it under?
10 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 |
--------------------------------------------------------------------------------
/OncoSig.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | PackageRoxygenize: rd,collate,namespace
19 |
--------------------------------------------------------------------------------
/R/NBfunctions.R:
--------------------------------------------------------------------------------
1 | source("R/rFunctions.R")
2 | #These sets of functions allow the creation of a binary Naive Bayes classifier, given two sets of data.
3 | #1. The feature matrix
4 | #2. The bin parameters.
5 |
6 | #1.Feature Matrix
7 | #The first column in the feature matrix is the name of the instance (e.g. the name of a protein)
8 | #The second column is the label (either a 1 or 0)
9 | #All later columns are feature values, which can be either discrete or continuous numeric values, or "NA"
10 | #For example:
11 | # V1 df_labels feature_1 feature_2
12 | #1 Q16539 1 6.76840e+02 1.000e+00
13 | #2 P78383 1 NA NA
14 | #3 P30281 1 NA 1.000e+00
15 |
16 | #2. Bin Parameters
17 | #Bin parameters are passed to the function as a list of numeric vectors, with one vector of break points per feature. For example:
18 | #the_bins=list(c(0,40,200,1200),c(0,.1))
19 | #will bin the first feature into bins corresponding to: 0-40,40-200,200-1200,1200-Inf. "NA's" (i.e. no feature present) are given a separate bin for each feature.
20 | #NOTE: Due to the way that "NA"'s are imputed, features should not have any values less than -99999999999999999999999, or greater than 9999999999999999999999999999999999999999.
21 | #NOTE: These functions assume that Naive Bayes is used as a binary classifier, where there are only two labels in the response vector:1 or 0.
22 |
# Train the binned Naive Bayes model: for every feature column, discretise the
# values with the supplied break points and compute one likelihood ratio (LR)
# per bin via getLRsgivenBin_info().
#
# Arguments:
#   df_1     - data frame; column 1 is the instance name, column 2 the label,
#              every later column is a feature (NA allowed).
#   the_bins - list with one numeric vector of break points per feature, in the
#              same order as the feature columns.
# Returns:
#   A list with one element per feature column, each element being the value
#   returned by getLRsgivenBin_info() for that feature. A data frame without
#   feature columns yields an empty list.
NaiveBayesBin <- function(df_1, the_bins) {
  # Outer sentinels so that every value falls between two break points.
  the_min <- -999999999999999999999999999999999999999999999
  the_max <- 9999999999999999999999999999999999999999999
  # Impute all NAs with a very small value; with right = FALSE below they end
  # up in the first bin, i.e. [the_min, first break).
  df_1[is.na(df_1)] <- -99999999999999999999999

  # Feature columns are everything after name + label. setdiff() (rather than
  # 3:ncol(df_1)) gives an empty set instead of c(3, 2) when there are none.
  feature_cols <- setdiff(seq_len(ncol(df_1)), 1:2)

  lapply(feature_cols, function(col_idx) {
    # The breaks for a feature sit two positions before its column index.
    # `[...][[1]]` is kept on purpose: it yields NULL (a single catch-all bin)
    # instead of an error when `the_bins` is shorter than the feature count.
    breaks <- the_bins[col_idx - 2][[1]]
    bin_codes <- .bincode(df_1[, col_idx], c(the_min, breaks, the_max),
                          right = FALSE)
    getLRsgivenBin_info(bin_codes, breaks, df_1[, 2])
  })
}
# Apply a trained model to a new data set: every feature value is binned with
# the same break points used for training and then replaced by the likelihood
# ratio stored for that bin.
#
# Arguments:
#   df_1          - data frame laid out like the training frame (name, label,
#                   then feature columns); a label column must be present even
#                   though it is not used for the lookup.
#   the_bins      - list of break-point vectors, one per feature.
#   the_bins_info - list of per-feature bin -> LR vectors, as returned by
#                   NaiveBayesBin().
# Returns:
#   A copy of df_1 whose feature columns hold LRs instead of raw values. A
#   frame without feature columns is returned as is (after NA imputation).
computeLRsgivenBins <- function(df_1, the_bins, the_bins_info) {
  # Outer sentinels so that every value falls between two break points.
  the_min <- -999999999999999999999999999999999999999999999
  the_max <- 9999999999999999999999999999999999999999999
  # Impute all NAs with a very small value so they land in the first bin.
  df_1[is.na(df_1)] <- -99999999999999999999999
  df_1_copy <- df_1

  # setdiff() avoids the 3:2 countdown that 3:ncol(df_1) produces when the
  # frame only has the name and label columns.
  feature_cols <- setdiff(seq_len(ncol(df_1)), 1:2)

  for (col_idx in feature_cols) {
    # Column i uses the (i - 2)-th entry of both lists.
    bin_codes <- .bincode(df_1_copy[, col_idx],
                          c(the_min, the_bins[col_idx - 2][[1]], the_max),
                          right = FALSE)
    df_1_copy[, col_idx] <- replaceBinswithLR(bin_codes,
                                              the_bins_info[col_idx - 2][[1]])
  }
  df_1_copy
}
70 |
# Given a vector of bin codes and the label vector of the same length, return
# one likelihood ratio (LR) per bin code that occurs in bin_vector.
#
# The LR of a bin is the ratio (count of 2nd label value / count of 1st label
# value) among the instances in that bin, divided by the same ratio over all
# instances. With 0/1 labels this is (n1_bin / n0_bin) / (n1 / n0).
#
# Arguments:
#   bin_vector   - bin code of every instance (e.g. output of .bincode()).
#   the_bin      - break points of the feature; accepted for interface
#                  compatibility, not used in the computation.
#   label_vector - label of every instance, same length as bin_vector.
# Returns:
#   A numeric vector of LRs named by bin code, sorted by bin code.
#   NOTE: when labels are a plain (non-factor) vector, a bin in which only one
#   label value occurs yields NA, because its label table has a single entry.
getLRsgivenBin_info <- function(bin_vector, the_bin, label_vector) {
  # Ratio of the second to the first entry of the label frequency table.
  label_ratio <- function(labels) {
    counts <- table(labels)
    as.numeric(counts[2]) / as.numeric(counts[1])
  }

  prior <- label_ratio(label_vector)
  bins_present <- unique(sort(bin_vector))
  lr_by_bin <- rep(0, length(bins_present))

  for (k in seq_along(bins_present)) {
    # Select by the actual bin code, not by the loop position: the codes that
    # occur need not be 1..k (e.g. only bins 1 and 3 may be populated).
    in_bin <- bin_vector == bins_present[k]
    lr_by_bin[k] <- label_ratio(label_vector[in_bin]) / prior
  }

  names(lr_by_bin) <- bins_present
  lr_by_bin
}
95 |
# Combine the per-feature likelihood ratios of every instance into one final
# LR by multiplying them.
#
# Arguments:
#   df_1 - data frame whose columns from the third onwards hold numeric LRs
#          (columns 1 and 2 are skipped).
# Returns:
#   A list with one element per row of df_1: the product of that row's values
#   in the feature columns. A frame with zero rows yields an empty list.
getFinalLR <- function(df_1) {
  feature_cols <- setdiff(seq_len(ncol(df_1)), 1:2)
  # seq_len() (not 1:nrow) so a zero-row frame is not visited as rows 1 and 0.
  lapply(seq_len(nrow(df_1)), function(row_idx) {
    prod(df_1[row_idx, feature_cols])
  })
}
109 |
# For every row of the supplied columns, return the largest value in that row.
#
# Arguments:
#   df_1 - data frame containing only the columns to compare (e.g. the LRs of
#          a group of correlated features).
# Returns:
#   A vector with one maximum per row of df_1, in row order; NULL when df_1
#   has no rows.
getMaxLR <- function(df_1) {
  # seq_len() (not 1:nrow) so a zero-row frame is not visited as rows 1 and 0,
  # which used to produce c(NA, -Inf) plus a warning.
  row_maxima <- lapply(seq_len(nrow(df_1)), function(row_idx) {
    max(df_1[row_idx, ])
  })
  unlist(row_maxima)
}
# Given the bin info and binned data, replace each bin code with the
# corresponding likelihood ratio.
#
# Arguments:
#   bin_vector   - bin code of every instance.
#   the_bin_info - LRs named by bin code, as returned by getLRsgivenBin_info().
# Returns:
#   A vector as long as bin_vector holding the LR looked up for each code; its
#   names are the codes that were looked up. Codes without an entry in
#   the_bin_info give NA. An empty bin_vector gives an empty result.
replaceBinswithLR <- function(bin_vector, the_bin_info) {
  # Name-based indexing is vectorised, so one lookup replaces the former
  # element-by-element loop (which also mis-iterated over 1:0 on empty input).
  # unlist() keeps the result flat should the_bin_info be supplied as a list.
  unlist(the_bin_info[as.character(bin_vector)])
}
137 |
--------------------------------------------------------------------------------
/R/OncoSig.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/OncoSig.R
--------------------------------------------------------------------------------
/R/OncoSigNB.R:
--------------------------------------------------------------------------------
# Run the binned Naive Bayes classifier: learn per-bin likelihood ratios (LRs)
# on the training set, apply them to the testing set and return one final LR
# per testing instance.
#
# Arguments:
#   training_set        - data frame (name, label, feature columns) used to
#                         compute the per-bin LRs.
#   testing_set         - data frame with the same layout to be scored; its
#                         names are read from the column called V1.
#   the_bins            - list of break-point vectors, one per feature.
#   correlated_features - optional names of testing_set columns that are
#                         correlated with each other. Their LRs are replaced by
#                         a single per-row maximum (added as column MS_max) so
#                         they are not multiplied together. May be omitted or
#                         empty, in which case no correction is applied.
# Returns:
#   A vector of final LRs (product over the feature columns of each row),
#   named by testing_set$V1.
OncoSigNB <- function(training_set, testing_set, the_bins,
                      correlated_features = NULL) {
  message("\tBinning features based on holdout set\n")
  the_bins_info <- NaiveBayesBin(training_set, the_bins)
  testing_set <- computeLRsgivenBins(testing_set, the_bins, the_bins_info)

  # Collapse correlated features into their maximum LR, if any were passed.
  if (length(correlated_features) > 0) {
    correlated_lrs <- testing_set[correlated_features]
    message("\tCorrecting for correlated features \n")
    ms_max <- getMaxLR(correlated_lrs)

    # An LR of 1 is neutral in the product taken by getFinalLR().
    testing_set[correlated_features] <- 1
    testing_set$MS_max <- ms_max
  }

  message("\tExtracting predicted LR_posterior holdout set\n")
  the_results <- unlist(getFinalLR(testing_set))
  names(the_results) <- testing_set$V1

  the_results
}
23 |
--------------------------------------------------------------------------------
/R/OncoSigRF.R:
--------------------------------------------------------------------------------
1 |
# Run the OncoSig random-forest learner on the supplied network using the
# supplied gold standard.
#
# In every iteration a random part of the gold standard (positives) and a
# random sample of the negative set are used to train a random forest; all
# rows that were not part of that training sample are then scored with it.
#
# Arguments:
#   Network_matrix_df              - data frame of features; row names are the
#                                    instance (protein) names.
#   Gold_Standard_in_Network_names - row names that form the gold standard.
#   Fraction_Gold_sample           - fraction of the gold standard sampled for
#                                    training in each iteration (default .5).
#   ntrees                         - trees per forest (default 50).
#   max_iterations                 - number of forests to train (default 20).
#   balance                        - negatives sampled per positive
#                                    (default 1).
#   to_save                        - 1 to keep every forest and save the list
#                                    to "All_forests.r" in the working
#                                    directory (default 0).
#   Negative_Set_names             - row names to sample negatives from.
#                                    Defaults to every row name that is not in
#                                    the gold standard. Earlier versions read a
#                                    global variable of this name; pass it
#                                    explicitly to use a custom negative set.
# Returns:
#   An unnamed list of three data frames:
#     [[1]] mean score per row over all iterations (column "Score"),
#     [[2]] score per row and iteration (NA where the row was used to train),
#     [[3]] MeanDecreaseAccuracy importance per feature and iteration.
OncoSigRF <- function(Network_matrix_df, Gold_Standard_in_Network_names,
                      Fraction_Gold_sample = NULL, ntrees = NULL,
                      max_iterations = NULL, balance = NULL, to_save = NULL,
                      Negative_Set_names = NULL) {
  if (is.null(Fraction_Gold_sample)) Fraction_Gold_sample <- .5
  if (is.null(ntrees)) ntrees <- 50
  if (is.null(max_iterations)) max_iterations <- 20
  if (is.null(balance)) balance <- 1
  if (is.null(to_save)) to_save <- 0
  if (is.null(Negative_Set_names)) {
    # Negative set = all proteins not in the gold standard.
    Negative_Set_names <- setdiff(rownames(Network_matrix_df),
                                  Gold_Standard_in_Network_names)
  }

  message("Running OncoSig")
  message("Fraction of Gold Standard to train on for each Random Forest: ",
          Fraction_Gold_sample)
  message("Number of Trees per Iteration: ", ntrees)
  message("Number of Iterations: ", max_iterations)
  message("Balance: ", balance)

  # Number to sample from the positive and negative sets in each iteration.
  Num_to_sample <- floor(Fraction_Gold_sample *
                           length(Gold_Standard_in_Network_names))
  message("Number of positive results to sample: ", Num_to_sample)
  Num_to_sample_negative <- Num_to_sample * balance
  message("Number of negative results to sample: ", Num_to_sample_negative)

  QueryResults_scores <- data.frame(row.names = rownames(Network_matrix_df))
  importance_df <- data.frame()
  all_forests <- list()
  # Features tried at each split: square root of the feature count. This does
  # not change between iterations, so it is computed once.
  mtry <- floor(ncol(Network_matrix_df)^.5)

  for (i in seq_len(max_iterations)) {
    Gold_sample <- sample(Gold_Standard_in_Network_names, Num_to_sample)
    Negative_sample <- sample(Negative_Set_names, Num_to_sample_negative)
    training_names <- c(Gold_sample, Negative_sample)
    label_vector <- as.factor(c(rep(1, Num_to_sample),
                                rep(0, Num_to_sample_negative)))

    # Rows to score with this forest: everything it was not trained on.
    Not_in_Gold_or_Negative_Sample <- setdiff(rownames(Network_matrix_df),
                                              training_names)

    message("Performing Random Forest")
    message("mtry equals ", mtry)
    result <- randomForest(Network_matrix_df[training_names, ], label_vector,
                           ntree = ntrees, importance = TRUE,
                           do.trace = TRUE, mtry = mtry)
    if (to_save == 1) {
      all_forests[[i]] <- result
    }

    # Column 2 of the probability matrix belongs to label "1".
    Query_results <- predict(
      result, type = "prob",
      newdata = Network_matrix_df[Not_in_Gold_or_Negative_Sample, ]
    )
    QueryResults_scores[Not_in_Gold_or_Negative_Sample, i] <-
      Query_results[Not_in_Gold_or_Negative_Sample, 2]

    # Feature importance of this forest, using mean decrease in accuracy.
    importance <- as.data.frame(result$importance)
    importance <- importance[order(importance$MeanDecreaseAccuracy,
                                   decreasing = TRUE), ]
    importance_vector <- importance$MeanDecreaseAccuracy
    names(importance_vector) <- rownames(importance)
    importance_df[names(importance_vector), i] <- importance_vector

    if (i > 2) {
      message("At iteration ", i)
    }
  }

  # Average over the iterations in which a row was scored (NA = used to train).
  QueryResults_scores_average <-
    as.data.frame(rowMeans(QueryResults_scores, na.rm = TRUE))
  colnames(QueryResults_scores_average) <- c("Score")

  if (to_save == 1) {
    save(all_forests, file = "All_forests.r")
  }
  list(QueryResults_scores_average, QueryResults_scores, importance_df)
}
86 |
--------------------------------------------------------------------------------
/R/OncoSigUnsup.R:
--------------------------------------------------------------------------------
1 | source("./R/rFunctions.R")
# Score a network with previously trained random forests.
#
# Arguments:
#   Network_location - path to a tab-delimited, header-less file with three
#                      columns: two names and a numeric score.
#   forest_location  - path to a file written by save() that contains a list
#                      called `all_forests` (OncoSigRF writes one when called
#                      with to_save = 1).
# Returns:
#   A data frame with one row per row of the network matrix and a single
#   column `the_means`: the mean, over all forests, of the predicted
#   probability in column 2 of the "prob" output.
OncoSigUnsup <- function(Network_location, forest_location) {
  # NOTE(review): load() restores arbitrary R objects into this function's
  # environment; only use forest files from a trusted source.
  loaded_objects <- load(forest_location, verbose = TRUE)
  if (!"all_forests" %in% loaded_objects) {
    stop("'", forest_location, "' does not contain an object named ",
         "'all_forests'", call. = FALSE)
  }

  Network <- read.delim(Network_location, header = FALSE)
  Network$V1 <- as.character(Network$V1)
  Network$V2 <- as.character(Network$V2)
  Network$V3 <- as.numeric(Network$V3)
  Network <- as.matrix(Network)

  # Convert to matrix. Missing values are imputed as 0, so make sure your
  # scores range from greater than zero upwards!
  Network[, 3] <- as.numeric(Network[, 3])
  Network_matrix <- listToMatrix(Network)

  # One column of predicted probabilities per forest.
  result_matrix <- matrix(nrow = nrow(Network_matrix),
                          ncol = length(all_forests))
  rownames(result_matrix) <- rownames(Network_matrix)
  # seq_along() so an empty forest list is not indexed as 1, 0.
  for (i in seq_along(all_forests)) {
    Query_results <- predict(all_forests[[i]], newdata = Network_matrix,
                             type = "prob")
    result_matrix[, i] <- Query_results[, 2]
  }

  # Keep the intermediate name: it becomes the column name of the result.
  the_means <- rowMeans(result_matrix)
  as.data.frame(the_means)
}
25 |
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/.functions.R.swo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/.functions.R.swo
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/.functions.R.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/.functions.R.swp
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/ONCOSIG_README.md:
--------------------------------------------------------------------------------
1 |
2 | # OncoSig-RF Overview
3 | Written by: Joshua Broyde (2/23/2018)
4 |
5 | OncoSig-RF is an algorithm for determining novel sets of proteins that support the activity of an oncogene or tumor suppressor (i.e. Oncoprotein-Centric Map, or OC-map). Given a gold standard and an input molecular interaction network, the algorithm uses a random forest classifier to discover novel members of the OC-Map.
6 |
7 | # OncoSig-RF Input and Options
8 |
9 | OncoSig-RF requires 2 input files:
10 | 1. A network of interactions, and
11 | 2. A gold standard of pathway members (i.e. members of the OC-Map) to train on.
12 |
13 | The network is a tab delimited dataframe of 3 columns, Gene A, Gene B and the strength of the interaction. This can range from 0 to infinite, but should not be a negative number. Note that the dataframe must be redundant, so that the interaction is represented twice, between A and B and B and A.
14 |
15 | The other required inputs are as follows:
16 | The maximum number of iterations for monte-carlo cross validation. (Default=50)
17 | The fraction of the gold standard to train on at each iteration. (Default=.5)
18 | The number of trees to create in each forest in the random forest classifier. (Default=50)
19 | The number of proteins in the negative standard for each training in the random forest (default=same number as picked for the gold standard). This is the balance option. (Default=1)
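20 | Note that the "balance" option picks the same *number* of proteins, not the same percentage: if there are 200 proteins in the gold standard, by default 50% (100 proteins) are picked at each iteration, and with balance=1 the same number (100) of negative proteins is picked.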
21 |
22 | To run OncoSig-RF from the bash command line:
23 |
24 | source location/to/OncoSig-RF/runOncoSig-RFscript_wrapper.sh location/input_network.txt
25 |
26 | # OncoSig-RF Example
27 |
28 | An example script for running OncoSig-RF is included in the "runOncoSig-RFscript.R" script. We will now run OncoSig-RF step-by-step from that script to discover novel members of Kras-regulated pathways.
29 | We will run OncoSig-RF using `test_network.txt` in the `test` directory.
30 |
31 | In this example `test_network.txt` is the network file and `gold_standard.txt` is the gold standard. There are 50 iterations, .5 of the gold standard is used for training in each iteration, there are 75 trees in each iteration, and the balance equals 1, which means that the same number of negative proteins is taken at each iteration for training. If balance were equal to 2, twice as many negative proteins would be sampled at each iteration.
32 |
33 | OncoSig-RF uses the randomForest and MASS packages as well as internal functions:
34 |
35 | library(randomForest)
36 | library(MASS)
37 | library(Matrix)
38 | library('getopt')
39 | #Change this depending on where the functions are located
40 | source("location/of/OncoSig-RF/functions.R")
41 |
42 | If you do not have randomForest or MASS, then first install them:
43 |
44 | install.packages("randomForest")
45 | install.packages("MASS")
46 |
47 | Get the location of the network and Gold Standard:
48 |
49 | Network_location="Test/test_network.txt"
50 | Gold_Standard_location="Test/gold_standard.txt"
51 |
52 | Read in the network, the Gold Standard and the other parameters. The network must be tab delimited. The first two columns are the names, and the third column is the strength of the interaction:
53 |
54 | arg <- commandArgs(trailingOnly = TRUE)
55 | args=as.vector(arg);
56 | Network_location=args[[1]]
57 | message("Network location: ", Network_location, sep="" )
58 | Gold_Standard_location=args[[2]]
59 | message("Gold_Standard_location: ",Gold_Standard_location,sep="")
60 | max_iterations=args[[3]]
61 | max_iterations=as.numeric(max_iterations)
62 | Fraction_Gold_sample=args[[4]]
63 | Fraction_Gold_sample=as.numeric(Fraction_Gold_sample)
64 | ntrees=args[[5]]
65 | ntrees=as.numeric(ntrees)
66 | balance=as.numeric(args[7])
67 |
68 |
69 | The Network matrix looks like this. Note that it is symmetric:
70 |
71 | Q13131_PREPPI P14625 1.111887e+03
72 | P14625_PREPPI Q13131 1.111887e+03
73 | P37058_PREPPI P15428 1.502400e+03
74 | P15428_PREPPI P37058 1.502400e+03
75 | Q8IY84_PREPPI Q9Y3S1 7.255526e+02
76 | Q9Y3S1_PREPPI Q8IY84 7.255526e+02
77 | Q13315_PREPPI Q96T68 2.535267e+03
78 | Q96T68_PREPPI Q13315 2.535267e+03
79 | P27348_PREPPI O75385 1.084084e+04
80 | O75385_PREPPI P27348 1.084084e+04
81 |
82 | In this particular example, only PREPPI protein-protein interactions are represented. However, other interaction types may be included as well.
83 |
84 | Next, we will convert the network list (e.g. an adjacency list) to an adjacency matrix. Note that this may take a few minutes if the network is very large.
85 |
86 |
87 | Network=read.delim(Network_location,header=F)
88 | Network$V1=as.character(Network$V1)
89 | Network$V2=as.character(Network$V2)
90 | Network$V3=as.numeric(Network$V3)
91 | Network=as.matrix(Network)
92 | Gold_Standard=read.delim(Gold_Standard_location,header=F)
93 | Gold_Standard$V1=as.character(Gold_Standard$V1)
94 |
95 | This converts the network to a matrix. Imputes non-interactions as 0.
96 | Network[,3]=as.numeric(Network[,3])
97 | Network_matrix=listToMatrix(Network)
98 |
99 | The Network_matrix looks like this. Zero indicates no edge between nodes:
100 |
101 | Q13131_PREPPI P14625_PREPPI P37058_PREPPI P15428_PREPPI
102 | P14625 1111.887 25258.640 0.000 0.0
103 | Q13131 8911.691 1111.887 0.000 0.0
104 | P15428 0.000 0.000 1502.400 0.0
105 | P37058 0.000 0.000 2079.157 1502.4
106 |
107 | Convert Matrix to Dataframe for future steps
108 | Network_matrix_df=as.data.frame(Network_matrix)
109 |
110 | Remove members of the gold standard that are not present in the network.
111 |
112 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df),Gold_Standard$V1)
113 |
114 | Retrieve the negative set (i.e. all proteins not in the gold standard):
115 | Negative_Set_names=setdiff(rownames(Network_matrix_df),Gold_Standard_in_Network_names)
116 |
117 | Next, run the random forest classifier. The Random forest classifier will train on a portion of the gold standard and a sample of negative standard of the same size. A variant of repeated random sub-sampling validation is used to train the classifier. To do this, a fraction of the gold standard is randomly sampled from the data, and a random sample of the negative set of the same size is also sampled. A random forest is created with a number of trees. To generate new predictions, the score of each protein is predicted only with the random forests that were not used to train it. In the example script, the Kras gold standard has 250 members, so each random forest will be trained on 250 (125 + 125) proteins total.
118 |
119 | If the set that you are using has a very small number of proteins in it (e.g. 3-30), I recommend using a larger fraction of the gold standard and
120 | more iterations.
121 |
122 |
123 | Query_output_results=runOncoSig-RF(Network_matrix_df,Gold_Standard_in_Network_names,max_iterations = max_iterations, Fraction_Gold_sample =Fraction_Gold_sample,ntrees = ntrees, balance = balance)
124 | Query_output_results_scores=Query_output_results[[1]]
125 | write.table(Query_output_results_scores,file="OncoSig-RF_results.txt",row.names = TRUE, col.names=FALSE,quote = FALSE,sep="\t")
126 | save(Query_output_results,Query_output_results_scores,Gold_Standard_location,Network_location,Gold_Standard_in_Network_names, file="OncoSig-RF_objects.R")
127 |
128 | Now evaluate performance using a ROC Curve:
129 |
130 | library(ROCR)
131 | Query_output_results_scores=Query_output_results[[1]]
132 | #See how good the performance is:
133 | Query_output_results_scores$label=0
134 | Query_output_results_scores[Gold_Standard_in_Network_names,2]=1
135 | pred=prediction(Query_output_results_scores$Score,Query_output_results_scores$label)
136 | pdf("Performance.pdf",height=5,width=5)
137 | perf=performance(pred,measure = "tpr", x.measure = "fpr")
138 | plot(perf,col='red') #Plot the ROC curve
139 | abline(a=0,b=1);
140 |
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/ONCOSIG_README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/ONCOSIG_README.pdf
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/Test/OncoSig_objects.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/Test/OncoSig_objects.R
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/Test/Performance.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/R/Oncosig-RF/OncoSig/Test/Performance.pdf
--------------------------------------------------------------------------------
/R/Oncosig-RF/OncoSig/Test/gold_standard.txt:
--------------------------------------------------------------------------------
1 | C9J798
2 | O00329
3 | O00459
4 | O14610
5 | O14775
6 | O14807
7 | O14827
8 | O14920
9 | O14939
10 | O15111
11 | O15123
12 | O15211
13 | O15264
14 | O15399
15 | O15496
16 | O15520
17 | O15530
18 | O43320
19 | O43374
20 | O43561
21 | O43915
22 | O43921
23 | O60258
24 | O60262
25 | O60733
26 | O75914
27 | O76093
28 | O95267
29 | O95294
30 | O95750
31 | O96013
32 | P00519
33 | P00533
34 | P01111
35 | P01112
36 | P01116
37 | P01127
38 | P01133
39 | P01138
40 | P01308
41 | P04049
42 | P04054
43 | P04085
44 | P04629
45 | P05019
46 | P05129
47 | P05230
48 | P05771
49 | P06213
50 | P06493
51 | P07196
52 | P07333
53 | P08069
54 | P08138
55 | P08581
56 | P08620
57 | P09038
58 | P09603
59 | P09619
60 | P0C869
61 | P10301
62 | P10721
63 | P10767
64 | P11233
65 | P11234
66 | P11362
67 | P11487
68 | P12034
69 | P12931
70 | P14210
71 | P14555
72 | P14921
73 | P15036
74 | P15056
75 | P15153
76 | P15692
77 | P16220
78 | P16234
79 | P16520
80 | P16885
81 | P17252
82 | P17612
83 | P17948
84 | P19174
85 | P19419
86 | P19838
87 | P20339
88 | P20827
89 | P20936
90 | P21359
91 | P21583
92 | P21781
93 | P21802
94 | P22455
95 | P22607
96 | P22612
97 | P22694
98 | P27361
99 | P27482
100 | P27986
101 | P28482
102 | P29317
103 | P29353
104 | P31371
105 | P31749
106 | P31751
107 | P31946
108 | P34969
109 | P35609
110 | P35916
111 | P35968
112 | P36507
113 | P39877
114 | P42336
115 | P42338
116 | P42684
117 | P43403
118 | P45983
119 | P45984
120 | P47712
121 | P48023
122 | P48736
123 | P49137
124 | P49763
125 | P49765
126 | P49767
127 | P50150
128 | P50151
129 | P51148
130 | P51812
131 | P51817
132 | P52797
133 | P52798
134 | P52803
135 | P53778
136 | P53779
137 | P53816
138 | P55075
139 | P55196
140 | P55211
141 | P59768
142 | P60763
143 | P60953
144 | P61020
145 | P61224
146 | P61328
147 | P61586
148 | P61952
149 | P62070
150 | P62158
151 | P62330
152 | P62834
153 | P62873
154 | P62879
155 | P62993
156 | P63000
157 | P63211
158 | P63215
159 | P63218
160 | P98077
161 | P98177
162 | Q02750
163 | Q02763
164 | Q04206
165 | Q04864
166 | Q05586
167 | Q06124
168 | Q07817
169 | Q07889
170 | Q07890
171 | Q12879
172 | Q12967
173 | Q13009
174 | Q13043
175 | Q13153
176 | Q13177
177 | Q13224
178 | Q13393
179 | Q13480
180 | Q13554
181 | Q13557
182 | Q13671
183 | Q13972
184 | Q14644
185 | Q14957
186 | Q15283
187 | Q15311
188 | Q15349
189 | Q15389
190 | Q15418
191 | Q15759
192 | Q16539
193 | Q16644
194 | Q3MJ16
195 | Q53H76
196 | Q5R387
197 | Q68DD2
198 | Q6S5L8
199 | Q6VAB6
200 | Q7LDG7
201 | Q7Z569
202 | Q86XP0
203 | Q86YV0
204 | Q8IV61
205 | Q8IVT5
206 | Q8TD86
207 | Q8TDF6
208 | Q8WWW0
209 | Q8WYR1
210 | Q92529
211 | Q92565
212 | Q92569
213 | Q92913
214 | Q92914
215 | Q92915
216 | Q92934
217 | Q96KP1
218 | Q96PV0
219 | Q99996
220 | Q9BX93
221 | Q9BZM1
222 | Q9BZM2
223 | Q9GZP0
224 | Q9GZV9
225 | Q9HAV0
226 | Q9HCT0
227 | Q9NP95
228 | Q9NQU5
229 | Q9NRA1
230 | Q9NS23
231 | Q9NSA1
232 | Q9NZ20
233 | Q9NZK7
234 | Q9NZL6
235 | Q9NZT1
236 | Q9P212
237 | Q9P286
238 | Q9P2W3
239 | Q9UBI6
240 | Q9UHD2
241 | Q9UJF2
242 | Q9UK08
243 | Q9UK32
244 | Q9UNK4
245 | Q9UP65
246 | Q9UQC2
247 | Q9UQM7
248 | Q9Y243
249 | Q9Y264
250 | Q9Y6K9
251 |
--------------------------------------------------------------------------------
/R/analysisFunctions.R:
--------------------------------------------------------------------------------
1 | #This function runs the Naive Bayes OncoSig classifier to replicate the results presented in
2 | #the accompanying paper
3 | source("R/rFunctions.R")
4 |
5 | runNaiveBayesClassifier <- function() {
6 |   # Reproduces the paper's Naive Bayes OncoSig results via two-fold cross validation;
7 |   # returns a data frame with columns "LR_post" and "Rank", ordered by Rank.
8 |   fold_one <- read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt", header = TRUE)
9 |   fold_two <- read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt", header = TRUE)
10 |
11 |   # Bin boundaries handed to OncoSigNB, one numeric vector per list element.
12 |   the_bins <- list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),c(0,0.01,0.05))
13 |   # Columns whose names contain "MS_" are passed on as the correlated features.
14 |   correlated_features <- grep("MS_", colnames(fold_one), value = TRUE)
15 |
16 |   # Train on one fold and score the other, in both directions.
17 |   message("Calculating LR_posterior for fold two holdout set\n")
18 |   scores_from_fold_one <- OncoSigNB(fold_one, fold_two, the_bins, correlated_features)
19 |   message("Calculating LR_posterior for fold one holdout set\n")
20 |   scores_from_fold_two <- OncoSigNB(fold_two, fold_one, the_bins, correlated_features)
21 |
22 |   # Rank each fold's scores separately (rank 1 = largest value), then stack the folds.
23 |   ranked_one <- cbind(scores_from_fold_one, rank(-scores_from_fold_one))
24 |   ranked_two <- cbind(scores_from_fold_two, rank(-scores_from_fold_two))
25 |   cross_validated_predictions <- as.data.frame(rbind(ranked_one, ranked_two))
26 |   colnames(cross_validated_predictions) <- c("LR_post", "Rank")
27 |
28 |   # Order the combined table by the within-fold rank.
29 |   return(cross_validated_predictions[order(cross_validated_predictions$Rank), ])
30 | }
31 |
32 |
33 | # Draws a ROC curve by running scripts/quickROC.pl from inside the "Output_files" directory.
34 | # The arguments are pasted into the shell command unquoted; the exit status is returned invisibly.
35 | generateROCcurve <- function(object_to_create, column_to_use, predictions_file, gold_standard, pdf_outfile) {
36 |   # Restore the caller's working directory even if the command fails or is interrupted.
37 |   old_dir <- setwd("Output_files")
38 |   on.exit(setwd(old_dir), add = TRUE)
39 |   cmd <- paste("../scripts/quickROC.pl -s", object_to_create, "-c", column_to_use, predictions_file, gold_standard, pdf_outfile, sep = " ")
40 |   invisible(system(cmd))
41 | }
42 |
43 | # For a data frame with log fold change values in column 2, stores in column 3 the area under
44 | # density_null (a list with x and y) to the left of each value: one p-value per row (per shRNA).
45 | getPvalueofLogFC <- function(df_1, density_null) {
46 |   # seq_len() keeps the loop from running on an empty data frame (1:0 would visit rows 1 and 0).
47 |   for (i in seq_len(nrow(df_1))) {
48 |     Avg.pos <- df_1[i, 2]
49 |     below <- density_null$x < Avg.pos
50 |     # Integrate the density: interval widths times rollmean(y, 2), presumably zoo's pairwise mean.
51 |     xt <- diff(density_null$x[below])
52 |     yt <- rollmean(density_null$y[below], 2)
53 |     df_1[i, 3] <- sum(xt * yt)
54 |   }
55 |   # Numerical integration error can push a p-value slightly above 1; cap those at 1.
56 |   if (ncol(df_1) >= 3) {
57 |     df_1[which(df_1[, 3] > 1), 3] <- 1
58 |   }
59 |   return(df_1)
60 | }
61 |
62 | # Combines the raw p-values of rows that share a gene (e.g. several shRNAs per gene) into one value
63 | # per gene with fisherIntegration(). Input: data frame with a Gene column and raw p-values in column 2.
64 | integratePvaluesbyGene <- function(df_1) {
65 |   gene_ids <- unique(sort(df_1$Gene))
66 |   # One row per gene; the first assignment below creates the value column, named V1.
67 |   Integrated_pvalues <- data.frame(row.names = gene_ids)
68 |   for (gene in rownames(Integrated_pvalues)) {
69 |     gene_pvalues <- df_1[which(df_1$Gene == gene), 2]
70 |     Integrated_pvalues[gene, 1] <- fisherIntegration(gene_pvalues)
71 |   }
72 |   # Sort ascending by integrated p-value. drop = FALSE keeps the one-column data frame; the
73 |   # literal FALSE is used because the shorthand F is an ordinary variable that can be reassigned.
74 |   by_pvalue <- order(Integrated_pvalues$V1)
75 |   Integrated_pvalues_2 <- Integrated_pvalues[by_pvalue, , drop = FALSE]
76 |   return(Integrated_pvalues_2)
77 | }
78 | # Runs the shell script scripts/generateROC_curves_OncosigRF.sh (path relative to the working directory).
79 | generateROCcurve10OncogenePathways <- function(){
80 | system("scripts/generateROC_curves_OncosigRF.sh") # the function's value is whatever system() returns
81 | }
82 |
83 | # Computes the correlation estimate (cor.test defaults, i.e. Pearson) between every column of
84 | # df_1 and every column of df_2 and returns the estimates as one vector, df_1 column by column.
85 | getPairwiseCordataframes <- function(df_1, df_2) {
86 |   # Build the result with lapply rather than growing a list with append() inside nested loops.
87 |   estimates <- lapply(colnames(df_1), function(i) {
88 |     lapply(colnames(df_2), function(j) {
89 |       cor.test(df_1[, i], df_2[, j])$estimate
90 |     })
91 |   })
92 |   # End on the value itself so the vector is returned visibly (an assignment as the last
93 |   # expression would return it invisibly). unlist() yields NULL when there are no column pairs.
94 |   unlist(estimates)
95 | }
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/R/functionsRF.R:
--------------------------------------------------------------------------------
1 | # Converts a three-column edge list into a dense adjacency matrix. Rows are the sorted unique
2 | # values of column 2, columns are the sorted unique values of column 1, and each cell holds the
3 | # numeric weight from column 3 (0 where no edge is listed). Accepts a matrix or a data frame.
4 | listToMatrix <- function(df) {
5 |   message("Converting Network to Adjacency Matrix...")
6 |   # Coerce the name columns to plain character vectors so matrix and data frame input agree.
7 |   col_names <- as.character(unlist(df[, 1]))
8 |   row_names <- as.character(unlist(df[, 2]))
9 |   row_ids <- sort(unique(row_names))
10 |   col_ids <- sort(unique(col_names))
11 |   mat <- matrix(0, nrow = length(row_ids), ncol = length(col_ids), dimnames = list(row_ids, col_ids))
12 |   # A two-column character matrix of (row name, column name) pairs addresses one cell per edge.
13 |   mat[cbind(row_names, col_names)] <- as.numeric(unlist(df[, 3]))
14 |   message("Done.")
15 |   return(mat)
16 | }
17 |
--------------------------------------------------------------------------------
/R/rFunctions.R:
--------------------------------------------------------------------------------
1 | #For a ROC Curve (blah), given a false positive rate threshold (num) and the number of positives (num_pos), report the corresponding true positive rate on that ROC curve
2 | #example: Roc_FPR(.01,perf_ELRON_no_interactions_KRB.R,250)
3 | #This function reports back the FPR threshold used (which will be close to the threshold you input), the true positive rate, the number of true positives found at this threshold and the number of false positives at this threshold
4 | Roc_FPR <- function(num,blah,num_pos)
5 | {
6 | closest=1
7 | guess=abs(blah@x.values[[1]][1] -num)
8 | for (i in 2:length(blah@x.values[[1]])){
9 | guess2=abs(blah@x.values[[1]][[i]]-num)
10 | if (guess2 Define data, use random,
51 | ##-- or do help(data=index) for the standard data sets.
52 |
53 | ## The function is currently defined as
54 | function (df_1, the_bins)
55 | {
56 | the_min = -1e+45
57 | the_max = 1e+43
58 | df_1[is.na(df_1)] <- -1e+23
59 | df_1_copy = df_1
60 | for (i in 3:ncol(df_1)) {
61 | df_1_copy[, i] = .bincode(df_1_copy[, i], c(the_min,
62 | the_bins[i - 2][[1]], the_max), right = FALSE)
63 | }
64 | new_bin_info = list()
65 | for (i in 3:ncol(df_1)) {
66 | new_bin_info = append(new_bin_info, list(getLRsgivenBin_info(df_1_copy[,
67 | i], the_bins[i - 2][[1]], df_1_copy[, 2])))
68 | }
69 | return(new_bin_info)
70 | }
71 | }
72 | % Add one or more standard keywords, see file 'KEYWORDS' in the
73 | % R documentation directory.
74 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
75 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
76 |
--------------------------------------------------------------------------------
/man/OncoSig-package.Rd:
--------------------------------------------------------------------------------
1 | \name{OncoSig-package}
2 | \alias{OncoSig-package}
3 | \alias{OncoSig}
4 | \docType{package}
5 | \title{
6 | \packageTitle{OncoSig}
7 | }
8 | \description{
9 | \packageDescription{OncoSig}
10 | }
11 | \details{
12 |
13 | The DESCRIPTION file:
14 | \packageDESCRIPTION{OncoSig}
15 | \packageIndices{OncoSig}
16 | ~~ An overview of how to use the package, including the most important functions ~~
17 | }
18 | \author{
19 | \packageAuthor{OncoSig}
20 |
21 | Maintainer: \packageMaintainer{OncoSig}
22 | }
23 | \references{
24 | ~~ Literature or other references for background information ~~
25 | }
26 | \keyword{ package }
27 | \seealso{
28 | ~~ Optional links to other man pages, e.g. ~~
29 | ~~ \code{\link[:-package]{}} ~~
30 | }
31 | \examples{
32 | ~~ simple examples of the most important functions ~~
33 | }
34 |
--------------------------------------------------------------------------------
/man/OncoSig.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 |
--------------------------------------------------------------------------------
/man/OncoSigNB.Rd:
--------------------------------------------------------------------------------
1 | \name{OncoSigNB}
2 | \alias{OncoSigNB}
3 | \title{
4 | Run the OncoSig Naive Bayes Classifier
5 | }
6 | \description{
7 | This function runs the OncoSig Naive Bayes Classifier, utilizing the user provided binning parameters. Optionally, allows the user to provide a list of features that are statistically dependent (and thus violate the assumption of Naive Bayes). The output of this function is a dataframe of predictions in the testing set with scores based on training the classifier on the training set with corresponding likelihood ratios. Higher scores correspond to higher confidence predictions to be part of the oncogene-centric map.
8 | }
9 | \usage{
10 | OncoSigNB(training_set, testing_set, the_bins, correlated_features)
11 | }
12 | \arguments{
13 | \item{training_set}{
14 | \code{a dataframe containing the training set}
15 | }
16 | \item{testing_set}{
17 | \code{a dataframe containing the testing set}
18 | }
19 | \item{the_bins}{
20 | \code{a list of list of the binning parameters. This list of list must be in the same order of the features/columns in the training and testing dataframes}
21 | }
22 | \item{correlated_features}{
23 | \code{a list of correlated features that are statistically dependent. Pass empty list if none}
24 | }
25 | }
26 | \details{
27 | In both the training and testing set, the first column should be a unique string identifying the datapoint (e.g. a protein id), and the second column is the label (0 or 1).
28 | }
29 | \value{
30 | returns a dataframe that is the predictions of classifier on the testing set
31 |
32 | }
33 | \references{
34 | }
35 | \author{
36 |
37 | }
38 | \note{
39 |
40 |
41 | }
42 |
43 | \seealso{
44 |
45 | }
46 | \examples{
47 | #set bins
48 | df_1=read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
49 | df_2=read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE)
50 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),c(0,0.01,0.05))
51 | #specify correlated features
52 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2,the_bins=the_bins,correlated_features =list(correlated_features))
53 | }
54 | \keyword{ ~Naive Bayes }
55 |
--------------------------------------------------------------------------------
/man/OncoSigRF.Rd:
--------------------------------------------------------------------------------
1 | \name{OncoSigRF}
2 | \alias{OncoSigRF}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Create and Generate Predictions Using the OncoSig Random Forest Classifier
6 | }
7 | \description{
8 | This function creates the OncoSig Random Forest classifier, and returns predictions generated using Monte-Carlo cross validation. Optionally, the forests generated may be saved, which can be used to generate further predictions.
9 | }
10 | \usage{
11 | OncoSigRF(Network_matrix_df, Gold_Standard_in_Network_names, Fraction_Gold_sample = NULL, ntrees = NULL, max_iterations = NULL, balance = NULL, to_save = NULL)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{Network_matrix_df}{
16 | A network feature matrix, where the rownames are the points (e.g. protein or gene names) to build the classifier on, and the columns are features.
17 | }
18 | \item{Gold_Standard_in_Network_names}{
19 | A list of proteins/genes in the gold standard.
20 | }
21 | \item{Fraction_Gold_sample}{
22 | Fraction of gold standard to sample in each Monte-Carlo run. Default:.5
23 | }
24 | \item{ntrees}{
25 | Number of trees to create in each Random Forest (default:50)
26 | }
27 | \item{max_iterations}{
28 | Number of iterations of Monte-Carlo samplings to run (i.e. number of forests to create) (Default:20)
29 | }
30 | \item{balance}{
31 | Ratio of proteins not in the gold sample to sample in each run. A balance of 1 corresponds to an equal number of proteins in the gold sample and not. (Default:1)
32 | }
33 | \item{to_save}{
34 | Whether to save the forests created (in a file called "All_forests.R"). This argument must be set to 1 if unsupervised OncoSig is to be used.
35 | }
36 | }
37 | \details{
38 |
39 | }
40 | \value{
41 | Returns a dataframe corresponding to predictions from the Monte-Carlo cross-validation. Higher scores in the first column correspond to higher confidence predictions to be part of the oncogene centric map
42 | }
43 | \references{
44 | %% ~put references to the literature/web site here ~
45 | }
46 | \author{
47 | %% ~~who you are~~
48 | }
49 | \note{
50 | %% ~~further notes~~
51 | }
52 |
53 | %% ~Make other sections like Warning with \section{Warning }{....} ~
54 |
55 | \seealso{
56 |
57 | }
58 | \examples{
59 | % Add one or more standard keywords, see file 'KEYWORDS' in the
60 | % R documentation directory.
61 | library (randomForest)
62 | #Process the network
63 | Network_location="Input_data_files/LUAD/original_network.txt"
64 | Network=read.delim(Network_location,header=F)
65 | Network$V1=as.character(Network$V1)
66 | Network$V2=as.character(Network$V2)
67 | Network$V3=as.numeric(Network$V3)
68 | Network=as.matrix(Network)
69 | Gold_Standard_location="Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt"
70 | Gold_Standard=read.delim(Gold_Standard_location,header=F)
71 | Gold_Standard$V1=as.character(Gold_Standard$V1)
72 |
73 | Network[,3]=as.numeric(Network[,3])
74 | Network_matrix=listToMatrix(Network)
75 |
76 |
77 | #Convert Matrix to Dataframe for future steps
78 | Network_matrix_df=as.data.frame(Network_matrix)
79 | #Remove Members of Gold Standard Not in the Network:
80 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df),Gold_Standard$V1)
81 | Negative_Set_names=setdiff(rownames(Network_matrix_df),Gold_Standard_in_Network_names)
82 | #Create Negative_standard
83 | #save(Network_matrix,file="Network_matrix.Rda")
84 | remove(Network_matrix)
85 |
86 | Query_output_results=OncoSigRF(Network_matrix_df,Gold_Standard_in_Network_names,max_iterations = 5)
87 | Query_output_results_scores=Query_output_results[[1]]
88 | }
89 | \keyword{ Random Forest }% use one of RShowDoc("KEYWORDS")
90 | \keyword{ OncoSig }% __ONLY ONE__ keyword per line
91 | \keyword{ Monte Carlo }% __ONLY ONE__ keyword per line
92 |
--------------------------------------------------------------------------------
/man/OncoSigUnsup.Rd:
--------------------------------------------------------------------------------
1 | \name{OncoSigUnsup}
2 | \alias{OncoSigUnsup}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Generates Predictions given a precomputed OncoSig Classifier.
6 | }
7 | \description{
8 | Given an OncoSig Random Forest classifier that is generated for an arbitrary Oncogene-Centric Map, generates predictions for a new Oncogene/Tumor Suppressor Y, whose features are specified in the Network location. This function should be used for Oncogenes/Tumor Suppressors that do not have a gold standard for supervised learning.
9 | }
10 | \usage{
11 | OncoSigUnsup(Network_location, forest_location)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{Network_location}{
16 | This is the file that specifies the features for the Oncogene/Tumor Suppressor. The feature names must be the same as the features used to generate the classifier, or an error will be reported. The format of the file is the same as passed to the function listToMatrix().
17 | }
18 | \item{forest_location}{
19 | The location of classifier, generated by OncoSigRF
20 | }
21 | }
22 | \details{
23 | This function takes in one network location and one forest location
24 | }
25 | \value{
26 | %% ~Describe the value returned
27 | %% If it is a LIST, use
28 | %% \item{comp1 }{Description of 'comp1'}
29 | %% \item{comp2 }{Description of 'comp2'}
30 | %% ...
31 | }
32 | \references{
33 | %% ~put references to the literature/web site here ~
34 | }
35 | \author{
36 | %% ~~who you are~~
37 | }
38 | \note{
39 | %% ~~further notes~~
40 | }
41 |
42 | %% ~Make other sections like Warning with \section{Warning }{....} ~
43 |
44 | \seealso{
45 | %% ~~objects to See Also as \code{\link{help}}, ~~~
46 | }
47 | \examples{
48 | #Predict KRAS Oncogene-centric map based on EGFR forest
49 | KRAS_features="Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt"
50 | EGFR_forest="Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r"
51 | results=OncoSigUnsup(KRAS_features,EGFR_forest)
52 | }
53 | % Add one or more standard keywords, see file 'KEYWORDS' in the
54 | % R documentation directory.
55 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
56 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
57 |
--------------------------------------------------------------------------------
/man/R/OncoSig.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/man/R/OncoSig.R
--------------------------------------------------------------------------------
/man/R/OncoSigNB.Rd:
--------------------------------------------------------------------------------
1 | un the OncoSig Naive Bayes Classifier
2 | \title{OncoSig Naive Bayes}
3 | \description{
4 | Run the OncoSig Naive Bayes Classifier
5 | \code{save}.
6 | }
7 | \usage{
8 | OncoSigNB(training_set, testing_set, the_bins, correlated_features)
9 | }
10 | \arguments{
11 | \item{training_set}{a dataframe containing the training set}
12 | \item{testing_set}{a dataframe of the testing set}
13 | \item{the_bins}{a list of list of the binning parameters}
14 | \item{correlated_features}{a list of correlated features that are statistically dependent. Pass empty list if none}
15 | }
16 | \seealso{
17 | \code{\link{save}}.
18 | }
19 | \examples{
20 | ## set bins;get correlated features
21 | df_1=read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
22 | df_2=read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE)
23 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),c(0,0.01,0.05))
24 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2,the_bins=the_bins,correlated_features =list())
25 | }
26 | \keyword{file}
27 |
--------------------------------------------------------------------------------
/man/computeLRsgivenBins.Rd:
--------------------------------------------------------------------------------
1 | \name{computeLRsgivenBins}
2 | \alias{computeLRsgivenBins}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Compute Likelihood Ratios (LR) for testing set
6 | }
7 | \description{
8 | This is a helper function for OncoSigNB. After training on a training set, this function computes LRs on a new testing set. Note that labels must be provided for the testing set as well.
9 | }
10 | \usage{
11 | computeLRsgivenBins(df_1, the_bins, the_bins_info)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{df_1}{
16 | %% ~~Describe \code{df_1} here~~
17 | }
18 | \item{the_bins}{
19 | %% ~~Describe \code{the_bins} here~~
20 | }
21 | \item{the_bins_info}{
22 | %% ~~Describe \code{the_bins_info} here~~
23 | }
24 | }
25 | \details{
26 | %% ~~ If necessary, more details than the description above ~~
27 | }
28 | \value{
29 | %% ~Describe the value returned
30 | %% If it is a LIST, use
31 | %% \item{comp1 }{Description of 'comp1'}
32 | %% \item{comp2 }{Description of 'comp2'}
33 | %% ...
34 | }
35 | \references{
36 | %% ~put references to the literature/web site here ~
37 | }
38 | \author{
39 | %% ~~who you are~~
40 | }
41 | \note{
42 | %% ~~further notes~~
43 | }
44 |
45 | %% ~Make other sections like Warning with \section{Warning }{....} ~
46 |
47 | \seealso{
48 | %% ~~objects to See Also as \code{\link{help}}, ~~~
49 | }
50 | \examples{
51 | ##---- Should be DIRECTLY executable !! ----
52 | ##-- ==> Define data, use random,
53 | ##-- or do help(data=index) for the standard data sets.
54 |
55 | ## The function is currently defined as
56 | function (df_1, the_bins, the_bins_info)
57 | {
58 | the_min = -1e+45
59 | the_max = 1e+43
60 | df_1[is.na(df_1)] <- -1e+23
61 | df_1_copy = df_1
62 | for (i in 3:ncol(df_1)) {
63 | df_1_copy[, i] = .bincode(df_1_copy[, i], c(the_min,
64 | the_bins[i - 2][[1]], the_max), right = FALSE)
65 | }
66 | for (i in 3:ncol(df_1)) {
67 | df_1_copy[, i] = replaceBinswithLR(df_1_copy[, i], the_bins_info[i -
68 | 2][[1]])
69 | }
70 | return(df_1_copy)
71 | }
72 | }
73 | % Add one or more standard keywords, see file 'KEYWORDS' in the
74 | % R documentation directory.
75 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
76 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
77 |
--------------------------------------------------------------------------------
/man/getFinalLR.Rd:
--------------------------------------------------------------------------------
1 | \name{getFinalLR}
2 | \alias{getFinalLR}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Multiply LRs to get final LR
6 | }
7 | \description{
8 | This is a helper function for OncoSigNB(). This function gets the final LR given the input dataframe. The first two columns are the name and the response variable; columns 3 and up are feature values.
9 | }
10 | \usage{
11 | getFinalLR(df_1)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{df_1}{
16 | %% ~~Describe \code{df_1} here~~
17 | }
18 | }
19 | \details{
20 | %% ~~ If necessary, more details than the description above ~~
21 | }
22 | \value{
23 | %% ~Describe the value returned
24 | %% If it is a LIST, use
25 | %% \item{comp1 }{Description of 'comp1'}
26 | %% \item{comp2 }{Description of 'comp2'}
27 | %% ...
28 | }
29 | \references{
30 | %% ~put references to the literature/web site here ~
31 | }
32 | \author{
33 | %% ~~who you are~~
34 | }
35 | \note{
36 | %% ~~further notes~~
37 | }
38 |
39 | %% ~Make other sections like Warning with \section{Warning }{....} ~
40 |
41 | \seealso{
42 | %% ~~objects to See Also as \code{\link{help}}, ~~~
43 | }
44 | \examples{
45 | ##---- Should be DIRECTLY executable !! ----
46 | ##-- ==> Define data, use random,
47 | ##-- or do help(data=index) for the standard data sets.
48 |
49 | ## The function is currently defined as
50 | function (df_1)
51 | {
52 | to_return = lapply(1:nrow(df_1), function(x) {
53 | prod(df_1[x, 3:ncol(df_1)])
54 | })
55 | return(to_return)
56 | }
57 | }
58 | % Add one or more standard keywords, see file 'KEYWORDS' in the
59 | % R documentation directory.
60 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
61 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
62 |
--------------------------------------------------------------------------------
/man/getLRsgivenBin_info.Rd:
--------------------------------------------------------------------------------
1 | \name{getLRsgivenBin_info}
2 | \alias{getLRsgivenBin_info}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Return vector of Likelihood Ratios for each bin.
6 | }
7 | \description{
8 | This is a helper function for OncoSigNB. Given a bin vector and the gold standard vector (i.e. the two vectors of the same length), return the Likelihood Ratio vector.
9 | }
10 | \usage{
11 | getLRsgivenBin_info(bin_vector, the_bin, label_vector)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{bin_vector}{
16 | %% ~~Describe \code{bin_vector} here~~
17 | }
18 | \item{the_bin}{
19 | %% ~~Describe \code{the_bin} here~~
20 | }
21 | \item{label_vector}{
22 | %% ~~Describe \code{label_vector} here~~
23 | }
24 | }
25 | \details{
26 | %% ~~ If necessary, more details than the description above ~~
27 | }
28 | \value{
29 | %% ~Describe the value returned
30 | %% If it is a LIST, use
31 | %% \item{comp1 }{Description of 'comp1'}
32 | %% \item{comp2 }{Description of 'comp2'}
33 | %% ...
34 | }
35 | \references{
36 | %% ~put references to the literature/web site here ~
37 | }
38 | \author{
39 | %% ~~who you are~~
40 | }
41 | \note{
42 | %% ~~further notes~~
43 | }
44 |
45 | %% ~Make other sections like Warning with \section{Warning }{....} ~
46 |
47 | \seealso{
48 | %% ~~objects to See Also as \code{\link{help}}, ~~~
49 | }
50 | \examples{
51 | ##---- Should be DIRECTLY executable !! ----
52 | ##-- ==> Define data, use random,
53 | ##-- or do help(data=index) for the standard data sets.
54 |
55 | ## The function is currently defined as
56 | function (bin_vector, the_bin, label_vector)
57 | {
58 | the_bins_new = the_bin
59 | bin_vector_new = bin_vector
60 | prior = table(label_vector)[2]/table(label_vector)[1]
61 | bin_vector_2 = unique(sort(bin_vector))
62 | bin_vector_3 = rep(0, length(bin_vector_2))
63 | for (i in 1:length(bin_vector_2)) {
64 | the_num = bin_vector_2[i]
65 | ratio_1 = table(label_vector[bin_vector == i])[2]/table(label_vector[bin_vector ==
66 | i])[1]
67 | LR = ratio_1/prior
68 | bin_vector_3[i] = LR
69 | }
70 | names(bin_vector_3) = bin_vector_2
71 | return(bin_vector_3)
72 | }
73 | }
74 | % Add one or more standard keywords, see file 'KEYWORDS' in the
75 | % R documentation directory.
76 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
77 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
78 |
--------------------------------------------------------------------------------
/man/getMaxLR.Rd:
--------------------------------------------------------------------------------
1 | \name{getMaxLR}
2 | \alias{getMaxLR}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Get maximum LR for correlated features
6 | }
7 | \description{
8 | This is a helper function for OncoSigNB. Given the specified columns as a data frame, return the maximum LR for each case. Note that these columns should be highly correlated.
9 | }
10 | \usage{
11 | getMaxLR(df_1)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{df_1}{
16 | %% ~~Describe \code{df_1} here~~
17 | }
18 | }
19 | \details{
20 | %% ~~ If necessary, more details than the description above ~~
21 | }
22 | \value{
23 | %% ~Describe the value returned
24 | %% If it is a LIST, use
25 | %% \item{comp1 }{Description of 'comp1'}
26 | %% \item{comp2 }{Description of 'comp2'}
27 | %% ...
28 | }
29 | \references{
30 | %% ~put references to the literature/web site here ~
31 | }
32 | \author{
33 | %% ~~who you are~~
34 | }
35 | \note{
36 | %% ~~further notes~~
37 | }
38 |
39 | %% ~Make other sections like Warning with \section{Warning }{....} ~
40 |
41 | \seealso{
42 | %% ~~objects to See Also as \code{\link{help}}, ~~~
43 | }
44 | \examples{
45 | ##---- Should be DIRECTLY executable !! ----
46 | ##-- ==> Define data, use random,
47 | ##-- or do help(data=index) for the standard data sets.
48 |
49 | ## The function is currently defined as
50 | function (df_1)
51 | {
52 | the_max_results = lapply(1:nrow(df_1), function(x) {
53 | max(df_1[x, ])
54 | })
55 | the_max_results = unlist(the_max_results)
56 | return(the_max_results)
57 | }
58 | }
59 | % Add one or more standard keywords, see file 'KEYWORDS' in the
60 | % R documentation directory.
61 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
62 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
63 |
--------------------------------------------------------------------------------
/man/listToMatrix.Rd:
--------------------------------------------------------------------------------
1 | \name{listToMatrix}
2 | \alias{listToMatrix}
3 | \title{
4 | Convert adjacency list to adjacency matrix
5 | }
6 | \description{
7 | Convert an adjacency list (as a data frame) to an adjacency matrix. The first column will become the columns,
8 | the second column the rows, and the third column the weights of the edges.
9 | }
10 |
11 | \usage{
12 |
13 | listToMatrix(df)
14 | }
15 | \arguments{
16 | \item{df}{
17 |
18 | }
19 | }
20 | \details{
21 | }
22 | \value{
23 | Returns an adjacency matrix as a matrix object.
24 | }
25 | \references{
26 | %% ~put references to the literature/web site here ~
27 | }
28 | \author{
29 | %% ~~who you are~~
30 | }
31 | \note{
32 | %% ~~further notes~~
33 | }
34 |
35 | %% ~Make other sections like Warning with \section{Warning }{....} ~
36 |
37 | \seealso{
38 | %% ~~objects to See Also as \code{\link{help}}, ~~~
39 | }
40 | \examples{
41 | Network_location="Input_data_files/LUAD/original_network.txt"
42 | Network=read.delim(Network_location,header=F)
43 | Network$V1=as.character(Network$V1)
44 | Network$V2=as.character(Network$V2)
45 | Network$V3=as.numeric(Network$V3)
46 | Network=as.matrix(Network)
47 | Gold_Standard_location="Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt"
48 | Gold_Standard=read.delim(Gold_Standard_location,header=F)
49 | Gold_Standard$V1=as.character(Gold_Standard$V1)
50 |
51 | Network[,3]=as.numeric(Network[,3])
52 | Network_matrix=listToMatrix(Network)
53 | }
54 | % Add one or more standard keywords, see file 'KEYWORDS' in the
55 | % R documentation directory.
56 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
57 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
58 |
--------------------------------------------------------------------------------
/man/replaceBinswithLR.Rd:
--------------------------------------------------------------------------------
1 | \name{replaceBinswithLR}
2 | \alias{replaceBinswithLR}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Replace bins with LRs
6 | }
7 | \description{
8 | This is a helper function for OncoSigNB. Given the bin info and the binned data, replace each bin with the corresponding Likelihood Ratio.
9 | }
10 | \usage{
11 | replaceBinswithLR(bin_vector, the_bin_info)
12 | }
13 | %- maybe also 'usage' for other objects documented here.
14 | \arguments{
15 | \item{bin_vector}{
16 | %% ~~Describe \code{bin_vector} here~~
17 | }
18 | \item{the_bin_info}{
19 | %% ~~Describe \code{the_bin_info} here~~
20 | }
21 | }
22 | \details{
23 | %% ~~ If necessary, more details than the description above ~~
24 | }
25 | \value{
26 | %% ~Describe the value returned
27 | %% If it is a LIST, use
28 | %% \item{comp1 }{Description of 'comp1'}
29 | %% \item{comp2 }{Description of 'comp2'}
30 | %% ...
31 | }
32 | \references{
33 | %% ~put references to the literature/web site here ~
34 | }
35 | \author{
36 | %% ~~who you are~~
37 | }
38 | \note{
39 | %% ~~further notes~~
40 | }
41 |
42 | %% ~Make other sections like Warning with \section{Warning }{....} ~
43 |
44 | \seealso{
45 | %% ~~objects to See Also as \code{\link{help}}, ~~~
46 | }
47 | \examples{
48 | ##---- Should be DIRECTLY executable !! ----
49 | ##-- ==> Define data, use random,
50 | ##-- or do help(data=index) for the standard data sets.
51 |
52 | ## The function is currently defined as
53 | function (bin_vector, the_bin_info)
54 | {
55 | new_bin_vector = lapply(1:len(bin_vector), function(x) {
56 | the_bin = as.character(bin_vector[x])
57 | bin_value = the_bin_info[the_bin]
58 | bin_value
59 | })
60 | new_bin_vector = unlist(new_bin_vector)
61 | return(new_bin_vector)
62 | }
63 | }
64 | % Add one or more standard keywords, see file 'KEYWORDS' in the
65 | % R documentation directory.
66 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
67 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
68 |
--------------------------------------------------------------------------------
/man/runNaiveBayesClassifier.Rd:
--------------------------------------------------------------------------------
1 | \name{runNaiveBayesClassifier}
2 | \alias{runNaiveBayesClassifier}
3 | \title{
4 | Run the OncoSig Naive Bayes Classifier as used in the accompanying paper.
5 | }
6 | \description{
7 | }
8 | \usage{
9 | runNaiveBayesClassifier()
10 | }
11 | \details{
12 |
13 | }
14 | \value{
15 | }
16 | \references{
17 | }
18 | \author{
19 | }
20 | \note{
21 | }
22 |
23 |
24 | \seealso{
25 | %% ~~objects to See Also as \code{\link{help}}, ~~~
26 | }
27 | \examples{
28 | runNaiveBayesClassifier()
29 | ## The function is currently defined as
30 | function ()
31 | {
32 | df_1 = read.delim("Input_data_files/Naive_Bayes_evidences_set_1.txt",
33 | header = TRUE)
34 | df_2 = read.delim("Input_data_files/Naive_Bayes_evidences_set_2.txt",
35 | header = TRUE)
36 | the_bins = list(c(0, 40, 200, 1200), c(0, 0.1), c(-2, -0.15,
37 | -0.02, 0.0925), c(1, 2, 6), c(0, 0.25), c(1, 3, 20),
38 | c(1, 4, 20), c(1, 4, 20), c(0, 1e-04, 0.9999), c(0, 0.01,
39 | 0.05))
40 | correlated_features = grep("MS_", colnames(df_1), value = TRUE)
41 | message("Calculating LR_posterior for fold two holdout set\n")
42 | the_results_set_1 = OncoSigNB(df_1, df_2, the_bins, correlated_features)
43 | message("Calculating LR_posterior for fold one holdout set\n")
44 | the_results_set_2 = OncoSigNB(df_2, df_1, the_bins, correlated_features)
45 | the_results_set_2_rank = cbind(the_results_set_2, rank(-the_results_set_2))
46 | the_results_set_1_rank = cbind(the_results_set_1, rank(-the_results_set_1))
47 | temp = rbind(the_results_set_1_rank, the_results_set_2_rank)
48 | temp = as.data.frame(temp)
49 | colnames(temp) = c("LR_post", "Rank")
50 | cross_validated_predictions = temp[order(temp$Rank), ]
51 | return(cross_validated_predictions)
52 | }
53 | }
54 | % Add one or more standard keywords, see file 'KEYWORDS' in the
55 | % R documentation directory.
56 | \keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
57 | \keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
58 |
--------------------------------------------------------------------------------
/vignettes/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/vignettes/.DS_Store
--------------------------------------------------------------------------------
/vignettes/OncoSig-concordance.tex:
--------------------------------------------------------------------------------
1 | \Sconcordance{concordance:OncoSig.tex:OncoSig.Rnw:%
2 | 1 6 1 1 0 3 1 1 4 11 1 1 2 4 0 1 2 4 1 1 2 1 0 1 1 1 3 2 0 1 2 %
3 | 4 0 1 2 3 1 1 2 1 0 1 1 17 0 1 2 4 1 1 2 4 0 2 2 1 0 7 1 3 0 2 %
4 | 2 12 0 1 2 1 1 1 3 2 0 2 1 3 0 1 2 1 1 1 3 2 0 1 3 2 0 1 2 1 0 %
5 | 1 1 3 0 1 2 1 3 2 0 1 1 3 0 1 2 2 1 1 3 2 0 1 2 1 0 1 1 3 0 1 %
6 | 2 2 1}
7 |
--------------------------------------------------------------------------------
/vignettes/OncoSig.Rnw:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \usepackage{hyperref}
3 | \usepackage{hyperref}
4 | %\VignetteIndexEntry{Using OncoSig}
5 | \title{Using the OncoSig Classifiers to Discover Novel Oncoprotein network Dependencies}
6 | \date{\today}
7 | \begin{document}
8 | \author{Joshua Broyde, Diana Murray, Barry Honig, Andrea Califano\\Columbia University, New York, USA}
9 | \SweaveOpts{concordance=TRUE}
10 | \maketitle
11 | <<>>=
12 | options(width=70)
13 | @
14 |
15 | \section*{Introduction}
16 |
17 | OncoSig comprises a set of machine learning approaches for determining novel sets of gene products (i.e. genes or proteins) that support
18 | the activity of an oncogene or tumor suppressor (i.e. Oncoprotein-Centric Map,
19 | or OC-map). This is relevant for determining which genes/proteins are involved in an oncoprotein's functional network. OncoSig queries a molecular interaction network or other features regarding a protein's function to predict novel members of the OC-map. This molecular interaction network could contain features such as protein-protein interactions, or gene regulatory networks.
20 | OncoSig can be used in two main ways: in a supervised or an unsupervised fashion. In the supervised fashion, OncoSig uses either a Naive Bayes or a Random Forest classifier to train on the molecular interaction network and a gold standard of known members of a particular OC-Map (for example, the members of the KRAS signaling pathway). This approach is appropriate for cases where some members of an OC-Map are known and one wants to leverage the known ones to predict other OC-Map members.
21 |
22 | In cases where a gold standard is not known, OncoSig can be used in an unsupervised fashion, where an OC-Map trained on a well-characterized Oncoprotein is applied to one that is poorly characterized. This usage is appropriate where there is no gold standard for a particular Oncoprotein.
23 |
24 | \section*{Installation and loading}
25 | After first installing R (\url{http://www.r-project.org}) and the OncoSig library, load OncoSig.
26 | <<>>=
27 | library("OncoSig")
28 | @
29 |
30 | \section*{OncoSig Naive Bayes Classifier}
31 | The OncoSig Naive Bayes (OncoSigNB) Classifier is a supervised learning approach that is well suited to discovering OC-Map members when there are only a few features describing each gene product and when the features have no or low statistical dependence. To run OncoSigNB, we create data frames that correspond to the training and testing sets (whose rows are labeled as 1 or 0, if they are in the gold standard OC-Map or not, respectively).
32 | For example:
33 |
34 | <<>>=
35 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
36 | df_2=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE)
37 | the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),
38 | c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),
39 | c(0,0.01,0.05))
40 | predictions=OncoSigNB(training_set = df_1,testing_set = df_2,
41 | the_bins=the_bins,correlated_features =list())
42 | @
43 |
44 | In this example, we specified the training and testing sets, how to bin the data for each feature, and passed an empty list to indicate that there are no correlated features.
45 |
46 | The input training and testing sets should be formatted like so.
47 | <<>>=
48 | df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
49 | df_1[1:5,]
50 | @
51 | Note that when a feature is missing, it is simply coded as NA. The first column is the name (e.g. a gene name), the second column is the label, and all later columns are features.
52 |
53 | \section*{OncoSigRF}
54 | Supervised classification can also be run using the OncoSig Random Forest (OncoSigRF) classifier. Random Forests can be used when the features are statistically dependent, and can more easily be used when the features number in the thousands or more. We recommend OncoSigRF when integrating a molecular interaction network that may contain many tens of thousands of interactions (note that when using a network of more than a few hundred thousand interactions, OncoSigRF will require a few hours and a prohibitive amount of memory to run).
55 |
56 | <<>>=
57 | library (randomForest)
58 | @
59 | First, we read in the molecular interaction network and convert it to a matrix.
60 | <<>>=
61 | Network_location="~/OncoSig/Input_data_files/LUAD/original_network_sample.txt"
62 | Network=read.delim(Network_location,header=F)
63 | Network$V1=as.character(Network$V1)
64 | Network$V2=as.character(Network$V2)
65 | Network$V3=as.numeric(Network$V3)
66 | Network=as.matrix(Network)
67 | Network[,3]=as.numeric(Network[,3])
68 | Network_matrix=listToMatrix(Network)
69 | @
70 | Note the format of the input network. The first column is the feature name, the second column is the gene product (i.e. the row of data), and the third column is the score.
71 | <<>>=
72 | Network[1:5,]
73 | @
74 |
75 | Now read in and process the gold standard
76 | <<>>=
77 | Gold_Standard_location=
78 | "~/OncoSig/Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt"
79 | Gold_Standard=read.delim(Gold_Standard_location,header=F)
80 | Gold_Standard$V1=as.character(Gold_Standard$V1)
81 | @
82 |
83 | Preprocess the data.
84 | <<>>=
85 | #Convert Matrix to Dataframe for future steps
86 | Network_matrix_df=as.data.frame(Network_matrix)
87 | #Remove Members of Gold Standard Not in the Network:
88 | Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df),
89 | Gold_Standard$V1)
90 | Negative_Set_names=setdiff(rownames(Network_matrix_df),
91 | Gold_Standard_in_Network_names)
92 | remove(Network_matrix) #delete Matrix.
93 | @
94 | Run the OncoSigRF Classifier
95 | <<>>=
96 | Query_output_results=OncoSigRF(Network_matrix_df,
97 | Gold_Standard_in_Network_names, max_iterations=5)
98 | Query_output_results_scores=as.data.frame(Query_output_results[[1]])
99 | @
100 |
101 | \section*{Unsupervised OncoSig}
102 | Supervised classification is only applicable when a gold standard suitable for training can be found. However, some Oncogenes/Tumor Suppressors may have no known gene product dependencies. In this case, we can take a forest created specifically using one Oncogene/Tumor Suppressor and apply it to another. See the documentation for the OncoSigUnsup function for further details. In this example, we read in a random forest created using features for the EGFR oncogene and apply it to the KRAS oncogene.
103 | <<>>=
104 | KRAS_features=
105 | "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt"
106 | EGFR_forest=
107 | "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r"
108 | results=OncoSigUnsup(KRAS_features,EGFR_forest)
109 | @
110 |
111 |
112 | \end{document}
--------------------------------------------------------------------------------
/vignettes/OncoSig.log:
--------------------------------------------------------------------------------
1 | This is pdfTeX, Version 3.14159265-2.6-1.40.17 (TeX Live 2016) (preloaded format=pdflatex 2016.12.27) 13 DEC 2018 04:52
2 | entering extended mode
3 | restricted \write18 enabled.
4 | %&-line parsing enabled.
5 | **OncoSig.tex
6 | (./OncoSig.tex
7 | LaTeX2e <2016/03/31>
8 | Babel <3.9r> and hyphenation patterns for 22 language(s) loaded.
9 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/article.cls
10 | Document Class: article 2014/09/29 v1.4h Standard LaTeX document class
11 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/size10.clo
12 | File: size10.clo 2014/09/29 v1.4h Standard LaTeX file (size option)
13 | )
14 | \c@part=\count79
15 | \c@section=\count80
16 | \c@subsection=\count81
17 | \c@subsubsection=\count82
18 | \c@paragraph=\count83
19 | \c@subparagraph=\count84
20 | \c@figure=\count85
21 | \c@table=\count86
22 | \abovecaptionskip=\skip41
23 | \belowcaptionskip=\skip42
24 | \bibindent=\dimen102
25 | )
26 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/hyperref.sty
27 | Package: hyperref 2016/05/05 v6.83n Hypertext links for LaTeX
28 |
29 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/oberdiek/hobsub-hyperref.s
30 | ty
31 | Package: hobsub-hyperref 2016/05/16 v1.14 Bundle oberdiek, subset hyperref (HO)
32 |
33 |
34 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/oberdiek/hobsub-generic.st
35 | y
36 | Package: hobsub-generic 2016/05/16 v1.14 Bundle oberdiek, subset generic (HO)
37 | Package: hobsub 2016/05/16 v1.14 Construct package bundles (HO)
38 | Package: infwarerr 2016/05/16 v1.4 Providing info/warning/error messages (HO)
39 | Package: ltxcmds 2016/05/16 v1.23 LaTeX kernel commands for general use (HO)
40 | Package: ifluatex 2016/05/16 v1.4 Provides the ifluatex switch (HO)
41 | Package ifluatex Info: LuaTeX not detected.
42 | Package: ifvtex 2016/05/16 v1.6 Detect VTeX and its facilities (HO)
43 | Package ifvtex Info: VTeX not detected.
44 | Package: intcalc 2016/05/16 v1.2 Expandable calculations with integers (HO)
45 | Package: ifpdf 2016/05/14 v3.1 Provides the ifpdf switch
46 | Package: etexcmds 2016/05/16 v1.6 Avoid name clashes with e-TeX commands (HO)
47 | Package etexcmds Info: Could not find \expanded.
48 | (etexcmds) That can mean that you are not using pdfTeX 1.50 or
49 | (etexcmds) that some package has redefined \expanded.
50 | (etexcmds) In the latter case, load this package earlier.
51 | Package: kvsetkeys 2016/05/16 v1.17 Key value parser (HO)
52 | Package: kvdefinekeys 2016/05/16 v1.4 Define keys (HO)
53 | Package: pdftexcmds 2016/05/10 v0.21 Utility functions of pdfTeX for LuaTeX (HO
54 | )
55 | Package pdftexcmds Info: LuaTeX not detected.
56 | Package pdftexcmds Info: \pdf@primitive is available.
57 | Package pdftexcmds Info: \pdf@ifprimitive is available.
58 | Package pdftexcmds Info: \pdfdraftmode found.
59 | Package: pdfescape 2016/05/16 v1.14 Implements pdfTeX's escape features (HO)
60 | Package: bigintcalc 2016/05/16 v1.4 Expandable calculations on big integers (HO
61 | )
62 | Package: bitset 2016/05/16 v1.2 Handle bit-vector datatype (HO)
63 | Package: uniquecounter 2016/05/16 v1.3 Provide unlimited unique counter (HO)
64 | )
65 | Package hobsub Info: Skipping package `hobsub' (already loaded).
66 | Package: letltxmacro 2016/05/16 v1.5 Let assignment for LaTeX macros (HO)
67 | Package: hopatch 2016/05/16 v1.3 Wrapper for package hooks (HO)
68 | Package: xcolor-patch 2016/05/16 xcolor patch
69 | Package: atveryend 2016/05/16 v1.9 Hooks at the very end of document (HO)
70 | Package atveryend Info: \enddocument detected (standard20110627).
71 | Package: atbegshi 2016/05/16 v1.17 At begin shipout hook (HO)
72 | Package: refcount 2016/05/16 v3.5 Data extraction from label references (HO)
73 | Package: hycolor 2016/05/16 v1.8 Color options for hyperref/bookmark (HO)
74 | ) (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/keyval.sty
75 | Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
76 | \KV@toks@=\toks14
77 | )
78 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/ifxetex/ifxetex.sty
79 | Package: ifxetex 2010/09/12 v0.6 Provides ifxetex conditional
80 | )
81 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/auxhook.sty
82 | Package: auxhook 2016/05/16 v1.4 Hooks for auxiliary files (HO)
83 | )
84 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/kvoptions.sty
85 | Package: kvoptions 2016/05/16 v3.12 Key value format for package options (HO)
86 | )
87 | \@linkdim=\dimen103
88 | \Hy@linkcounter=\count87
89 | \Hy@pagecounter=\count88
90 |
91 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/pd1enc.def
92 | File: pd1enc.def 2016/05/05 v6.83n Hyperref: PDFDocEncoding definition (HO)
93 | )
94 | \Hy@SavedSpaceFactor=\count89
95 |
96 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/latexconfig/hyperref.cfg
97 | File: hyperref.cfg 2002/06/06 v1.2 hyperref configuration of TeXLive
98 | )
99 | Package hyperref Info: Hyper figures OFF on input line 4446.
100 | Package hyperref Info: Link nesting OFF on input line 4451.
101 | Package hyperref Info: Hyper index ON on input line 4454.
102 | Package hyperref Info: Plain pages OFF on input line 4461.
103 | Package hyperref Info: Backreferencing OFF on input line 4466.
104 | Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
105 | Package hyperref Info: Bookmarks ON on input line 4691.
106 | \c@Hy@tempcnt=\count90
107 |
108 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/url/url.sty
109 | \Urlmuskip=\muskip10
110 | Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
111 | )
112 | LaTeX Info: Redefining \url on input line 5044.
113 | \XeTeXLinkMargin=\dimen104
114 | \Fld@menulength=\count91
115 | \Field@Width=\dimen105
116 | \Fld@charsize=\dimen106
117 | Package hyperref Info: Hyper figures OFF on input line 6298.
118 | Package hyperref Info: Link nesting OFF on input line 6303.
119 | Package hyperref Info: Hyper index ON on input line 6306.
120 | Package hyperref Info: backreferencing OFF on input line 6313.
121 | Package hyperref Info: Link coloring OFF on input line 6318.
122 | Package hyperref Info: Link coloring with OCG OFF on input line 6323.
123 | Package hyperref Info: PDF/A mode OFF on input line 6328.
124 | LaTeX Info: Redefining \ref on input line 6368.
125 | LaTeX Info: Redefining \pageref on input line 6372.
126 | \Hy@abspage=\count92
127 | \c@Item=\count93
128 | \c@Hfootnote=\count94
129 | )
130 |
131 | Package hyperref Message: Driver (autodetected): hpdftex.
132 |
133 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/hpdftex.def
134 | File: hpdftex.def 2016/05/05 v6.83n Hyperref driver for pdfTeX
135 | \Fld@listcount=\count95
136 | \c@bookmark@seq@number=\count96
137 |
138 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/rerunfilecheck.sty
139 | Package: rerunfilecheck 2016/05/16 v1.8 Rerun checks for auxiliary files (HO)
140 | Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2
141 | 82.
142 | )
143 | \Hy@SectionHShift=\skip43
144 | )
145 | (/usr/local/Cellar/r/3.4.0_1/R.framework/Resources/share/texmf/tex/latex/Sweave
146 | .sty
147 | Package: Sweave
148 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/ifthen.sty
149 | Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC)
150 | )
151 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/graphicx.sty
152 | Package: graphicx 2014/10/28 v1.0g Enhanced LaTeX Graphics (DPC,SPQR)
153 |
154 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/graphics.sty
155 | Package: graphics 2016/05/09 v1.0r Standard LaTeX Graphics (DPC,SPQR)
156 |
157 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics/trig.sty
158 | Package: trig 2016/01/03 v1.10 sin cos tan (DPC)
159 | )
160 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/graphics-cfg/graphics.cfg
161 | File: graphics.cfg 2016/01/02 v1.10 sample graphics configuration
162 | )
163 | Package graphics Info: Driver file: pdftex.def on input line 96.
164 |
165 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/pdftex-def/pdftex.def
166 | File: pdftex.def 2011/05/27 v0.06d Graphics/color for pdfTeX
167 | \Gread@gobject=\count97
168 | ))
169 | \Gin@req@height=\dimen107
170 | \Gin@req@width=\dimen108
171 | )
172 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/fancyvrb/fancyvrb.sty
173 | Package: fancyvrb 2008/02/07
174 |
175 | Style option: `fancyvrb' v2.7a, with DG/SPQR fixes, and firstline=lastline fix
176 | <2008/02/07> (tvz)
177 | \FV@CodeLineNo=\count98
178 | \FV@InFile=\read1
179 | \FV@TabBox=\box26
180 | \c@FancyVerbLine=\count99
181 | \FV@StepNumber=\count100
182 | \FV@OutFile=\write3
183 | )
184 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/textcomp.sty
185 | Package: textcomp 2005/09/27 v1.99g Standard LaTeX package
186 | Package textcomp Info: Sub-encoding information:
187 | (textcomp) 5 = only ISO-Adobe without \textcurrency
188 | (textcomp) 4 = 5 + \texteuro
189 | (textcomp) 3 = 4 + \textohm
190 | (textcomp) 2 = 3 + \textestimated + \textcurrency
191 | (textcomp) 1 = TS1 - \textcircled - \t
192 | (textcomp) 0 = TS1 (full)
193 | (textcomp) Font families with sub-encoding setting implement
194 | (textcomp) only a restricted character set as indicated.
195 | (textcomp) Family '?' is the default used for unknown fonts.
196 | (textcomp) See the documentation for details.
197 | Package textcomp Info: Setting ? sub-encoding to TS1/1 on input line 79.
198 |
199 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/ts1enc.def
200 | File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file
201 | )
202 | LaTeX Info: Redefining \oldstylenums on input line 334.
203 | Package textcomp Info: Setting cmr sub-encoding to TS1/0 on input line 349.
204 | Package textcomp Info: Setting cmss sub-encoding to TS1/0 on input line 350.
205 | Package textcomp Info: Setting cmtt sub-encoding to TS1/0 on input line 351.
206 | Package textcomp Info: Setting cmvtt sub-encoding to TS1/0 on input line 352.
207 | Package textcomp Info: Setting cmbr sub-encoding to TS1/0 on input line 353.
208 | Package textcomp Info: Setting cmtl sub-encoding to TS1/0 on input line 354.
209 | Package textcomp Info: Setting ccr sub-encoding to TS1/0 on input line 355.
210 | Package textcomp Info: Setting ptm sub-encoding to TS1/4 on input line 356.
211 | Package textcomp Info: Setting pcr sub-encoding to TS1/4 on input line 357.
212 | Package textcomp Info: Setting phv sub-encoding to TS1/4 on input line 358.
213 | Package textcomp Info: Setting ppl sub-encoding to TS1/3 on input line 359.
214 | Package textcomp Info: Setting pag sub-encoding to TS1/4 on input line 360.
215 | Package textcomp Info: Setting pbk sub-encoding to TS1/4 on input line 361.
216 | Package textcomp Info: Setting pnc sub-encoding to TS1/4 on input line 362.
217 | Package textcomp Info: Setting pzc sub-encoding to TS1/4 on input line 363.
218 | Package textcomp Info: Setting bch sub-encoding to TS1/4 on input line 364.
219 | Package textcomp Info: Setting put sub-encoding to TS1/5 on input line 365.
220 | Package textcomp Info: Setting uag sub-encoding to TS1/5 on input line 366.
221 | Package textcomp Info: Setting ugq sub-encoding to TS1/5 on input line 367.
222 | Package textcomp Info: Setting ul8 sub-encoding to TS1/4 on input line 368.
223 | Package textcomp Info: Setting ul9 sub-encoding to TS1/4 on input line 369.
224 | Package textcomp Info: Setting augie sub-encoding to TS1/5 on input line 370.
225 | Package textcomp Info: Setting dayrom sub-encoding to TS1/3 on input line 371.
226 | Package textcomp Info: Setting dayroms sub-encoding to TS1/3 on input line 372.
227 |
228 | Package textcomp Info: Setting pxr sub-encoding to TS1/0 on input line 373.
229 | Package textcomp Info: Setting pxss sub-encoding to TS1/0 on input line 374.
230 | Package textcomp Info: Setting pxtt sub-encoding to TS1/0 on input line 375.
231 | Package textcomp Info: Setting txr sub-encoding to TS1/0 on input line 376.
232 | Package textcomp Info: Setting txss sub-encoding to TS1/0 on input line 377.
233 | Package textcomp Info: Setting txtt sub-encoding to TS1/0 on input line 378.
234 | Package textcomp Info: Setting lmr sub-encoding to TS1/0 on input line 379.
235 | Package textcomp Info: Setting lmdh sub-encoding to TS1/0 on input line 380.
236 | Package textcomp Info: Setting lmss sub-encoding to TS1/0 on input line 381.
237 | Package textcomp Info: Setting lmssq sub-encoding to TS1/0 on input line 382.
238 | Package textcomp Info: Setting lmvtt sub-encoding to TS1/0 on input line 383.
239 | Package textcomp Info: Setting lmtt sub-encoding to TS1/0 on input line 384.
240 | Package textcomp Info: Setting qhv sub-encoding to TS1/0 on input line 385.
241 | Package textcomp Info: Setting qag sub-encoding to TS1/0 on input line 386.
242 | Package textcomp Info: Setting qbk sub-encoding to TS1/0 on input line 387.
243 | Package textcomp Info: Setting qcr sub-encoding to TS1/0 on input line 388.
244 | Package textcomp Info: Setting qcs sub-encoding to TS1/0 on input line 389.
245 | Package textcomp Info: Setting qpl sub-encoding to TS1/0 on input line 390.
246 | Package textcomp Info: Setting qtm sub-encoding to TS1/0 on input line 391.
247 | Package textcomp Info: Setting qzc sub-encoding to TS1/0 on input line 392.
248 | Package textcomp Info: Setting qhvc sub-encoding to TS1/0 on input line 393.
249 | Package textcomp Info: Setting futs sub-encoding to TS1/4 on input line 394.
250 | Package textcomp Info: Setting futx sub-encoding to TS1/4 on input line 395.
251 | Package textcomp Info: Setting futj sub-encoding to TS1/4 on input line 396.
252 | Package textcomp Info: Setting hlh sub-encoding to TS1/3 on input line 397.
253 | Package textcomp Info: Setting hls sub-encoding to TS1/3 on input line 398.
254 | Package textcomp Info: Setting hlst sub-encoding to TS1/3 on input line 399.
255 | Package textcomp Info: Setting hlct sub-encoding to TS1/5 on input line 400.
256 | Package textcomp Info: Setting hlx sub-encoding to TS1/5 on input line 401.
257 | Package textcomp Info: Setting hlce sub-encoding to TS1/5 on input line 402.
258 | Package textcomp Info: Setting hlcn sub-encoding to TS1/5 on input line 403.
259 | Package textcomp Info: Setting hlcw sub-encoding to TS1/5 on input line 404.
260 | Package textcomp Info: Setting hlcf sub-encoding to TS1/5 on input line 405.
261 | Package textcomp Info: Setting pplx sub-encoding to TS1/3 on input line 406.
262 | Package textcomp Info: Setting pplj sub-encoding to TS1/3 on input line 407.
263 | Package textcomp Info: Setting ptmx sub-encoding to TS1/4 on input line 408.
264 | Package textcomp Info: Setting ptmj sub-encoding to TS1/4 on input line 409.
265 | )
266 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/fontenc.sty
267 | Package: fontenc 2005/09/27 v1.99g Standard LaTeX package
268 |
269 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/t1enc.def
270 | File: t1enc.def 2005/09/27 v1.99g Standard LaTeX file
271 | LaTeX Font Info: Redeclaring font encoding T1 on input line 48.
272 | ))
273 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/ae/ae.sty
274 | Package: ae 2001/02/12 1.3 Almost European Computer Modern
275 |
276 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/fontenc.sty
277 | Package: fontenc 2005/09/27 v1.99g Standard LaTeX package
278 |
279 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/t1enc.def
280 | File: t1enc.def 2005/09/27 v1.99g Standard LaTeX file
281 | LaTeX Font Info: Redeclaring font encoding T1 on input line 48.
282 | )
283 | LaTeX Font Info: Try loading font information for T1+aer on input line 105.
284 |
285 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/ae/t1aer.fd
286 | File: t1aer.fd 1997/11/16 Font definitions for T1/aer.
287 | ))))
288 | (./OncoSig.aux)
289 | \openout1 = `OncoSig.aux'.
290 |
291 | LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 8.
292 | LaTeX Font Info: ... okay on input line 8.
293 | LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 8.
294 | LaTeX Font Info: ... okay on input line 8.
295 | LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 8.
296 | LaTeX Font Info: ... okay on input line 8.
297 | LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 8.
298 | LaTeX Font Info: ... okay on input line 8.
299 | LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 8.
300 | LaTeX Font Info: ... okay on input line 8.
301 | LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 8.
302 | LaTeX Font Info: ... okay on input line 8.
303 | LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 8.
304 | LaTeX Font Info: ... okay on input line 8.
305 | LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 8.
306 | LaTeX Font Info: Try loading font information for TS1+cmr on input line 8.
307 |
308 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/base/ts1cmr.fd
309 | File: ts1cmr.fd 2014/09/29 v2.5h Standard LaTeX font definitions
310 | )
311 | LaTeX Font Info: ... okay on input line 8.
312 | \AtBeginShipoutBox=\box27
313 | Package hyperref Info: Link coloring OFF on input line 8.
314 |
315 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/hyperref/nameref.sty
316 | Package: nameref 2012/10/27 v2.43 Cross-referencing by name of section
317 |
318 | (/usr/local/texlive/2016basic/texmf-dist/tex/generic/oberdiek/gettitlestring.st
319 | y
320 | Package: gettitlestring 2016/05/16 v1.5 Cleanup title references (HO)
321 | )
322 | \c@section@level=\count101
323 | )
324 | LaTeX Info: Redefining \ref on input line 8.
325 | LaTeX Info: Redefining \pageref on input line 8.
326 | LaTeX Info: Redefining \nameref on input line 8.
327 | (./OncoSig.out) (./OncoSig.out)
328 | \@outlinefile=\write4
329 | \openout4 = `OncoSig.out'.
330 |
331 |
332 | (/usr/local/texlive/2016basic/texmf-dist/tex/context/base/mkii/supp-pdf.mkii
333 | [Loading MPS to PDF converter (version 2006.09.02).]
334 | \scratchcounter=\count102
335 | \scratchdimen=\dimen109
336 | \scratchbox=\box28
337 | \nofMPsegments=\count103
338 | \nofMParguments=\count104
339 | \everyMPshowfont=\toks15
340 | \MPscratchCnt=\count105
341 | \MPscratchDim=\dimen110
342 | \MPnumerator=\count106
343 | \makeMPintoPDFobject=\count107
344 | \everyMPtoPDFconversion=\toks16
345 | ) (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/epstopdf-base.sty
346 | Package: epstopdf-base 2016/05/15 v2.6 Base part for package epstopdf
347 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/oberdiek/grfext.sty
348 | Package: grfext 2016/05/16 v1.2 Manage graphics extensions (HO)
349 | )
350 | Package grfext Info: Graphics extension search list:
351 | (grfext) [.png,.pdf,.jpg,.mps,.jpeg,.jbig2,.jb2,.PNG,.PDF,.JPG,.JPE
352 | G,.JBIG2,.JB2,.eps]
353 | (grfext) \AppendGraphicsExtensions on input line 456.
354 |
355 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg
356 | File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv
357 | e
358 | )) (./OncoSig-concordance.tex)
359 | LaTeX Font Info: External font `cmex10' loaded for size
360 | (Font) <12> on input line 12.
361 | LaTeX Font Info: External font `cmex10' loaded for size
362 | (Font) <8> on input line 12.
363 | LaTeX Font Info: External font `cmex10' loaded for size
364 | (Font) <6> on input line 12.
365 | LaTeX Font Info: Try loading font information for T1+aett on input line 25.
366 |
367 | (/usr/local/texlive/2016basic/texmf-dist/tex/latex/ae/t1aett.fd
368 | File: t1aett.fd 1997/11/16 Font definitions for T1/aett.
369 | ) [1
370 |
371 | {/usr/local/texlive/2016basic/texmf-var/fonts/map/pdftex/updmap/pdftex.map}] [2
372 | ] [3]
373 | Package atveryend Info: Empty hook `BeforeClearDocument' on input line 152.
374 |
375 | [4]
376 | Package atveryend Info: Empty hook `AfterLastShipout' on input line 152.
377 | (./OncoSig.aux)
378 | Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 152.
379 | Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 152.
380 | Package rerunfilecheck Info: File `OncoSig.out' has not changed.
381 | (rerunfilecheck) Checksum: D41D8CD98F00B204E9800998ECF8427E;0.
382 | Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 152.
383 | )
384 | Here is how much of TeX's memory you used:
385 | 6210 strings out of 494447
386 | 92542 string characters out of 6166765
387 | 166594 words of memory out of 5000000
388 | 9465 multiletter control sequences out of 15000+600000
389 | 20787 words of font info for 49 fonts, out of 8000000 for 9000
390 | 319 hyphenation exceptions out of 8191
391 | 35i,6n,23p,549b,472s stack positions out of 5000i,500n,10000p,200000b,80000s
392 |
399 | Output written on OncoSig.pdf (4 pages, 103165 bytes).
400 | PDF statistics:
401 | 58 PDF objects out of 1000 (max. 8388607)
402 | 44 compressed objects within 1 object stream
403 | 10 named destinations out of 1000 (max. 500000)
404 | 5 words of extra memory for PDF output out of 10000 (max. 10000000)
405 |
406 |
--------------------------------------------------------------------------------
/vignettes/OncoSig.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/vignettes/OncoSig.pdf
--------------------------------------------------------------------------------
/vignettes/OncoSig.synctex.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/califano-lab/OncoSig/add92584912d4b0944d6579d89834019ad65345d/vignettes/OncoSig.synctex.gz
--------------------------------------------------------------------------------
/vignettes/OncoSig.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \usepackage{hyperref}
3 | \usepackage{hyperref}
4 | %\VignetteIndexEntry{Using OncoSig}
5 | \title{Using the OncoSig Classifiers to Discover Novel Oncoprotein network Dependencies}
6 | \date{\today}
7 | \usepackage{Sweave}
8 | \begin{document}
9 | \author{Joshua Broyde, Diana Murray, Barry Honig, Andrea Califano\\Columbia University, New York, USA}
10 | \input{OncoSig-concordance}
11 | \maketitle
12 |
13 | \section*{Introduction}
14 |
15 | OncoSig comprises a set of machine learning approaches for determining novel sets of gene products (i.e. genes or proteins) that support
16 | the activity of an oncogene or tumor suppressor (i.e. Oncoprotein-Centric Map,
17 | or OC-map). This is relevant for determining which genes/proteins are involved in an oncoprotein's functional network. OncoSig queries a molecular interaction network or other features regarding a protein's function to predict novel members of the OC-map. This molecular interaction network could contain features such as protein-protein interactions, or gene regulatory networks.
18 | OncoSig can be used primarily in two ways, in a supervised or unsupervised fashion. In the supervised fashion OncoSig uses either a Naive Bayes or Random Forest classifier to train on the molecular interaction network and a gold standard of known members of a particular OC-Map (for example, the members of the KRAS signaling pathway). This approach is appropriate for cases where some members of an OC-Map are known and one wants to leverage the known ones to predict other OC-Map members.
19 |
20 | In cases where a gold standard is not known, OncoSig can be used in an unsupervised fashion, where an OC-Map trained on a well characterized Oncoprotein is applied to one that is poorly characterized. This usage is appropriate where there is no gold standard for a particular Oncoprotein.
21 |
22 | \section*{Installation and loading}
23 | After first installing R (\url{http://www.r-project.org}) and the OncoSig library, load OncoSig.
24 | \begin{Schunk}
25 | \begin{Sinput}
26 | > library("OncoSig")
27 | \end{Sinput}
28 | \end{Schunk}
29 |
30 | \section*{OncoSig Naive Bayes Classifier}
31 | The OncoSig Naive Bayes (OncoSigNB) Classifier is a supervised learning approach that is well suited to discovering OC-Map members when there are only a few features describing each gene product and when the features have no or low statistical dependence. To run OncoSigNB, we create dataframes that correspond to the training and testing sets (which are labeled as 1 or 0, if they are in the gold standard OC-Map or not, respectively).
32 | For example:
33 |
34 | \begin{Schunk}
35 | \begin{Sinput}
36 | > df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
37 | > df_2=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_2.txt",header=TRUE)
38 | > the_bins=list(c(0,40,200,1200),c(0,.1),c(-2,-0.15,-0.02,0.0925),c(1,2,6),
39 | + c(0,0.25),c(1,3,20),c(1,4,20),c(1,4,20),c(0,0.0001,0.9999),
40 | + c(0,0.01,0.05))
41 | > predictions=OncoSigNB(training_set = df_1,testing_set = df_2,
42 | + the_bins=the_bins,correlated_features =list())
43 | \end{Sinput}
44 | \end{Schunk}
45 |
46 | In this example, we specified the training and testing sets, how to bin the data for each feature, and passed an empty list to indicate that there are no correlated features.
47 |
48 | The input training and testing sets should be formatted like so:
49 | \begin{Schunk}
50 | \begin{Sinput}
51 | > df_1=read.delim("~/OncoSig/Input_data_files/Naive_Bayes_evidences_set_1.txt",header=TRUE)
52 | > df_1[1:5,]
53 | \end{Sinput}
54 | \begin{Soutput}
55 | V1 df_labels PREP_LR luad_vip_pv luad_lincs MS_RALGDS
56 | 1 Q16539 1 676.84 1.00e+00 0.13985800 NA
57 | 2 P78383 1 NA NA NA NA
58 | 3 P30281 1 NA 1.00e+00 -0.02819036 NA
59 | 4 P28799 1 30.83 7.97e-05 0.34413929 NA
60 | 5 Q96HE7 1 NA 1.00e+00 -0.33786435 NA
61 | Luad_vip_up MS_TBK1 MS_RALA MS_RALB Demand_pv mindy_overlap_lung
62 | 1 NA NA NA NA 1 1.0000000000
63 | 2 NA NA NA NA 1 NA
64 | 3 NA NA NA NA 1 0.0682243033
65 | 4 NA NA NA NA 1 0.0002509032
66 | 5 NA NA NA NA 1 0.0771524132
67 | \end{Soutput}
68 | \end{Schunk}
69 | Note that when a feature is missing, it is simply coded as NA. The first column is the name (e.g. a gene name), the second column is the label, and all later columns are features.
70 |
71 | \section*{OncoSigRF}
72 | Supervised classification can also be run using the OncoSig Random Forest (OncoSigRF) classifier. Random Forests can be used when the features are statistically dependent, and can more easily be used when the features number in the thousands or more. We recommend OncoSigRF when integrating a molecular interaction network that may contain many tens of thousands of interactions (note that when using a network of more than a few hundred thousand interactions, OncoSigRF will require a few hours and a prohibitive amount of memory to run).
73 |
74 | \begin{Schunk}
75 | \begin{Sinput}
76 | > library (randomForest)
77 | \end{Sinput}
78 | \end{Schunk}
79 | First we read in the molecular interaction network and convert it to a matrix
80 | \begin{Schunk}
81 | \begin{Sinput}
82 | > Network_location="~/OncoSig/Input_data_files/LUAD/original_network_sample.txt"
83 | > Network=read.delim(Network_location,header=F)
84 | > Network$V1=as.character(Network$V1)
85 | > Network$V2=as.character(Network$V2)
86 | > Network$V3=as.numeric(Network$V3)
87 | > Network=as.matrix(Network)
88 | > Network[,3]=as.numeric(Network[,3])
89 | > Network_matrix=listToMatrix(Network)
90 | \end{Sinput}
91 | \end{Schunk}
92 | Note the format of the input network. The first column is the feature name, the second column is the gene product (i.e. the row of data), and the third column is the score.
93 | \begin{Schunk}
94 | \begin{Sinput}
95 | > Network[1:5,]
96 | \end{Sinput}
97 | \begin{Soutput}
98 | V1 V2 V3
99 | [1,] "Q9Y5P4_CINDY_SIG" "Q8N653" "79"
100 | [2,] "A6NF89_PREPPI" "P47881" "2109.56"
101 | [3,] "P61586_PREPPI" "Q6ZUM4" "6546.191"
102 | [4,] "Q06124_CINDY_SIG" "O14647" "50"
103 | [5,] "Q9Y606_ARACNE" "Q00613" "0.3334542"
104 | \end{Soutput}
105 | \end{Schunk}
106 |
107 | Now read in and process the gold standard
108 | \begin{Schunk}
109 | \begin{Sinput}
110 | > Gold_Standard_location=
111 | + "~/OncoSig/Input_data_files/LUAD/10_oncogene_pathways/KRAS/total.txt"
112 | > Gold_Standard=read.delim(Gold_Standard_location,header=F)
113 | > Gold_Standard$V1=as.character(Gold_Standard$V1)
114 | \end{Sinput}
115 | \end{Schunk}
116 |
117 | Preprocess the data:
118 | \begin{Schunk}
119 | \begin{Sinput}
120 | > #Convert Matrix to Dataframe for future steps
121 | > Network_matrix_df=as.data.frame(Network_matrix)
122 | > #Remove Members of Gold Standard Not in the Network:
123 | > Gold_Standard_in_Network_names=intersect(rownames(Network_matrix_df),
124 | + Gold_Standard$V1)
125 | > Negative_Set_names=setdiff(rownames(Network_matrix_df),
126 | + Gold_Standard_in_Network_names)
127 | > remove(Network_matrix) #delete Matrix.
128 | \end{Sinput}
129 | \end{Schunk}
130 | Run the OncoSigRF Classifier
131 | \begin{Schunk}
132 | \begin{Sinput}
133 | > Query_output_results=OncoSigRF(Network_matrix_df,
134 | + Gold_Standard_in_Network_names, max_iterations=5)
135 | > Query_output_results_scores=as.data.frame(Query_output_results[[1]])
136 | \end{Sinput}
137 | \end{Schunk}
138 |
139 | \section*{Unsupervised OncoSig}
140 | Supervised classification is only applicable when a gold standard suitable for training can be found. However, some Oncogenes/Tumor Suppressors may have no known gene product dependencies. In this case, we can take a forest created specifically using one Oncogene/Tumor Suppressor and apply it to another. See the documentation for the OncoSigUnsup function for further details. In this example, we read in a random forest created using features for the EGFR oncogene and apply it to the KRAS oncogene.
141 | \begin{Schunk}
142 | \begin{Sinput}
143 | > KRAS_features=
144 | + "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/feature_list_KRAS.txt"
145 | > EGFR_forest=
146 | + "~/OncoSig/Input_data_files/LUAD/OncoSigUnsup/All_forests_EGFR.r"
147 | > results=OncoSigUnsup(KRAS_features,EGFR_forest)
148 | \end{Sinput}
149 | \end{Schunk}
150 |
151 |
152 | \end{document}
153 |
--------------------------------------------------------------------------------