├── .gitignore ├── Evaluation ├── evaluate_model.R └── evaluate_model_old.R ├── LICENSE ├── README.md └── Train_SDAE ├── dae.py ├── run.py ├── stacked_dae.py ├── test_dae.py └── tools ├── ADASYN.py ├── __init__.py ├── config.py ├── data_handler.py ├── evaluate.py ├── evaluate_model.py ├── start_tensorboard.py ├── utils.py └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /Evaluation/evaluate_model.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library("randomForest")) 2 | library("Rtsne") 3 | 4 | # # Function definitions 5 | 6 | sgm <- function(x){ 7 | # Sigmoid function 8 | return(1/(1+exp(-x))) 9 | } 10 | 11 | get_activations <- function(exp_data, w, b){ 12 | # Propagate inputs through to the hidden layer 13 | # Linear transform 14 | print(dim(w)) 15 | print(dim(exp_data)) 16 | lin <- t(w) %*% as.matrix(exp_data) 17 | # Add bias (a bit ugly) 18 | bia <- lin 19 | for(i in 1:nrow(lin)){ 20 | bia[i,] <- lin[i,] + b[[i]] 21 | } 22 | act <- t(sgm(bia)) 23 | return(act) 24 | } 25 | 26 | node.act.per.type <- function(act, node, m){ 27 | lev <- levels(coi) 28 | boxes <- NULL 29 | for (ctype in lev){ 30 | box <- t(act[which(m==ctype), node]) 31 | boxes[[ctype]] <- box 32 | } 33 | boxplot(boxes, las=2, main=paste("Node", node), ylim=c(0,1)) 34 | } 35 | 36 | type.act.per.node <- function(act, m, filename){ 37 | par(mfcol=c(3,1)) 38 | for(cell in levels(coi)){ 39 | boxplot(act[which(coi==cell),], main=cell, las=2, names=paste0("Node",1:ncol(act)), ylim=c(0,1)) 40 | } 41 | par(mfrow=c(1,1)) 42 | } 43 | 44 | # # Define colors and such for the metadata 45 | def_colors <- function(meta){ 46 | # print(meta) 47 | # Now 1st column is the former 2nd column. 
So we use this to take tha names 48 | typeNames <<- levels(meta[, colnames(meta)[1]]) 49 | # print(typeNames) 50 | 51 | ## COLORS : red=552, blue=26, black=24, green=254, yellow=652 --> change-to yellow2=654 52 | ## COLORS : orange=498 --> change-to darkorange1=91, brown=32 --> change-to chocolate4=56 53 | ## COLORS : purple=547, grey39=300, violetred=641, darkgreen=81, cyan=68, magenta=450 54 | ## COLORS : goldenrod4=151, hotpink=367, darkolivegreen2=87, midnightblue=477, lightcoral=404 55 | ## COLORS : darkslategrey=113, 56 | 57 | distinct_color_pool <- c("red","blue","black","green","yellow2","darkorange1", 58 | "chocolate4","purple","grey39","violetred","darkgreen", 59 | "cyan","magenta","goldenrod4","hotpink","darkolivegreen2", 60 | "midnightblue","midnightblue","darkslategrey") 61 | # typeColors <<- rainbow(length(typeNames)) 62 | typeColors <<- distinct_color_pool[1:length(typeNames)] 63 | # print(typeColors) 64 | names(typeColors) <<- typeNames 65 | # print(typeColors) 66 | 67 | # Take the column of interest (coi) and assign example names to the labels 68 | coi <<- meta[, colnames(meta)[1]] 69 | # print(coi) 70 | names(coi) <<- 1:nrow(meta) 71 | # print(coi) 72 | } 73 | 74 | # # Handle several analysis functions 75 | do_analysis <- function(act, w, b, outfile_pref, bias_node=FALSE){ 76 | for(i in 1:length(w)){ 77 | if(bias_node == TRUE){ 78 | act <- cbind(rep(1, nrow(act)), act) 79 | } 80 | act <- get_activations(t(act), w[[i]], b[[i]]) 81 | # print(act) 82 | nondup <- act[which(!duplicated(act)),] 83 | print(dim(act)) 84 | print(dim(nondup)) 85 | 86 | colrs <- typeColors[coi[1:nrow(act)]] 87 | plot_pca(nondup, colrs, paste(outfile_pref, i, sep='_')) 88 | 89 | colrs <- typeColors[coi[1:nrow(nondup)]] 90 | plot_tsne(nondup, colrs, paste(outfile_pref, i, sep='_')) 91 | 92 | node_profiles(act, paste(outfile_pref, i, sep='_')) 93 | cell_profiles(act, paste(outfile_pref, i, sep='_')) 94 | calc_rf(act) 95 | } 96 | } 97 | 98 | # # PCA on activations 99 | plot_pca <- function(act, colrs, outfile_pref){ 100 | pcafile <- paste(outfile_pref, "PCA.pdf", sep="_") 101 | 102 | p <- prcomp(act) 103 | 104 | pdf(file=pcafile, paper="a4r") 105 | # par mar(Bottom, Left, Top, Right) 106 | layout(matrix(c(1,2,3,3), ncol=2, byrow=TRUE), heights=c(4, 1)) 107 | plot(p$x, col=colrs, pch=20) 108 | plot(p$x[,2:3], col=colrs, pch=20) 109 | par(mai=c(0,0,0,0)) 110 | plot.new() 111 | legend("center", bty="n", legend=names(typeColors), col=typeColors, pch=rep(20,length(typeColors)), ncol=as.integer((length(typeColors)/10)+0.5), cex=0.8, pt.cex=0.8) 112 | dev.off() 113 | } 114 | 115 | # # Rtsne 116 | plot_tsne <- function(act, colrs, outfile_pref){ 117 | tsnefile <- paste(outfile_pref, "tSNE.pdf", sep="_") 118 | 119 | # nondup <- act[which(!duplicated(act)),] 120 | r <- Rtsne(act, perplexity=10) 121 | 122 | pdf(file=tsnefile, paper="a4r") 123 | layout(matrix(c(1,2), ncol=1), heights=c(4, 1)) 124 | plot(r$Y, pch=20, col=colrs, xlab="", ylab="") 125 | par(mai=c(0,0,0,0)) 126 | plot.new() 127 | legend("center", bty="n", legend=names(typeColors), col=typeColors, pch=rep(20,length(typeColors)), ncol=as.integer((length(typeColors)/10)+0.5), cex=0.7, pt.cex=0.7) 128 | dev.off() 129 | } 130 | 131 | # # Look at the nodes in order of decreasing standard deviation 132 | node_profiles <- function(act, outfile_pref){ 133 | filename <- paste(outfile_pref, "node_profiles.pdf", sep="_") 134 | 135 | pdf(filename, paper="a4") 136 | layout(matrix(c(1,2,3), nrow=1, ncol=3,byrow=TRUE)) 137 | par(mar=c(15.0, 2.3, 2.6, 2.1)) 138 | 139 | 
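  # Visit hidden nodes in order of decreasing standard deviation of their
  # activations, so the most variable nodes are plotted first;
  # node.act.per.type() then draws one boxplot of the node's activation per
  # cell type, using the global 'coi' labels.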
for(node in order(apply(act, 2, sd),decreasing=TRUE)){ 140 | node.act.per.type(act, node, coi) 141 | } 142 | dev.off() 143 | } 144 | 145 | # # Or per cell type 146 | cell_profiles <- function(act, outfile_pref){ 147 | filename <- paste(outfile_pref, "cell_profiles.pdf", sep="_") 148 | 149 | pdf(filename, paper="a4") 150 | par(mar=c(4.5, 2.3, 1.7, 0.1)) 151 | type.act.per.node(act, coi) 152 | dev.off() 153 | } 154 | 155 | # # Check predictivity 156 | calc_rf <- function(act){ 157 | rf <- randomForest(x=act, y=as.factor(coi), importance=TRUE) 158 | print(paste("RF estimated error rate", tail(rf$err.rate, n=1)[,1], sep=":")) 159 | } 160 | -------------------------------------------------------------------------------- /Evaluation/evaluate_model_old.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library("randomForest")) 2 | library("Rtsne") 3 | 4 | ## Function definitions 5 | 6 | sgm <- function(x){ 7 | # Sigmoid function 8 | return(1/(1+exp(-x))) 9 | } 10 | 11 | get_activations <- function(exp_data, w, b){ 12 | # Propagate inputs through to the hidden layer 13 | # Linear transform 14 | lin <- t(w) %*% as.matrix(exp_data) 15 | # Add bias (a bit ugly) 16 | bia <- lin 17 | for(i in 1:nrow(lin)){ 18 | bia[i,] <- lin[i,] + b[i,] 19 | } 20 | act <- t(sgm(bia)) 21 | return(act) 22 | } 23 | 24 | node.act.per.type <- function(act, node, m){ 25 | shortNames <- c("Astro","Endo","GABA","Glut","Microglia","Oligo","OligoPC","Uncl") 26 | boxplot(act[which(m=="Astrocyte"),node], act[which(m=="Endothelial Cell"),node], 27 | act[which(m=="GABA-ergic Neuron"),node],act[which(m=="Glutamatergic Neuron"),node], 28 | act[which(m=="Microglia"),node], act[which(m=="Oligodendrocyte"),node], 29 | act[which(m=="Oligodendrocyte Precursor Cell"),node], act[which(m=="Unclassified"),node], 30 | names=shortNames,main=paste("Node",node),las=2,cex=0.5) 31 | } 32 | 33 | type.act.per.node <- function(act, m){ 34 | par(mfrow=c(4,2)) 35 | for(cell in levels(btype)){ 36 | boxplot(act[which(btype==cell),],main=cell,las=2,names=paste0("Node",1:ncol(act))) 37 | } 38 | par(mfrow=c(1,1)) 39 | } 40 | 41 | 42 | args <- commandArgs(trailingOnly = TRUE) 43 | numLayers <- (length(args) - 2)/2 44 | print(paste("Number of layers:", numLayers)) 45 | 46 | # Read expression data. (Currently used only to get gene names.) 
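# Expected invocation, inferred from the argument parsing above (file names
# below are placeholders only):
#   Rscript evaluate_model_old.R <expression.tsv> <metadata.tsv> \
#     <layer1_weights> <layer1_biases> [<layer2_weights> <layer2_biases> ...]
# i.e. args[1] is the expression matrix, args[2] the metadata, followed by one
# weights/biases file pair per trained layer.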
47 | print("Reading expression data...") 48 | exp_data <- read.delim(args[1],check.names=FALSE,row.names=1) 49 | # Read metadata (clustering results) 50 | print("Reading metadata...") 51 | meta <- read.delim(args[2],check.names=FALSE,row.names=1) 52 | # Check for same ordering 53 | stopifnot(identical(colnames(exp_data), rownames(meta))) 54 | 55 | # Propagate activity through the network 56 | # Activation of visible layer is the actual expression data 57 | act <- t(exp_data) 58 | for(i in 1:numLayers){ 59 | # Read weights and bias for the layer in question 60 | print(paste("Reading weights for layer", i)) 61 | w <- read.delim(args[2*i+1],header=FALSE) 62 | print(paste("Reading biases for layer", i)) 63 | b <- read.delim(args[2*i+2],header=FALSE) 64 | act <- get_activations(t(act), w, b) 65 | } 66 | 67 | # Define colors and such for the metadata 68 | typeNames <- levels(meta$broad_type) 69 | typeCols <- c("red","blue","black","green","yellow","orange","brown","purple") 70 | names(typeCols) <- typeNames 71 | btype <- meta$broad_type 72 | names(btype) <- rownames(meta) 73 | 74 | outfile_pref <- strsplit(basename(args[1]),"\\.")[[1]][1] 75 | print(outfile_pref) 76 | 77 | # PCA on activations 78 | pcafile <- paste(outfile_pref, "PCA.pdf", sep="_") 79 | pdf(pcafile) 80 | par(mfrow=c(1,2)) 81 | p <- prcomp(act) 82 | plot(p$x,col=typeCols[btype[rownames(act)]],pch=20) 83 | plot(p$x[,2:3],col=typeCols[btype[rownames(act)]],pch=20) 84 | dev.off() 85 | 86 | # Rtsne 87 | #nondup <- act[which(!duplicated(act)),] 88 | #tsnefile <- paste(outfile_pref, "tSNE.pdf", sep="_") 89 | #pdf(tsnefile) 90 | #r <- Rtsne(nondup) 91 | #plot(r$Y, col=typeCols[btype[rownames(act)]],pch=20) 92 | #dev.off() 93 | 94 | nondup <- act[which(!duplicated(act)),] 95 | tsnefile <- paste(outfile_pref, "tSNE.pdf", sep="_") 96 | pdf(tsnefile) 97 | r <- Rtsne(nondup, perplexity=10) 98 | plot(r$Y, col=typeCols[btype[rownames(nondup)]],pch=20) 99 | dev.off() 100 | 101 | 102 | # Look at the nodes in order of decreasing standard deviation 103 | pdf(paste(outfile_pref, "node_profiles.pdf", sep="_")) 104 | par(mfrow=c(2,4)) 105 | for(node in order(apply(act, 2, sd),decreasing=TRUE)){ 106 | node.act.per.type(act, node, btype) 107 | } 108 | dev.off() 109 | 110 | # Or per cell type 111 | pdf(paste(outfile_pref, "cell_profiles.pdf", sep="_")) 112 | type.act.per.node(act, btype) 113 | dev.off() 114 | 115 | # Check predictivity 116 | rf <- randomForest(x=act, y=as.factor(btype), importance=TRUE) 117 | print(paste("RF estimated error rate", tail(rf$err.rate, n=1)[,1], sep=":")) 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StackedDAE 2 | Stacked Denoising AutoEncoder based on TensorFlow 3 | 4 | This project is intended to be a Bioinformatics tool. However, this repository hosts the project's code, which is not strictly bound to biology, so it could be used for other purposes with little effort (on the other hand, it is not generalized to fit every occasion, so a bit of adaptation is required). 5 | 6 | Also, in our project we try to find a good setup for the algorithm, so there are a lot of options (with more to come) implemented in the code: Masking and Salt-and-pepper noise, with or without Emphasis, and Sigmoid or Tanh activation functions, to name a few. 7 | 8 | -------------------------------------------------------------------------------- /Train_SDAE/dae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tools.config import FLAGS 4 | 5 | class DAE_Layer(object): 6 | 7 | def __init__(self, in_data=None, prev_layer_size=None, next_layer_size=None, nth_layer=None, sess=None, last_layer=True): 8 | self._is_last = last_layer 9 | self._layer = nth_layer 10 | 11 | self._prev_layer_size = prev_layer_size + 1 if FLAGS.bias_node else prev_layer_size 12 | self._next_layer_size = next_layer_size 13 | self._shape = [self._prev_layer_size, self._next_layer_size] 14 | 15 | self._x = in_data 16 | 17 | self._l_rate = self._get_l_rate 18 | 19 | self._noise = [None, None] if self._is_last else self._get_noise 20 | 21 | self.vars_to_init = self._setup_variables() 22 | 23 | 24 | def _setup_variables(self): 25 | with tf.name_scope("Initialize_Variables"): 26 | self._w = self._init_w_or_b(shape=self._shape, trainable=True, name='weights')#_{0}'.format(self._layer)) 27 | # lmt = tf.mul(4.0, tf.sqrt(6.0 / (self._shape[0] + self._shape[1]))) 28 | # self._w = tf.Variable(tf.random_uniform(self._shape, -1*lmt, lmt), trainable=True, name='weights') 29 | self._b_y = self._init_w_or_b(shape=[self._next_layer_size], trainable=True, is_bias=True, name='prev_biases') 30 | 31 | vars_to_init = [self._w, self._b_y] 32 | if not self._is_last: 33 | self._fixed_w = tf.Variable(tf.identity(self._w.initialized_value()), trainable=False, name="weights_fixed") 34 | self._fixed_b = tf.Variable(tf.identity(self._b_y.initialized_value()), trainable=False, name="biases_fixed") 35 | self._b_z = self._init_w_or_b(shape=[self._prev_layer_size], trainable=True, is_bias=True, name='next_biases') 36 | vars_to_init.append(self._fixed_w) 37 | vars_to_init.append(self._fixed_b) 38 | vars_to_init.append(self._b_z) 39 | 40 | return vars_to_init 41 | 42 | 43 | """ TODO: TRY initialization for different functions (e.g.
tanh) """ 44 | def _init_w_or_b(self, shape, trainable=True, name=None, is_bias=False, method='sigmoid'): 45 | # with tf.name_scope("dae_{0}_{1}".format(self._layer, name)): 46 | if is_bias: 47 | return tf.Variable(tf.zeros(shape), trainable=trainable, name=name) 48 | 49 | if method=='sigmoid': 50 | # Upper and Lower limit for the weights 51 | lmt = tf.mul(4.0, tf.sqrt(6.0 / (shape[0] + shape[1]))) 52 | return tf.Variable(tf.random_uniform(shape, -1*lmt, lmt), trainable=trainable, name=name) 53 | 54 | 55 | def clean_activation(self, x_in=None, use_fixed=True): 56 | if x_in is None: 57 | x = self._x 58 | else: 59 | x = x_in 60 | if use_fixed: 61 | return self._activate(x, self._fixed_w, self._fixed_b, name='Latent_layer_next') 62 | else: 63 | return self._activate(x, self._w, self._b_y, name='Latent_layer_next') 64 | 65 | 66 | def encode(self, x_in=None, noise=None): 67 | if x_in is None: 68 | x = self._x 69 | else: 70 | x = x_in 71 | 72 | if noise is None: 73 | ratio = self._noise[0] 74 | ntype = self._noise[1] 75 | else: 76 | ratio = noise[0] 77 | ntype = noise[1] 78 | 79 | self._x_tilde, self._noise_map = self._corrupt(x, ratio=ratio, n_type=ntype) 80 | with tf.name_scope("Encoder"): 81 | self._y = self._activate(self._x_tilde, self._w, self._b_y, name='Latent_layer_next') 82 | return self._y 83 | 84 | def decode(self): 85 | # self._y = self.encode() 86 | with tf.name_scope("Decoder"): 87 | y = self.encode() 88 | if self._is_last: 89 | exit("This is the last layer. Currently the reconstruction of this layer cannot be done.") 90 | self._z = self._activate(y, self._w, self._b_z, transpose_w=True, name='Reconstr_layer_{0}'.format(self._layer)) 91 | return self._z 92 | 93 | @property 94 | def get_loss(self): 95 | z = self.decode() 96 | noise_map = None 97 | 98 | if FLAGS.emphasis: 99 | noise_map = self._noise_map 100 | 101 | loss = self._loss_x_entropy(x=self._x, z=z, noise=noise_map) 102 | 103 | return loss 104 | 105 | @property 106 | def get_w_all_b(self): 107 | return [self._w, self._b_y, self._b_z] 108 | 109 | @property 110 | def get_w_b(self): 111 | return [self._w, self._b_y] 112 | 113 | @property 114 | def get_w(self): 115 | return self._w 116 | 117 | @property 118 | def get_fixed_w(self): 119 | return self._fixed_w 120 | 121 | @property 122 | def get_b(self): 123 | return self._b_y 124 | 125 | @property 126 | def get_fixed_b(self): 127 | return self._fixed_b 128 | 129 | @property 130 | def get_b_recon(self): 131 | return self._b_z 132 | 133 | @property 134 | def get_representation_y(self): 135 | return self._y 136 | 137 | @property 138 | def get_reconstruction_z(self): 139 | return self._z 140 | 141 | @property 142 | def which(self): 143 | return self._layer - 1 144 | 145 | @staticmethod 146 | def _activate(x, w, b, transpose_w=False, name=None): 147 | """ TODO: TRY different activation functions (e.g. tanh, sigmoid...) """ 148 | return tf.sigmoid(tf.nn.bias_add(tf.matmul(x, w, transpose_b=transpose_w), b), name=name) 149 | 150 | @property 151 | def _get_noise(self): 152 | assert self._layer >= 0 153 | 154 | try: 155 | return getattr(FLAGS, "noise_{0}".format(self._layer)) 156 | except AttributeError: 157 | print "Noise out of bounds. 
Using default noise for this Layer (Layer {0})".format(self._layer) 158 | return FLAGS.default_noise 159 | 160 | @property 161 | def _get_l_rate(self): 162 | return getattr(FLAGS, "unsupervised_learning_rate") 163 | 164 | @property 165 | def _get_emph_params(self): 166 | if FLAGS.emphasis_type == 'Full': 167 | return 1, 0 168 | elif FLAGS.emphasis_type == 'Double': 169 | return 1, 0.5 170 | else: 171 | print("Unspecified/Wrong Emphasis type. Default Full [0-1] is used.") 172 | return 1, 0 173 | 174 | def _loss_x_entropy(self, x, z, noise=None): 175 | with tf.name_scope("xentropy_loss"): 176 | z_clipped = tf.clip_by_value(z, FLAGS.zero_bound, FLAGS.one_bound) 177 | z_minus_1_clipped = tf.clip_by_value((1.0 - z), FLAGS.zero_bound, FLAGS.one_bound) 178 | x_clipped = tf.clip_by_value(x, FLAGS.zero_bound, FLAGS.one_bound) 179 | x_minus_1_clipped = tf.clip_by_value((1.0 - x), FLAGS.zero_bound, FLAGS.one_bound) 180 | 181 | # cross_entropy = x * log(z) + (1 - x) * log(1 - z) 182 | 183 | cross_entropy = tf.add(tf.mul(tf.log(z_clipped), x_clipped), 184 | tf.mul(tf.log(z_minus_1_clipped), x_minus_1_clipped), name='X-Entr') 185 | 186 | if noise: 187 | with tf.name_scope("Given_Emphasis"): 188 | a, b = self._get_emph_params 189 | corrupted = tf.select(noise, cross_entropy, tf.zeros_like(cross_entropy), name='Corrupted_Emphasis') 190 | 191 | # OR -- tf.select(tf.logical_not(noisy_points), cross_entropy, tf.zeros_like(cross_entropy), name='Uncorrupted_Emphasis') 192 | uncorrupted = tf.select(noise, tf.zeros_like(cross_entropy), cross_entropy, name='Uncorrupted_Emphasis') 193 | 194 | loss = a * (-1 * tf.reduce_sum(corrupted, 1)) + b * (-1 * tf.reduce_sum(uncorrupted, 1)) 195 | else: 196 | # Sum the cost for each example 197 | loss = -1 * tf.reduce_sum(cross_entropy, 1) 198 | 199 | # Reduce mean to find the overall cost of the loss 200 | cross_entropy_mean = tf.reduce_mean(loss, name='xentropy_mean') 201 | 202 | return cross_entropy_mean 203 | 204 | 205 | # @property 206 | # def get_cost(self): 207 | # z = self.get_reconstruction_z 208 | # noise_map = None 209 | # 210 | # if FLAGS.emphasis: 211 | # noise_map = self._noise_map 212 | # 213 | # cost = self._loss_x_entropy(x=self._x, z=z, noise=noise_map) 214 | # 215 | # return cost 216 | 217 | 218 | def _corrupt(self, x, ratio, n_type='MN'): 219 | with tf.name_scope("Corruption"): 220 | """ Noise adding (or input corruption) 221 | This function adds noise to the given data. 222 | 223 | Args: 224 | x : The input data for the noise to be applied 225 | ratio: The percentage of the data affected by the noise addition 226 | n_type: The type of noise to be applied. 227 | Choices: MN (masking noise), SP (salt-and-pepper noise) 228 | """ 229 | 230 | # Safety check. If unspecified noise type given, use Masking noise instead. 231 | if n_type != 'MN' and n_type != 'SP' and n_type != 'TFDO': 232 | n_type = 'MN' 233 | print("Unknown noise type. Masking noise will be used instead.") 234 | 235 | 236 | # if there is no noise to be added there is no need to proceed further 237 | if ratio == 0.0: 238 | return x_tilde, None 239 | 240 | if n_type == 'TFDO': 241 | x_tilde = tf.nn.dropout(x, keep_prob= 1 - ratio) 242 | # points_to_alter = x_tilde == 0. 243 | # print points_to_alter 244 | # x_tilde = tf.select(points_to_alter, tf.add(tf.zeros_like(x_tilde, dtype=tf.float32), 245 | # FLAGS.zero_bound), x_tilde, name='X_tilde') 246 | # x_tilde[x_tilde == 0.] 
= tf.constant(FLAGS.zero_bound) 247 | else: 248 | # It makes a copy of the data, otherwise 'target_feed' will also be affected 249 | x_tilde = tf.identity(x, name='X_tilde') 250 | shape = tf.Tensor.get_shape(x_tilde) 251 | # Creating and applying random noise to the data. (Masking noise) 252 | points_to_alter = tf.random_uniform(shape=shape, dtype=tf.float32) < ratio 253 | 254 | if n_type == 'MN': 255 | x_tilde = tf.select(points_to_alter, tf.add(tf.zeros_like(x_tilde, dtype=tf.float32), 256 | FLAGS.zero_bound), x_tilde, name='X_tilde') 257 | 258 | elif n_type == 'SP': 259 | coin_flip = np.asarray([np.random.choice([FLAGS.zero_bound, FLAGS.one_bound]) for _ in range(shape[0]) for _ in range(shape[1])]).reshape(shape) 260 | x_tilde = tf.select(points_to_alter, tf.to_float(coin_flip), x_tilde, name='X_tilde') 261 | 262 | 263 | # Also returns the 'points_to_alter' in case of applied Emphasis 264 | if not FLAGS.emphasis or n_type == 'TFDO': 265 | points_to_alter = None 266 | 267 | return x_tilde, points_to_alter 268 | 269 | -------------------------------------------------------------------------------- /Train_SDAE/run.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import shutil 5 | import time 6 | import stacked_dae as SDAE 7 | 8 | from os.path import join as pjoin 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from tools.config import FLAGS, home_out 13 | from tools.start_tensorboard import start_tb 14 | from tools.data_handler import load_data, load_linarsson_labels, load_extra 15 | 16 | from tools.utils import load_data_sets_pretraining, load_data_sets 17 | from tools.utils import normalize_data, label_metadata, write_csv 18 | from tools.ADASYN import Adasyn, all_indices 19 | from tools.evaluate_model import run_random_forest as run_rf 20 | from tools.evaluate_model import plot_tSNE 21 | from tools.evaluate import predict 22 | 23 | import rpy2.robjects as robjects 24 | from rpy2.robjects.packages import importr 25 | from rpy2.robjects import numpy2ri 26 | from rpy2.robjects import pandas2ri 27 | from tensorflow.python.framework.errors import FailedPreconditionError 28 | 29 | from scipy import stats, integrate 30 | import seaborn as sns 31 | from rpy2.rinterface._rinterface import RRuntimeError 32 | sns.set(color_codes=True) 33 | 34 | # Initialize R - Python connection 35 | pandas2ri.activate() 36 | numpy2ri.activate() 37 | r = robjects.r 38 | r_source = r['source'] 39 | r_source('../Evaluation/evaluate_model.R', **{'print.eval': True}) 40 | 41 | 42 | np.set_printoptions(threshold=np.nan) 43 | 44 | # Assign config variables 45 | _data_dir = FLAGS.data_dir 46 | _output_dir = FLAGS.output_dir 47 | _summary_dir = FLAGS.summary_dir 48 | _chkpt_dir = FLAGS.chkpt_dir 49 | 50 | 51 | def main(): 52 | """ 53 | TODO: Break to 2 or 3 functions 54 | for better comprehension. 55 | """ 56 | 57 | # Initialize the directory environment 58 | initialize() 59 | 60 | # Start TensorBoard 61 | start_tb() 62 | 63 | # Set Hyper-parameters 64 | bias_node = FLAGS.bias_node 65 | nHLay = FLAGS.num_hidden_layers 66 | nHUnits = [getattr(FLAGS, "hidden{0}_units".format(j + 1))\ 67 | for j in xrange(nHLay)] 68 | 69 | if FLAGS.use_balanced: 70 | transp = True 71 | else: 72 | transp = False 73 | 74 | 75 | # ...... Read/Upload/Process the Data ...... # 76 | 77 | # Capture time for logging loading duration 78 | start_time = time.time() 79 | 80 | # Load data (Allen dataset). 
Label_col {9: types, 7: subtypes} 81 | # datafile, (mapped_labels, label_map) = load_data('TPM', label_col=9,\ 82 | # transpose=True) 83 | 84 | # Load data (Linnarsson dataset) 85 | datafile, labels, meta = load_data(FLAGS.dataset, d_type='filtered',\ 86 | label_col=1, transpose=transp) 87 | 88 | # datafile_orig, labels, meta = load_data(FLAGS.dataset, d_type='filtered',\ 89 | # label_col=7, transpose=transp) 90 | 91 | print("Data Loaded. Duration:", time.time() - start_time) 92 | 93 | 94 | # ...... Receive/Set Metadata (Labels) ...... # 95 | 96 | mapped_labels_df, label_map = meta 97 | 98 | mapped_labels = np.reshape(mapped_labels_df.values,\ 99 | (mapped_labels_df.shape[0],)) 100 | 101 | num_classes = label_map.shape[0] 102 | 103 | # Print class statistics using ADASYN's function all_indices() 104 | print("\nClass Statistics:") 105 | 106 | for i in xrange(num_classes): 107 | print("{: >30}\t".format(label_map[i,0]),\ 108 | len(all_indices(i, mapped_labels.tolist()))) 109 | 110 | 111 | # ...... Class Balancing ...... # 112 | 113 | balanced_data = None 114 | recr_labels = None 115 | 116 | # "transp" is True if the flag "use_balanced" is True, False otherwise 117 | if transp: 118 | a = Adasyn(datafile, mapped_labels, label_map[:,1], beta=1) 119 | 120 | # Balance the data and collect them 121 | balanced_data, mapped_labels = a.balance_all() 122 | 123 | recr_labels = pd.DataFrame(data=mapped_labels) 124 | recr_labels = recr_labels.replace(label_map[:,1].tolist(),\ 125 | label_map[:,0].tolist()) 126 | 127 | # Control the transposition of the data if we use ADASYN or not 128 | data = balanced_data if transp else datafile 129 | 130 | # Save some space 131 | del(balanced_data) 132 | 133 | 134 | # ...... Data Normalization ...... # 135 | 136 | # Capture time for logging processing duration 137 | start_time = time.time() 138 | norm_data = normalize_data(data, transpose=transp) 139 | 140 | # Normalize the unbalanced data (experimenting) 141 | if transp: 142 | norm_orig = normalize_data(datafile, transpose=transp) 143 | else: 144 | norm_orig = norm_data 145 | 146 | # Save some space 147 | del(datafile) 148 | 149 | print("Data Normalized. Duration:", time.time() - start_time) 150 | 151 | 152 | # Get the number of existed features 153 | # (e.g. genes), in the data-set 154 | num_features = norm_data.shape[1] 155 | 156 | # Create the shape of the AutoEncoder 157 | sdae_shape = [num_features] + nHUnits + [num_classes] 158 | print(sdae_shape) 159 | 160 | 161 | # ...... Pre-training Phase ...... 
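# (In the pre-training phase below, SDAE.pretrain_sdae() trains the hidden
# layers greedily, one denoising autoencoder at a time, each taking as input
# the clean activations of the layer beneath it; see stacked_dae.py.)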
# 162 | 163 | # Get data-sets (train, test) for pretraining in a proper way 164 | data = load_data_sets_pretraining(norm_data, split_only=False) 165 | 166 | # Run pretraining step 167 | # TODO: Change function name to "fit()" 168 | sdae = SDAE.pretrain_sdae(input_x=data, shape=sdae_shape) 169 | 170 | # Save some space 171 | del(data) 172 | 173 | 174 | # Load another dataset to test it on the created model 175 | 176 | # sub_labels, _ = load_linarsson_labels(sub_labels=True) 177 | # data_an, labels_an, meta = load_extra('Allen',\ 178 | # 'TPM_common_ready_data.csv',\ 179 | # transpose=True, label_col=7) 180 | 181 | data_an, labels_an, meta = load_extra('Lin-Allen',\ 182 | 'Lin-Allen_compendium.csv',\ 183 | transpose=True, label_col=0) 184 | 185 | # Data Normalization 186 | data_an = normalize_data(data_an, transpose=False) 187 | data_an = np.transpose(data_an) 188 | 189 | # Get the labels 190 | mapped_an_df, l_map = meta 191 | mapped_an_labs = np.reshape(mapped_an_df.values,\ 192 | (mapped_an_df.shape[0],)) 193 | print(l_map) 194 | 195 | # Create comprehensive plots/graphs 196 | try: 197 | analyze(sdae, data_an, labels_an,\ 198 | bias_node=bias_node, prefix='Foreign_Pretraining') 199 | analyze(sdae, norm_orig, labels,\ 200 | bias_node=bias_node, prefix='Pretraining') 201 | except: 202 | pass 203 | # analyze(sdae, datafile_norm, recr_labels,\ 204 | # prefix='recr_Pretraining') 205 | # analyze(sdae, datafile_norm, sub_labels,\ 206 | # mapped_labels, prefix='recr_Pretraining') 207 | 208 | 209 | # ...... Fine-tuning Phase ...... # 210 | 211 | # Get data-sets (train, test) for finetuning in a proper way 212 | data = load_data_sets(norm_data, mapped_labels) 213 | 214 | # print("\nTotal Number of Examples:",\ 215 | # data.train.num_examples + data.test.num_examples) 216 | 217 | # Run finetuning step 218 | # TODO: Change function name to "finetune()" or similar 219 | sdae = SDAE.finetune_sdae(sdae=sdae, input_x=data,\ 220 | n_classes=num_classes,\ 221 | label_map=label_map[:,0]) 222 | 223 | # Save some space 224 | del(data) 225 | 226 | # Evaluate the results on a totally different data-set 227 | foreign_data = load_data_sets(data_an, mapped_an_labs, split_only=False) 228 | 229 | # TODO: make the "predict" function part of the Stacked_DAE class 230 | p, t = predict(sdae, foreign_data.all, bias_node=bias_node) 231 | p = pd.DataFrame(data=p).replace(l_map[:,1].tolist(), l_map[:,0].tolist()) 232 | t = pd.DataFrame(data=t).replace(l_map[:,1].tolist(), l_map[:,0].tolist()) 233 | print(p, t) 234 | p.to_csv(pjoin(FLAGS.output_dir, 'Predictions_of_Foreign.txt'), sep='\t') 235 | t.to_csv(pjoin(FLAGS.output_dir, 'True_labels_of_Foreign.txt'), sep='\t') 236 | 237 | # Save some space 238 | del(foreign_data) 239 | del(norm_data) 240 | 241 | # Create comprehensive plots/graphs 242 | # analyze(sdae, datafile_norm, recr_labels,\ 243 | # mapped_labels, prefix='recr_Finetuning') 244 | try: 245 | analyze(sdae, data_an, labels_an, mapped_labels,\ 246 | bias_node=bias_node, prefix='Foreign_Finetuning') 247 | analyze(sdae, norm_orig, labels, mapped_labels,\ 248 | bias_node=bias_node, prefix='Finetuning') 249 | except: 250 | pass 251 | 252 | # Print the used set up 253 | print_setup() 254 | 255 | # ...... The End ...... # 256 | 257 | 258 | def _check_and_clean_dir(d): 259 | """ 260 | Clears the given directory. 261 | """ 262 | if os.path.exists(d): 263 | shutil.rmtree(d) 264 | os.mkdir(d) 265 | 266 | 267 | def initialize(): 268 | """ 269 | Performs initialization of the directory environment. 
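    Creates the home, data and output directories if they are missing. If the
    output directory already contains files, the user is asked before it is
    wiped; the summary and checkpoint directories are always cleared and the
    per-layer checkpoint sub-directories are recreated.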
270 | """ 271 | home = home_out('') 272 | 273 | # Make sure core directories exist 274 | if not os.path.exists(home): 275 | os.makedirs(home) 276 | 277 | if not os.path.exists(_data_dir): 278 | os.mkdir(_data_dir) 279 | 280 | if not os.path.exists(_output_dir): 281 | os.makedirs(_output_dir) 282 | 283 | elif os.listdir(_output_dir): 284 | 285 | # If the output folder is not empty, Prompt before delete contents. 286 | var = raw_input("{0} {1}"\ 287 | .format("Output folder is not empty. Clean it?",\ 288 | "(This will delete every file in it.) y/N: ")) 289 | 290 | if var == 'y' or var == 'Y' or var == '1': 291 | _check_and_clean_dir(_output_dir) 292 | else: 293 | exit("Exiting... Please save your former \ 294 | output data and restart SDAE.") 295 | else: 296 | _check_and_clean_dir(_output_dir) 297 | 298 | # Clean the rest directories 299 | _check_and_clean_dir(_summary_dir) 300 | _check_and_clean_dir(_chkpt_dir) 301 | 302 | # Create checkpoint directories (depricated) 303 | os.mkdir(os.path.join(_chkpt_dir, '1')) 304 | os.mkdir(os.path.join(_chkpt_dir, '2')) 305 | os.mkdir(os.path.join(_chkpt_dir, '3')) 306 | os.mkdir(os.path.join(_chkpt_dir, 'fine_tuning')) 307 | 308 | 309 | def analyze(sdae, datafile_norm,\ 310 | labels, mapped_labels=None,\ 311 | bias_node=False, prefix=None): 312 | 313 | """ 314 | Speeks to R, and submits it analysis jobs. 315 | """ 316 | 317 | # Get some R functions on the Python environment 318 | def_colors = robjects.globalenv['def_colors'] 319 | do_analysis = robjects.globalenv['do_analysis'] 320 | 321 | # labels.reset_index(level=0, inplace=True) 322 | def_colors(labels) 323 | act = np.float32(datafile_norm) 324 | 325 | try: 326 | do_analysis(act, sdae.get_weights, sdae.get_biases,\ 327 | pjoin(FLAGS.output_dir, "{}_R_Layer_".format(prefix)),\ 328 | bias_node=bias_node) 329 | except RRuntimeError as e: 330 | pass 331 | 332 | # for layer in sdae.get_layers: 333 | # fixed = False if layer.which > sdae.nHLayers - 1 else True 334 | # 335 | # try: 336 | # act = sdae.get_activation(act, layer.which, use_fixed=fixed) 337 | # print("Analysis for layer {}:".format(layer.which + 1)) 338 | # temp = pd.DataFrame(data=act) 339 | # do_analysis(temp, pjoin(FLAGS.output_dir,\ 340 | # "{}_Layer_{}"\ 341 | # .format(prefix, layer.which))) 342 | # 343 | # # if not fixed: 344 | # # weights = sdae.get_weights[layer.which] 345 | # # for node in weights.transpose(): 346 | # # sns.distplot(node, kde=False,\ 347 | # fit=stats.gamma, rug=True); 348 | # # sns.plt.show() 349 | # try: 350 | # plot_tSNE(act, mapped_labels,\ 351 | # plot_name="Pyhton_{}_tSNE_layer_{}"\ 352 | # .format(prefix, layer.which)) 353 | # except IndexError as e: 354 | # pass 355 | # except FailedPreconditionError as e: 356 | # break 357 | 358 | 359 | def print_setup(): 360 | nHLay = FLAGS.num_hidden_layers 361 | nHUnits = [getattr(FLAGS, "hidden{0}_units"\ 362 | .format(j + 1)) for j in xrange(nHLay)] 363 | l_rates = [getattr(FLAGS, "pre_layer{}_learning_rate"\ 364 | .format(i)) for i in xrange(1,nHLay+1)] 365 | noise_ratios = [getattr(FLAGS, "noise_{0}"\ 366 | .format(i)) for i in xrange(1,nHLay+1)] 367 | 368 | print("\nConfiguration:") 369 | print("\n{: >45}\t".format("Dataset:"), FLAGS.dataset) 370 | print("\n{: >45}\t".format("Use Bias Node:"), FLAGS.bias_node) 371 | print("{: >45}\t".format("# Hidden Layers:"), nHLay) 372 | print("{: >45}\t".format("# Hidden Units:"), nHUnits) 373 | print("{: >45}\t".format("Noise Ratio (per layer):"),\ 374 | [row[0] for row in noise_ratios]) 375 | print("{: >45}\t".format("Noise 
Type (MN, SP, TFDO):"),\ 376 | [row[1] for row in noise_ratios]) 377 | 378 | if FLAGS.emphasis: 379 | print("{: >45}\t"\ 380 | .format("Emphasis (Double, Full, No):"),\ 381 | FLAGS.emphasis_type) 382 | else: 383 | print("{: >45}\t"\ 384 | .format("Emphasis (Double, Full, No):"), "No") 385 | 386 | print("{: >45}\t"\ 387 | .format("Unsupervised Learning Rate (per layer?):"),\ 388 | l_rates) 389 | 390 | print("{: >45}\t"\ 391 | .format("Supervised Learning Rate:"),\ 392 | FLAGS.supervised_learning_rate) 393 | 394 | print("{: >45}\t".format("Batch size:"),\ 395 | FLAGS.batch_size) 396 | 397 | print("{: >45}\t"\ 398 | .format("# Pretraining epochs:"),\ 399 | FLAGS.pretraining_epochs) 400 | 401 | print("{: >45}\t".format("# Finetuning epochs:"),\ 402 | FLAGS.finetuning_epochs) 403 | # Activation Function (Sigmoid, Tanh, ReLU) 404 | # Weight Initialization (Sigmoid, Tanh, ReLU) 405 | # Loss Function (X-Entropy, sum of sq. error) 406 | 407 | 408 | if __name__ == '__main__': 409 | total_time = time.time() 410 | main() 411 | print("\n{}".format(time.strftime("%Y-%m-%d %H:%M:%S"))) 412 | print("Total time:", time.time() - total_time) 413 | 414 | -------------------------------------------------------------------------------- /Train_SDAE/stacked_dae.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import time 6 | import sklearn 7 | 8 | from sklearn.metrics import precision_score, confusion_matrix 9 | from sklearn.metrics import recall_score, f1_score, roc_curve 10 | 11 | from dae import DAE_Layer 12 | from os.path import join as pjoin 13 | 14 | #from utils import load_data_sets_pretraining, write_csv 15 | from tools.utils import fill_feed_dict, fill_feed_dict_dae 16 | from tools.evaluate import do_eval_summary, evaluation, do_eval 17 | from tools.config import FLAGS 18 | from tools.visualize import make_heatmap 19 | from tensorflow.python.framework.errors import FailedPreconditionError 20 | 21 | 22 | class Stacked_DAE(object): 23 | 24 | def __init__(self, net_shape, session=None, selfish_layers=False): 25 | """ Stack De-noising Autoencoder (SDAE) initialization 26 | 27 | Args: 28 | net_shape: The network architecture of the SDAE 29 | session : The tensorflow session 30 | selfish_layers: Whether the layers are going to be trained individually 31 | or dependent to the direct output of the previous layer 32 | (Theoretically: using it is faster, but memory costly) 33 | Tips: 34 | Using selfish_layers needs some extra handling. 35 | * Feed each individual De-noising Autoencoder (DAE) directly. 36 | (e.g. feed_dict = {sdae.get_layers[i]._x : input_data}) 37 | * Reassign/Reload the input data-set with the data-set for the next 38 | layer, obtained by using the genrate_next_dataset() function. 39 | (e.g. 
in this case load_data_sets_pretraining(next_dataset, split_only=False)) 40 | """ 41 | self._sess = session 42 | self._net_shape = net_shape 43 | self.nHLayers = len(self._net_shape) - 2 44 | self._selfish_layers = selfish_layers 45 | self.loss_summaries = None 46 | 47 | if self._selfish_layers: 48 | self._x = None 49 | self._y_dataset = {} 50 | else: 51 | self._x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, self._net_shape[0]), name='dae_input_layer') 52 | 53 | self._dae_layers = [] 54 | self._weights = [] 55 | self._biases = [] 56 | self.weights = [] 57 | self.biases = [] 58 | self._create_network() 59 | 60 | def _create_network(self): 61 | is_last_layer = False 62 | for layer in xrange(self.nHLayers + 1): 63 | with tf.name_scope("Layer_{0}".format(layer)): 64 | if self._selfish_layers: 65 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, self._net_shape[layer]), name='dae_input_from_layer_{0}'.format(layer)) 66 | self._y_dataset[layer] = [] 67 | else: 68 | if layer == 0: 69 | x = self._x 70 | else: 71 | x = self._dae_layers[layer-1].clean_activation() 72 | # x = self._dae_layers[layer-1].get_representation_y 73 | 74 | new_x = tf.identity(x) 75 | 76 | if layer == self.nHLayers: 77 | is_last_layer = True 78 | 79 | if FLAGS.bias_node and layer < self.nHLayers: 80 | # Add bias node (experimental) 81 | bias_node = tf.ones(shape=[FLAGS.batch_size, 1], dtype=tf.float32) 82 | new_x = tf.concat(1, [bias_node, x]) 83 | 84 | dae_layer = DAE_Layer(in_data=new_x, prev_layer_size=self._net_shape[layer], 85 | next_layer_size=self._net_shape[layer+1], nth_layer=layer+1, 86 | last_layer=is_last_layer) 87 | 88 | self._dae_layers.append(dae_layer) 89 | 90 | @property 91 | def session(self): 92 | return self._sess 93 | 94 | @property 95 | def get_layers(self): 96 | return self._dae_layers 97 | 98 | @property 99 | def get_weights(self): 100 | # if len(self.weights) != self.nHLayers + 1: 101 | self.weights = [] 102 | for n in xrange(self.nHLayers + 1): 103 | if self.get_layers[n].get_w: 104 | try: 105 | self.weights.append(self.session.run(self.get_layers[n].get_w)) 106 | except FailedPreconditionError: 107 | break 108 | else: 109 | break 110 | 111 | return self.weights 112 | 113 | @property 114 | def get_biases(self): 115 | # if len(self.biases) != self.nHLayers + 1: 116 | self.biases = [] 117 | for n in xrange(self.nHLayers + 1): 118 | if self.get_layers[n].get_b: 119 | try: 120 | self.biases.append(self.session.run(self.get_layers[n].get_b)) 121 | except FailedPreconditionError: 122 | break 123 | else: 124 | break 125 | 126 | return self.biases 127 | 128 | def get_activation(self, x, layer, use_fixed=True): 129 | return self.session.run(self.get_layers[layer].clean_activation(x_in=x, use_fixed=use_fixed)) 130 | # return self.session.run(tf.sigmoid(tf.nn.bias_add(tf.matmul(x, self.get_weights[layer]), self.get_biases[layer]), name='activate')) 131 | 132 | def train(self, cost, layer=None): 133 | # with tf.name_scope("Training"): 134 | # Add a scalar summary for the snapshot loss. 135 | self.loss_summaries = tf.scalar_summary(cost.op.name, cost) 136 | 137 | if layer is None: 138 | lr = FLAGS.supervised_learning_rate 139 | else: 140 | lr = self.get_layers[layer]._l_rate 141 | 142 | # Create the gradient descent optimizer with the given learning rate. 143 | optimizer = tf.train.GradientDescentOptimizer(lr) 144 | 145 | # Create a variable to track the global step. 
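        # (The step counter is passed to optimizer.minimize() below, which
        # increments it once per completed training step.)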
146 | global_step = tf.Variable(0, trainable=False, name='global_step') 147 | 148 | # Use the optimizer to apply the gradients that minimize the loss 149 | # (and also increment the global step counter) as a single training step. 150 | train_op = optimizer.minimize(cost, global_step=global_step) 151 | return train_op, global_step 152 | 153 | def calc_last_x(self, X, bias_node=False): 154 | tmp = X 155 | for layer in self.get_layers: 156 | if bias_node: 157 | bias_n = tf.ones(shape=[FLAGS.batch_size, 1], dtype=tf.float32) 158 | tmp = tf.concat(1, [bias_n, tmp]) 159 | tmp = layer.clean_activation(x_in=tmp, use_fixed=False) 160 | # print(tmp, self._net_shape[-2], self._net_shape[-1]) 161 | # dae_layer = DAE_Layer(in_data=tmp, prev_layer_size=self._net_shape[-2], 162 | # next_layer_size=self._net_shape[-1], nth_layer=len(self._net_shape)-1, 163 | # last_layer=True) 164 | # 165 | # self._dae_layers.append(dae_layer) 166 | # tmp = self.get_layers[-1].clean_activation(x_in=tmp, use_fixed=False) 167 | 168 | return tmp 169 | 170 | def add_final_layer(self, input_x, bias_node=False): 171 | last_x = self.calc_last_x(input_x, bias_node=bias_node) 172 | print "Last layer added:", last_x.get_shape() 173 | return last_x 174 | 175 | # def finetune_net(self): 176 | # last_output = self._x 177 | # 178 | # for layer in xrange(self.nHLayers + 1): 179 | # w = self.get_layers[layer] 180 | 181 | def genrate_next_dataset(self, from_dataset, layer): 182 | """ Generate next data-set 183 | Note: This function has a meaning only if selfish layers are in use. 184 | It takes as input the data-set and transforms it using the previously 185 | trained layer in order to obtain it's output. The output of that layer 186 | is saved as a data-set to be used as input for the next one. 187 | 188 | Args: 189 | from_dataset: The data-set you want to transform (usually 190 | the one that the previous layer is trained on) 191 | layer : The layer to be used for the data transformation 192 | Returns: 193 | numpy array: The new data-set to be used for the next layer 194 | """ 195 | if self._selfish_layers: 196 | for _ in xrange(from_dataset.num_batches): 197 | feed_dict = fill_feed_dict_dae(from_dataset, self.get_layers[layer]._x) 198 | 199 | y = self.session.run(self.get_layers[layer].clean_activation(), feed_dict=feed_dict) 200 | for j in xrange(np.asarray(y).shape[0]): 201 | self._y_dataset[layer].append(y[j]) 202 | 203 | return np.asarray(self._y_dataset[layer]) 204 | else: 205 | print "Note: This function has a meaning only if selfish layers are in use." 
206 | return None 207 | 208 | def pretrain_sdae(input_x, shape): 209 | with tf.Graph().as_default():# as g: 210 | sess = tf.Session() 211 | 212 | sdae = Stacked_DAE(net_shape=shape, session=sess, selfish_layers=False) 213 | 214 | for layer in sdae.get_layers[:-1]: 215 | with tf.variable_scope("pretrain_{0}".format(layer.which)): 216 | cost = layer.get_loss 217 | train_op, global_step = sdae.train(cost, layer=layer.which) 218 | 219 | summary_dir = pjoin(FLAGS.summary_dir, 'pretraining_{0}'.format(layer.which)) 220 | summary_writer = tf.train.SummaryWriter(summary_dir, graph_def=sess.graph_def, flush_secs=FLAGS.flush_secs) 221 | summary_vars = [layer.get_w_b[0], layer.get_w_b[1]] 222 | 223 | hist_summarries = [tf.histogram_summary(v.op.name, v) for v in summary_vars] 224 | hist_summarries.append(sdae.loss_summaries) 225 | summary_op = tf.merge_summary(hist_summarries) 226 | 227 | ''' 228 | You can get all the trainable variables using tf.trainable_variables(), 229 | and exclude the variables which should be restored from the pretrained model. 230 | Then you can initialize the other variables. 231 | ''' 232 | 233 | layer.vars_to_init.append(global_step) 234 | sess.run(tf.initialize_variables(layer.vars_to_init)) 235 | 236 | print("\n\n") 237 | print "| Layer | Epoch | Step | Loss |" 238 | 239 | for step in xrange(FLAGS.pretraining_epochs * input_x.train.num_examples): 240 | feed_dict = fill_feed_dict_dae(input_x.train, sdae._x) 241 | 242 | loss, _ = sess.run([cost, train_op], feed_dict=feed_dict) 243 | 244 | if step % 1000 == 0: 245 | summary_str = sess.run(summary_op, feed_dict=feed_dict) 246 | summary_writer.add_summary(summary_str, step) 247 | 248 | output = "| Layer {0} | Epoch {1} | {2:>6} | {3:10.4f} |"\ 249 | .format(layer.which, step // input_x.train.num_examples + 1, step, loss) 250 | print output 251 | 252 | # Note: Use this style if you are using the shelfish_layer choice. 253 | # This way you keep the activated data to be fed to the next layer. 254 | # next_dataset = sdae.genrate_next_dataset(from_dataset=input_x.all, layer=layer.which) 255 | # input_x = load_data_sets_pretraining(next_dataset, split_only=False) 256 | 257 | # Save Weights and Biases for all layers 258 | for n in xrange(len(shape) - 2): 259 | w = sdae.get_layers[n].get_w 260 | b = sdae.get_layers[n].get_b 261 | W, B = sess.run([w, b]) 262 | 263 | np.savetxt(pjoin(FLAGS.output_dir, 'Layer_' + str(n) + '_Weights.txt'), np.asarray(W), delimiter='\t') 264 | np.savetxt(pjoin(FLAGS.output_dir, 'Layer_' + str(n) + '_Biases.txt'), np.asarray(B), delimiter='\t') 265 | make_heatmap(W, 'weights_'+ str(n)) 266 | 267 | print "\nPretraining Finished...\n" 268 | return sdae 269 | 270 | 271 | 272 | def finetune_sdae(sdae, input_x, n_classes, label_map): 273 | print "Starting Fine-tuning..." 
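    # Fine-tuning reuses the pre-trained stack: a final layer is added on top
    # (add_final_layer), the whole network is trained on the labelled data
    # with a softmax cross-entropy loss (loss_supervised), and the fine-tuned
    # weights and biases are written out per layer at the end.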
274 | sess = sdae.session 275 | with sess.graph.as_default(): 276 | 277 | n_features = sdae._net_shape[0] 278 | 279 | x_pl = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, n_features), name='input_pl') 280 | labels_pl = tf.placeholder(tf.int32, shape=FLAGS.batch_size, name='labels_pl') 281 | labels = tf.identity(labels_pl) 282 | 283 | # Get the supervised fine tuning net 284 | logits = sdae.add_final_layer(x_pl, bias_node=FLAGS.bias_node) 285 | # logits = sdae.finetune_net(input_x) 286 | loss = loss_supervised(logits, labels_pl, n_classes) 287 | 288 | train_op, _ = sdae.train(loss) 289 | eval_correct, corr, y_pred = evaluation(logits, labels_pl) 290 | 291 | hist_summaries = [layer.get_w for layer in sdae.get_layers] 292 | hist_summaries.extend([layer.get_b for layer in sdae.get_layers]) 293 | 294 | hist_summaries = [tf.histogram_summary(v.op.name + "_fine_tuning", v) for v in hist_summaries] 295 | 296 | summary_op = tf.merge_summary(hist_summaries) 297 | 298 | summary_writer = tf.train.SummaryWriter(pjoin(FLAGS.summary_dir, 'fine_tuning'), 299 | graph_def=sess.graph_def, 300 | flush_secs=FLAGS.flush_secs) 301 | 302 | sess.run(tf.initialize_all_variables()) 303 | 304 | steps = FLAGS.finetuning_epochs * input_x.train.num_examples 305 | for step in xrange(steps): 306 | start_time = time.time() 307 | 308 | feed_dict = fill_feed_dict(input_x.train, x_pl, labels_pl) 309 | 310 | _, loss_value, ev_corr, c, y_true = sess.run([train_op, loss, eval_correct, corr, labels], feed_dict=feed_dict) 311 | 312 | duration = time.time() - start_time 313 | 314 | # Write the summaries and print an overview fairly often. 315 | if step % 1000 == 0: 316 | # Print status to stdout. 317 | print "\nLoss: ", loss_value 318 | # print "Eval corr:", ev_corr 319 | # print "Correct:", c 320 | # print "Y_pred:", y_pred 321 | # print "Label_pred:", y_true 322 | 323 | # y_true = np.argmax(labels_pl, 0) 324 | 325 | 326 | print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)) 327 | 328 | print 'Evaluation Sum:', ev_corr, '/', len(c) 329 | # print('Evaluation Corrects:', eval_corr) 330 | # print('Logits:', lgts) 331 | print "---------------" 332 | 333 | # Update the events file. 
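                # (Event files go to <summary_dir>/fine_tuning and can be
                # inspected with TensorBoard while training runs.)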
334 | summary_str = sess.run(summary_op, feed_dict=feed_dict) 335 | summary_writer.add_summary(summary_str, step) 336 | 337 | if (step + 1) % 1000 == 0 or (step + 1) == steps: 338 | train_sum = do_eval_summary("training_error", 339 | sess, 340 | eval_correct, 341 | x_pl, 342 | labels_pl, 343 | input_x.train) 344 | 345 | if input_x.validation is not None: 346 | val_sum = do_eval_summary("validation_error", 347 | sess, 348 | eval_correct, 349 | x_pl, 350 | labels_pl, 351 | input_x.validation) 352 | 353 | test_sum = do_eval_summary("test_error", 354 | sess, 355 | eval_correct, 356 | x_pl, 357 | labels_pl, 358 | input_x.test) 359 | 360 | summary_writer.add_summary(train_sum, step) 361 | if input_x.validation is not None: 362 | summary_writer.add_summary(val_sum, step) 363 | summary_writer.add_summary(test_sum, step) 364 | 365 | for n in xrange(len(sdae._net_shape) - 1): 366 | w = sdae.get_layers[n].get_w 367 | b = sdae.get_layers[n].get_b 368 | W, B = sess.run([w, b]) 369 | 370 | np.savetxt(pjoin(FLAGS.output_dir, 'Finetuned_Layer_' + str(n) + '_Weights.txt'), np.asarray(W), delimiter='\t') 371 | np.savetxt(pjoin(FLAGS.output_dir, 'Finetuned_Layer_' + str(n) + '_Biases.txt'), np.asarray(B), delimiter='\t') 372 | make_heatmap(W, 'Finetuned_weights_'+ str(n)) 373 | 374 | do_eval(sess, eval_correct, y_pred, x_pl, labels_pl, label_map, input_x.train, title='Final_Train') 375 | do_eval(sess, eval_correct, y_pred, x_pl, labels_pl, label_map, input_x.test, title='Final_Test') 376 | if input_x.validation is not None: 377 | do_eval(sess, eval_correct, y_pred, x_pl, labels_pl, label_map, input_x.validation, title='Final_Validation') 378 | 379 | print "Fine-tuning Finished..." 380 | return sdae 381 | 382 | 383 | def loss_supervised(logits, labels, num_classes): 384 | """Calculates the loss from the logits and the labels. 385 | 386 | Args: 387 | logits: Logits tensor, float - [batch_size, NUM_CLASSES]. 388 | labels: Labels tensor, int32 - [batch_size]. 389 | 390 | Returns: 391 | loss: Loss tensor of type float. 392 | """ 393 | 394 | # Convert from sparse integer labels in the range [0, NUM_CLASSSES) 395 | # to 1-hot dense float vectors (that is we will have batch_size vectors, 396 | # each with NUM_CLASSES values, all of which are 0.0 except there will 397 | # be a 1.0 in the entry corresponding to the label). 
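    # For example, with num_classes = 3 and labels = [0, 2, 1] the dense target
    # built below is [[1,0,0], [0,0,1], [0,1,0]]. A NumPy equivalent of the same
    # one-hot construction, for illustration only (`labels_list` is assumed to
    # be a plain Python list of int labels and is not part of this graph):
    '''
    onehot = np.zeros((len(labels_list), num_classes))
    onehot[np.arange(len(labels_list)), labels_list] = 1.0
    '''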
398 | batch_size = tf.size(labels) 399 | labels = tf.expand_dims(labels, 1) 400 | 401 | indices = tf.expand_dims(tf.range(0, batch_size), 1) 402 | concated = tf.concat(1, [indices, labels]) 403 | onehot_labels = tf.sparse_to_dense(concated, tf.pack([batch_size, num_classes]), 1.0, 0.0) 404 | 405 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, onehot_labels, name='xentropy') 406 | 407 | loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') 408 | return loss 409 | -------------------------------------------------------------------------------- /Train_SDAE/test_dae.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import os 4 | import shutil 5 | import sys 6 | import time 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from os.path import join as pjoin 11 | from tools.config import FLAGS, home_out 12 | from tools.dae import DAE_Layer 13 | from tools.start_tensorboard import start 14 | from tools.data_handler import load_data 15 | from tools.utils import fill_feed_dict_dae 16 | from tools.utils import load_data_sets_pretraining 17 | from tools.utils import normalize_data, label_metadata 18 | from tools.visualize import hist_comparison 19 | 20 | _data_dir = FLAGS.data_dir 21 | _output_dir = FLAGS.output_dir 22 | _summary_dir = FLAGS.summary_dir 23 | _chkpt_dir = FLAGS.chkpt_dir 24 | 25 | def _check_and_clean_dir(d): 26 | if os.path.exists(d): 27 | shutil.rmtree(d) 28 | os.mkdir(d) 29 | 30 | 31 | def main(): 32 | home = home_out('') 33 | if not os.path.exists(home): 34 | os.makedirs(home) 35 | if not os.path.exists(_data_dir): 36 | os.mkdir(_data_dir) 37 | # os.makedirs also an option 38 | 39 | 40 | if not os.path.exists(_output_dir): 41 | os.mkdir(_output_dir) 42 | elif os.listdir(_output_dir): 43 | var = raw_input("Output folder is not empty. Clean it? (This will delete every file in it.) y/N: ") 44 | if var == 'y' or var == 'Y' or var == '1': 45 | _check_and_clean_dir(_output_dir) 46 | else: 47 | exit("Exiting... Please save your former output data and restart SDAE.") 48 | else: 49 | _check_and_clean_dir(_output_dir) 50 | 51 | _check_and_clean_dir(_summary_dir) 52 | _check_and_clean_dir(_chkpt_dir) 53 | 54 | os.mkdir(os.path.join(_chkpt_dir, '1')) 55 | os.mkdir(os.path.join(_chkpt_dir, '2')) 56 | os.mkdir(os.path.join(_chkpt_dir, '3')) 57 | os.mkdir(os.path.join(_chkpt_dir, 'fine_tuning')) 58 | 59 | start() 60 | 61 | start_time = time.time() 62 | datafile = load_data('RPKM', transpose=False) 63 | labelfile = load_data('Labels') 64 | print("Data Loaded. Duration:", time.time() - start_time) 65 | 66 | # Data Normalization 67 | datafile_norm = normalize_data(datafile, transpose=False) 68 | 69 | # Get data-sets (train, test) in a proper way 70 | data = load_data_sets_pretraining(datafile_norm, split_only=False) 71 | 72 | # Get Label Metadata 73 | mapped_labels, label_map = label_metadata(label_matrix=labelfile, label_col=7) 74 | num_classes = label_map.shape[0] 75 | 76 | nHLay = FLAGS.num_hidden_layers 77 | nHUnits = [getattr(FLAGS, "hidden{0}_units".format(j + 1)) for j in xrange(nHLay)] 78 | 79 | # Get the number of existed features (e.g. 
genes) in the data-set 80 | num_features = datafile_norm.shape[1] 81 | # Create the shape of the AutoEncoder 82 | sdae_shape = [num_features] + nHUnits + [num_classes] 83 | 84 | with tf.Graph().as_default() as g: 85 | sess = tf.Session() 86 | 87 | y_all = {} 88 | for layer in xrange(3): 89 | y_all[layer] = [] 90 | 91 | if layer == 0: 92 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, num_features), name='dae_input_from_layer_{0}'.format(layer)) 93 | dae = DAE_Layer(in_data=x, prev_layer_size=num_features, next_layer_size=FLAGS.hidden1_units, nth_layer=layer+1, last_layer=False) 94 | elif layer == 1: 95 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, FLAGS.hidden1_units), name='dae_input_from_layer_{0}'.format(layer)) 96 | dae = DAE_Layer(in_data=x, prev_layer_size=FLAGS.hidden1_units, next_layer_size=FLAGS.hidden2_units, nth_layer=layer+1, last_layer=False) 97 | else: 98 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, FLAGS.hidden2_units), name='dae_input_from_layer_{0}'.format(layer)) 99 | dae = DAE_Layer(in_data=x, prev_layer_size=FLAGS.hidden2_units, next_layer_size=num_classes, nth_layer=layer+1, last_layer=False)# or True 100 | 101 | cost = dae.get_loss 102 | 103 | with tf.variable_scope("pretrain_{0}".format(layer+1)): 104 | train_op, global_step, loss_summaries = train(cost) 105 | 106 | summary_dir = pjoin(FLAGS.summary_dir, 'pretraining_{0}'.format(layer+1)) 107 | summary_writer = tf.train.SummaryWriter(summary_dir, graph_def=sess.graph_def, flush_secs=FLAGS.flush_secs) 108 | summary_vars = [dae.get_w_b[0], dae.get_w_b[1]] 109 | 110 | hist_summarries = [tf.histogram_summary(v.op.name, v) for v in summary_vars] 111 | hist_summarries.append(loss_summaries) 112 | summary_op = tf.merge_summary(hist_summarries) 113 | 114 | dae.vars_to_init.append(global_step) 115 | sess.run(tf.initialize_variables(dae.vars_to_init)) 116 | 117 | print "| Layer | Epoch | Cost | Step |" 118 | print data.train.num_examples 119 | print data.all.num_examples 120 | 121 | for step in xrange(FLAGS.pretraining_epochs):# * data.train.num_examples): 122 | # for i in xrange(data.train.num_examples): 123 | 124 | feed_dict = fill_feed_dict_dae(data.train, x) 125 | 126 | # if layer == 0: 127 | c, _, y, z, w, b_in, b_out = sess.run([cost, train_op, dae.get_representation_y, dae.get_reconstruction_z, dae.get_w_b[0], dae.get_w_b[1], dae.get_b_recon], feed_dict=feed_dict) 128 | # else: 129 | # c, _, gs, y, z, w, b_in, b_out = sess.run([cost, train_op, g_step, dae.get_representation_y, dae.get_reconstruction_z, dae.get_w_and_biases[0], dae.get_w_and_biases[1], dae.get_w_and_biases[2]], feed_dict=fill_feed_dict_dae(data.train, x)) 130 | 131 | if step % 1 == 0: 132 | print '| ', layer+1, ' | ', step // data.train.num_examples + 1, ' | ', c, ' | ', step, ' |' 133 | summary_str = sess.run(summary_op, feed_dict=feed_dict) 134 | summary_writer.add_summary(summary_str, step) 135 | 136 | # 137 | # print np.asarray(y_all).shape 138 | # print np.asarray(y_all[layer]).shape 139 | 140 | for _ in xrange(data.all.num_batches): 141 | feed_dict = fill_feed_dict_dae(data.all, x) 142 | 143 | y = sess.run(dae.get_representation_y, feed_dict=feed_dict) 144 | for j in xrange(np.asarray(y).shape[0]): 145 | y_all[layer].append(y[j]) 146 | 147 | print np.asarray(y_all[layer]).shape 148 | data = load_data_sets_pretraining(np.asarray(y_all[layer]), split_only=False) 149 | 150 | print "Finished..." 
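# Usage sketch for the train() helper defined below: `layer` is the DAE_Layer
# whose _l_rate supplies the per-layer pretraining learning rate, while None
# falls back to FLAGS.supervised_learning_rate. Kept inside a string so it
# stays inert; `dae` and `cost` refer to the objects built in main() above.
'''
with tf.variable_scope("pretrain_example"):
    train_op, global_step, loss_summaries = train(dae, cost)
    sess.run(tf.initialize_variables([global_step]))
'''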
151 | 152 | def train(layer, cost): 153 | # with tf.name_scope("Training"): 154 | # Add a scalar summary for the snapshot loss. 155 | loss_summaries = tf.scalar_summary(cost.op.name, cost) 156 | 157 | if layer is None: 158 | lr = FLAGS.supervised_learning_rate 159 | else: 160 | lr = layer._l_rate 161 | 162 | # Create the gradient descent optimizer with the given learning rate. 163 | optimizer = tf.train.GradientDescentOptimizer(lr) 164 | 165 | # Create a variable to track the global step. 166 | global_step = tf.Variable(0, trainable=False, name='global_step') 167 | 168 | # Use the optimizer to apply the gradients that minimize the loss 169 | # (and also increment the global step counter) as a single training step. 170 | train_op = optimizer.minimize(cost, global_step=global_step) 171 | return train_op, global_step, loss_summaries 172 | 173 | if __name__ == '__main__': 174 | main() 175 | 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /Train_SDAE/tools/ADASYN.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import NearestNeighbors 2 | from random import choice 3 | 4 | ''' 5 | Created on 14-jun.-2013 6 | @author: Olivier.Janssens 7 | ''' 8 | 9 | ''' 10 | Modified on 24-March-2016 11 | @author: Anastasios Glaros 12 | ''' 13 | 14 | import numpy as np 15 | import random 16 | 17 | class Adasyn(object): 18 | def __init__(self, data, labels, classes, K=5, beta=1): 19 | self.X = data 20 | self.K = K 21 | self.beta = beta 22 | self.new_X, self.new_y = [], [] 23 | self.d, self.G = [], [] 24 | 25 | try: 26 | assert not isinstance(classes, list) 27 | self.classes = classes.tolist() 28 | except AssertionError as e: 29 | self.classs = classes 30 | 31 | try: 32 | assert not isinstance(labels, list) 33 | self.y = labels.tolist() 34 | except AssertionError as e: 35 | self.y = labels 36 | 37 | temp = [] 38 | for i in xrange(len(self.classes)): 39 | temp.append(len(all_indices(i, self.y))) 40 | 41 | self.majority_class = self.classes[temp.index(max(temp))] #np.where(np.asarray(temp)==max(temp))[0][0]] 42 | 43 | 44 | def balance_all(self): 45 | classes = np.copy(self.classes).tolist() 46 | classes.remove(self.majority_class) 47 | 48 | print "Classes:", classes 49 | 50 | # Loop for all the classes except the majority 51 | for class_i in classes: 52 | print "\nFor class: ", class_i 53 | ms, ml = self.get_class_count(self.X, self.y, class_i, self.majority_class) 54 | 55 | d = self.get_d(self.X, self.y, ms, ml) 56 | G = self.get_G(self.X, self.y, ms, ml, self.beta) 57 | 58 | rlist = self.get_Ris(self.X, self.y, class_i, self.K) 59 | # print("ms, ml, d, G, len(rlist): ", ms, ml, d, G, len(rlist)) 60 | 61 | new_X, new_y = self.generate_samples(rlist, self.X, self.y, G, class_i, self.K) 62 | print "Length of original_X, new_X:", ms, len(new_X) 63 | # print("shape of new_X, new_y:", new_X.shape, new_y.shape) 64 | self.new_X.append(new_X) 65 | self.new_y.append(new_y) 66 | 67 | return self.join_all_together() 68 | # X, y = self.join_with_the_rest(self.X, self.y, newX, newy, self.classes, class_i) 69 | 70 | def save_data(self, data_filename, label_filename): 71 | from tools.utils import write_csv 72 | import csv 73 | print(type(self.new_X), "saving...") 74 | with open(data_filename, "wb") as f: 75 | writer = csv.writer(f, delimiter='\t') 76 | writer.writerows(self.new_X) 77 | 78 | print("Saved.") 79 | # write_csv(data_filename, self.new_X) 80 | del(self.new_X) 81 | 82 | # with 
open(label_filename, "wb") as f: 83 | # writer = csv.writer(f, delimiter='\t') 84 | # writer.writerows(self.new_y) 85 | write_csv(label_filename, self.new_y) 86 | del(self.new_y) 87 | 88 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 89 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 90 | # @return ms: The amount of samples in the minority group 91 | # @return ms: The amount of samples in the majority group 92 | def get_class_count(self, X, y, minorityclass, majorityclass): 93 | indicesZero = all_indices(minorityclass, y) 94 | indicesOne = all_indices(majorityclass, y) 95 | 96 | if len(indicesZero) > len(indicesOne): 97 | ms = len(indicesOne) 98 | ml = len(indicesZero) 99 | else: 100 | ms = len(indicesZero) 101 | ml = len(indicesOne) 102 | return ms,ml 103 | 104 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 105 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 106 | # @param ms: The amount of samples in the minority group 107 | # @param ms: The amount of samples in the majority group 108 | # @return: The ratio between the minority and majority group 109 | def get_d(self, X,y,ms,ml): 110 | 111 | return float(ms)/float(ml) 112 | 113 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 114 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 115 | # @param ms: The amount of samples in the minority group 116 | # @param ms: The amount of samples in the majority group 117 | # @return: the G value, which indicates how many samples should be generated in total, this can be tuned with beta 118 | def get_G(self, X,y,ms,ml,beta): 119 | return (ml-ms)*beta 120 | 121 | 122 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 123 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 124 | # @param: minorityclass: The minority class 125 | # @param: K: The amount of neighbours for Knn 126 | # @return: rlist: List of r values 127 | def get_Ris(self, X,y, minorityclass=0, K=5): 128 | indicesMinority = all_indices(minorityclass,y) 129 | ymin = np.array(y)[indicesMinority] 130 | Xmin = np.array(X)[indicesMinority] 131 | neigh = NearestNeighbors(n_neighbors=30,algorithm = 'ball_tree') 132 | neigh.fit(X) 133 | 134 | # print "Shapes:", Xmin[0].shape, Xmin[0].reshape(1,-1).shape 135 | 136 | rlist = [0]*len(ymin) 137 | normalizedrlist = [0]*len(ymin) 138 | 139 | classes = np.copy(self.classes).tolist() 140 | classes.remove(minorityclass) 141 | 142 | for i in xrange(len(ymin)): 143 | indices = neigh.kneighbors(Xmin[i].reshape(1,-1), K, False) 144 | 145 | #print ">", len(all_indices_multi(classes, np.array(y)[indices].tolist()[0])) 146 | rlist[i] = float(len(all_indices_multi(classes, np.array(y)[indices].tolist()[0]))) / K 147 | 148 | 149 | normConst = sum(rlist) 150 | 151 | try: 152 | for j in xrange(len(rlist)): 153 | normalizedrlist[j] = (rlist[j]/normConst) 154 | except ZeroDivisionError as e: 155 | normalizedrlist = rlist 156 | print(rlist) 157 | 158 | return normalizedrlist 159 | 160 | 161 | # @param: rlist: List of r values 162 | # @param: X The datapoints e.g.: [f1, f2, ... 
,fn] 163 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 164 | # @return: the G value, which indicates how many samples should be generated in total, this can be tuned with beta 165 | # @param: minorityclass: The minority class 166 | # @param: K: The amount of neighbours for Knn 167 | # @return: The synthetic data samples 168 | def generate_samples(self, rlist,X,y,G,minorityclasslabel,K): 169 | syntheticdata = [] 170 | 171 | indicesMinority = all_indices(minorityclasslabel,y) 172 | ymin = np.array(y)[indicesMinority] 173 | Xmin = np.array(X)[indicesMinority] 174 | 175 | # print "Xmin shape: ", Xmin.shape, ", len of ymin:", len(ymin) 176 | 177 | neigh = NearestNeighbors(n_neighbors=30,algorithm = 'ball_tree') 178 | neigh.fit(Xmin) 179 | gsum=0 180 | for k in xrange(len(ymin)): 181 | g = int(np.round(rlist[k]*G)) 182 | #print g, "= int round ", rlist[k], "*", G 183 | gsum += g 184 | for l in xrange(g): 185 | ind = random.choice(neigh.kneighbors(Xmin[k].reshape(1,-1),K,False)[0]) 186 | s = Xmin[k] + (Xmin[ind]-Xmin[k]) * random.random() 187 | syntheticdata.append(s) 188 | 189 | # print "synthetic shape: ", np.asarray(syntheticdata).shape, ", gsum:", gsum 190 | 191 | try: 192 | new_data = np.concatenate((syntheticdata, Xmin),axis=0) 193 | new_y = [minorityclasslabel] * len(new_data) 194 | except ValueError as e: 195 | new_data = Xmin 196 | new_y = ymin 197 | 198 | return new_data, new_y 199 | 200 | def join_all_together(self): 201 | X_all, y_all = [], [] 202 | classes = np.copy(self.classes).tolist() 203 | classes.remove(self.majority_class) 204 | print "\nJoining Original and Synthetic datasets..." 205 | # Loop for all classes except 1 (the majority class) 206 | for i, class_i in zip(xrange(len(self.classes) - 1), classes): 207 | classes_no_minor = np.copy(self.classes).tolist() 208 | classes_no_minor.remove(class_i) 209 | # print i, class_i, classes_no_minor 210 | 211 | if i == 0: 212 | indicesMajority = all_indices_multi(classes_no_minor, self.y) 213 | ymaj = np.array(self.y)[indicesMajority] 214 | Xmaj = np.array(self.X)[indicesMajority] 215 | # print "Indices_Majority:", len(indicesMajority), "len ymaj:", len(ymaj), "len Xmaj:", len(Xmaj), "len self.new_X:", len(self.new_X) 216 | 217 | # X_all = np.concatenate((Xmaj, self.new_X[i]), axis=0) 218 | # y_all = np.concatenate((ymaj, self.new_y[i]), axis=0) 219 | else: 220 | indicesMajority = all_indices_multi(classes_no_minor, y_all.tolist()) 221 | ymaj = y_all[indicesMajority] 222 | Xmaj = X_all[indicesMajority] 223 | # print "Indices_Majority:", len(indicesMajority), "len ymaj:", len(ymaj), "len Xmaj:", len(Xmaj), "len self.new_X:", len(self.new_X) 224 | 225 | # X_all = np.concatenate((X_all, np.concatenate((Xmaj, self.new_X[i]), axis=0)), axis=0) 226 | # y_all = np.concatenate((y_all, np.concatenate((ymaj, self.new_y[i]), axis=0)), axis=0) 227 | 228 | X_all = np.concatenate((Xmaj, self.new_X[i]), axis=0) 229 | y_all = np.concatenate((ymaj, self.new_y[i]), axis=0) 230 | print "Joined. 
Length of X_all and y_all:", len(X_all), len(y_all) 231 | 232 | return X_all, y_all 233 | 234 | def join_with_the_rest(self, X,y,newData,newy,classes, minorityclass): 235 | classes.remove(minorityclass) 236 | indicesMajority = all_indices_multi(classes, y) 237 | ymaj = np.array(y)[indicesMajority] 238 | Xmaj = np.array(X)[indicesMajority] 239 | 240 | return np.concatenate((Xmaj,newData),axis=0), np.concatenate((ymaj,newy),axis=0) 241 | 242 | def joinwithmajorityClass(self, X,y,newData,newy,majorityclasslabel): 243 | indicesMajority = all_indices(majorityclasslabel,y) 244 | ymaj = np.array(y)[indicesMajority] 245 | Xmaj = np.array(X)[indicesMajority] 246 | 247 | return np.concatenate((Xmaj,newData),axis=0),np.concatenate((ymaj,newy),axis=0) 248 | 249 | # @param value: The classlabel 250 | # @param qlist: The list in which to search 251 | # @return: the indices of the values that are equal to the classlabel 252 | def all_indices(value, qlist): 253 | indices = [] 254 | idx = -1 255 | while True: 256 | try: 257 | idx = qlist.index(value, idx+1) 258 | indices.append(idx) 259 | except ValueError: 260 | break 261 | return indices 262 | 263 | 264 | # @param values: The classlabels except the minority's class 265 | # @param qlist: The list in which to search 266 | # @return: the indices of the values that are equal to the classlabel 267 | def all_indices_multi(values, qlist): 268 | indices = [] 269 | for i in xrange(len(values)): 270 | idx = -1 271 | flag = True 272 | while flag: 273 | try: 274 | idx = qlist.index(values[i], idx+1) 275 | indices.append(idx) 276 | except ValueError: 277 | flag = False 278 | return indices 279 | 280 | -------------------------------------------------------------------------------- /Train_SDAE/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glrs/StackedDAE/c21e851dc13e11f201ce7289e854c05956637986/Train_SDAE/tools/__init__.py -------------------------------------------------------------------------------- /Train_SDAE/tools/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import os 3 | from os.path import join as pjoin 4 | 5 | import sys 6 | 7 | import tensorflow as tf 8 | 9 | WEB_OUT = '/var/www/html/' 10 | 11 | def home_out(path): 12 | return pjoin(os.environ['HOME'], 'tmp_StackedDAE', 'Allan', path) 13 | 14 | def web_out(path): 15 | # Just a quick manual flag for changes between local and remote VMs 16 | if False: 17 | return pjoin(WEB_OUT, 'StackedDAE', path) 18 | else: 19 | return home_out(path) 20 | 21 | 22 | flags = tf.app.flags 23 | FLAGS = flags.FLAGS 24 | 25 | # Data Management 26 | flags.DEFINE_string('dataset', 'Linarsson', 'Choose which dataset you want to use') 27 | flags.DEFINE_boolean('use_balanced', False, 'Use balanced data or not. 
If not existed they will be created.') 28 | 29 | # Autoencoder Architecture Specific Flags 30 | flags.DEFINE_boolean('bias_node', False, 'Whether to use or not a bias node in the network') 31 | 32 | flags.DEFINE_integer('num_hidden_layers', 3, 'Number of hidden layers') 33 | 34 | flags.DEFINE_integer('hidden1_units', 50, 'Number of units in hidden layer 1.') # 2000 35 | flags.DEFINE_integer('hidden2_units', 25, 'Number of units in hidden layer 2.') 36 | flags.DEFINE_integer('hidden3_units', 15, 'Number of units in hidden layer 3.') 37 | 38 | # flags.DEFINE_integer('example_features', EXAMPLE_FEATURES, 'Total number of features (genes)') # image_pixels 39 | # flags.DEFINE_integer('num_classes', 10, 'Number of classes') 40 | 41 | flags.DEFINE_float('unsupervised_learning_rate', 0.0001, 'Unsupervised initial learning rate.') 42 | flags.DEFINE_float('supervised_learning_rate', 0.01, 'Supervised initial learning rate.') 43 | 44 | flags.DEFINE_float('pre_layer1_learning_rate', 0.0001, 'Initial learning rate.') 45 | flags.DEFINE_float('pre_layer2_learning_rate', 0.0001, 'Initial learning rate.') 46 | flags.DEFINE_float('pre_layer3_learning_rate', 0.0001, 'Initial learning rate.') 47 | 48 | flags.DEFINE_boolean('emphasis', False, 'Whether to use Emphasis or Not') 49 | flags.DEFINE_string('emphasis_type', 'Double', 'Type of Emphasis for the Cross Entropy. [Double, Full]') 50 | 51 | flags.DEFINE_float('default_noise', [0.0, 'MN'], 'Default Noise ratio and type to apply on the data') 52 | 53 | flags.DEFINE_float('noise_1', [0.10, 'TFDO'], 'Noise ratio to apply on the data, and the type of noise') 54 | flags.DEFINE_float('noise_2', [0.10, 'TFDO'], 'Noise ratio to apply on the data, and the type of noise') 55 | flags.DEFINE_float('noise_3', [0.10, 'TFDO'], 'Noise ratio to apply on the data, and the type of noise') 56 | 57 | """ TODO: ADD a flag for activation function (sigmoid, tanh, etc.) """ 58 | 59 | # Constants 60 | # flags.DEFINE_integer('seed', 1234, 'Random seed') 61 | 62 | flags.DEFINE_integer('batch_size', 9, 'Batch size. 
Must divide evenly into the dataset sizes.') # 100 63 | 64 | flags.DEFINE_integer('pretraining_epochs', 50, 'Number of training epochs for pretraining layers') # 60 65 | flags.DEFINE_integer('finetuning_epochs', 50, 'Number of training epochs for fine tuning supervised step') 66 | 67 | flags.DEFINE_float('zero_bound', 1.0e-9, 'Value to use as buffer to avoid numerical issues at 0') 68 | flags.DEFINE_float('one_bound', 1.0 - 1.0e-9, 'Value to use as buffer to avoid numerical issues at 1') 69 | 70 | flags.DEFINE_float('flush_secs', 120, 'Number of seconds to flush summaries') # 120 71 | 72 | # Directories 73 | flags.DEFINE_string('data_dir', home_out('data'), 'Directory to put the training data.') 74 | 75 | flags.DEFINE_string('output_dir', web_out('output'), 'Directory to put the output data.') 76 | 77 | flags.DEFINE_string('summary_dir', home_out('summaries'), 'Directory to put the summary data') 78 | 79 | flags.DEFINE_string('chkpt_dir', home_out('chkpts'), 'Directory to put the model checkpoints') 80 | 81 | # TensorBoard 82 | # flags.DEFINE_boolean('no_browser', True, 'Whether to start browser for TensorBoard') 83 | 84 | # Python 85 | flags.DEFINE_string('python', sys.executable, 'Path to python executable') 86 | 87 | 88 | -------------------------------------------------------------------------------- /Train_SDAE/tools/data_handler.py: -------------------------------------------------------------------------------- 1 | """ Data Handler for Allan's Data-set """ 2 | 3 | import pandas as pd 4 | import os 5 | import gzip 6 | import numpy as np 7 | 8 | from os.path import join as pjoin 9 | from config import FLAGS 10 | 11 | # TODO: Use Dictionary instead! 12 | TPM = {'filtered':'TPM_common_ready_data.csv', 'ordered':'TPM_ready_data.csv', 'original':'GSE71585_RefSeq_TPM.csv', 'zipped':'GSE71585_RefSeq_TPM.csv.gz'} 13 | RPKM = {'ordered':'RPKM_ready_data.csv', 'original':'GSE71585_RefSeq_RPKM.csv', 'zipped':'GSE71585_RefSeq_RPKM.csv.gz'} 14 | COUNTS = {'ordered':'Counts_ready_data.csv', 'original':'GSE71585_RefSeq_counts.csv', 'zipped':'GSE71585_RefSeq_counts.csv.gz'} 15 | LABELS = {'ordered':'Labels_inOrder.csv', 'original':'GSE71585_Clustering_Results.csv', 'zipped':'GSE71585_Clustering_Results.csv.gz'} 16 | 17 | # TPM = ['TPM_ready_data.csv', 'GSE71585_RefSeq_TPM.csv', 'GSE71585_RefSeq_TPM.csv.gz'] 18 | # LABELS = ['Labels_inOrder.csv', 'GSE71585_Clustering_Results.csv', 'GSE71585_Clustering_Results.csv.gz'] 19 | #'counts_ordered_nonzero_zeroone.tsv', 20 | #'metadata_ordered_subset.tsv', 21 | 22 | LINARSSON = {'filtered':'Linarsson_common_data.txt', 'normal':'expression_mRNA_17-Aug-2014.txt'} 23 | 24 | def extract_data(in_f, out_f): 25 | print("Extracting", in_f) 26 | in_file = gzip.open(in_f, 'rb') 27 | out_file = open(out_f, 'wb') 28 | out_file.write(in_file.read()) 29 | in_file.close() 30 | out_file.close() 31 | 32 | 33 | def order_labels(data_in, label_in, data_out=None, label_out=None, sep=','): 34 | print("Ordering Data with Labels...") 35 | 36 | labels = pd.read_csv(label_in, index_col=0) 37 | data = pd.read_csv(data_in, index_col=0, sep=sep) 38 | 39 | common_labels = labels.index.intersection(data.columns) 40 | # common_labels2 = data.columns.intersection(labels.index) 41 | 42 | # data_nonzero = data.loc[(data > 0).any(axis=1)].dropna() 43 | data_nonzero = data[(data.sum(axis=1) > 0)].dropna() 44 | data_nonzero = data_nonzero[common_labels] 45 | 46 | """ Better here with non_zero than above? 
""" 47 | common_labels2 = data_nonzero.columns.intersection(labels.index) 48 | label_sub = labels.loc[common_labels2] 49 | label_sub.index.names = labels.index.names 50 | 51 | label_sub_sort = label_sub.sort_index(0) 52 | data_sub_sort = data_nonzero.reindex_axis(sorted(data_nonzero.columns), axis=1) 53 | 54 | # Check that it worked 55 | assert(data_sub_sort.columns == label_sub_sort.index).all() 56 | 57 | if data_out is not None and label_out is not None: 58 | data_sub_sort.to_csv(data_out, sep="\t") 59 | label_sub_sort.to_csv(label_out, sep="\t") 60 | 61 | return data_sub_sort, label_sub_sort 62 | 63 | 64 | def label_metadata(label_matrix, label_col): 65 | # Check whether the column value is given as index (number) or name (string) 66 | try: 67 | label_col = int(label_col) 68 | 69 | # If given as number, take the name of the column out of it 70 | label_col = label_matrix.columns[label_col] 71 | except ValueError: 72 | pass 73 | 74 | # Get the unique classes in the given column, and how many of them are there 75 | unique_classes = pd.unique(label_matrix[label_col].ravel()) 76 | 77 | # Map the unique n classes with a number from 0 to n 78 | label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))}) 79 | 80 | # Replace the given column values with the mapped equivalent 81 | mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist()) 82 | # print("label_matrix", label_matrix) 83 | # print("mapped_labels", mapped_labels) 84 | 85 | # Return the mapped labels as ndarray and the label map (unique classes and number can be obtained from map) 86 | # np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],)) 87 | # Return the mapped labels as DataFrame and the label map (unique classes and number can be obtained from map) 88 | return mapped_labels[[label_col]], np.asarray(label_map) #, unique_classes, num_classes 89 | 90 | 91 | def sort_labels(data_in): 92 | d = pd.read_csv(data_in, sep='\t', index_col=0) 93 | return d.sort_index(0) 94 | 95 | 96 | def load_linarsson_data(d_type, transpose=False): 97 | print("Counts file is loading...") 98 | if d_type == 'filtered': 99 | data = LINARSSON['filtered'] 100 | else: 101 | data = LINARSSON['normal'] 102 | 103 | # data = pd.read_csv(pjoin(FLAGS.data_dir, 'expression_mRNA_17-Aug-2014.txt'), skiprows=[0,1,2,3,4,5,6,8,9,10], header=0, sep='\t', index_col=0) 104 | data = pd.read_csv(pjoin(FLAGS.data_dir, data), skiprows=[0,1,2,3,4,5,6,8,9,10], header=0, sep='\t', index_col=0) 105 | data.drop(data.columns[0], axis=1,inplace=True) 106 | 107 | if transpose: 108 | data = data.transpose() 109 | 110 | return np.array(data) 111 | 112 | def load_linarsson_labels(sub_labels=False): 113 | print("Label file is loading...") 114 | rows_to_skip = [0,1,2,3,4,5,6,8] if sub_labels else 7 115 | labels = pd.read_csv(pjoin(FLAGS.data_dir, LINARSSON['normal']), skiprows=rows_to_skip, nrows=2, header=None, sep='\t', index_col=False) 116 | # sub_labels = pd.read_csv(pjoin(FLAGS.data_dir, "expression_mRNA_17-Aug-2014.txt"), skiprows=[0,1,2,3,4,5,6,8], nrows=1, sep='\t', index_col=1) 117 | 118 | labels = labels.transpose() 119 | labels.columns= labels.iloc[1] 120 | labels.drop(labels.index[[0, 1]], inplace=True) 121 | labels.set_index(labels.columns.values[0], inplace=True) 122 | 123 | return labels, label_metadata(label_matrix=labels, label_col=0) 124 | 125 | def load_data(dataset=None, d_type=None, label_col=None, transpose=None):#, sub_labels=False): 126 | if dataset == 'Linarsson': 127 | 
data = load_linarsson_data(d_type, transpose=transpose) 128 | 129 | if label_col == 1: 130 | sub_labels = False 131 | elif label_col == 2: 132 | sub_labels = True 133 | else: 134 | exit("Error: Options for Linarsson Label columns are 1 or 2.") 135 | 136 | labels, meta = load_linarsson_labels(sub_labels) 137 | return data, labels, meta 138 | elif dataset == 'Allen': 139 | return load_allen(d_type=d_type, label_col=label_col, transpose=transpose) 140 | else: 141 | exit("Usage: load_data(dataset=['Linarsson', 'Allen'],\ 142 | data_type=['filtered', 'TPM', 'RPKM', 'Counts', 'Labels', None],\ 143 | label_col=[int], (optional)transpose=[Boolean (default=None)])") 144 | 145 | # def load_data(d_type=None, label_col=None, transpose=False): 146 | def load_allen(d_type=None, label_col=None, transpose=False): 147 | if d_type == 'TPM': 148 | d = check_and_load(TPM) 149 | print("TPM file is loading...") 150 | elif d_type == 'RPKM': 151 | d = check_and_load(RPKM) 152 | print("RPKM file is loading...") 153 | elif d_type == 'Counts': 154 | d = check_and_load(COUNTS) 155 | print("Counts file is loading...") 156 | elif d_type == 'Labels' or d_type is None and label_col is not None: 157 | d = check_and_load(LABELS) 158 | print("Label file is loading...") 159 | elif d_type == 'filtered': 160 | d = pd.read_csv(pjoin(FLAGS.data_dir, TPM['filtered']), sep='\t', index_col=0) 161 | # d = pd.read_csv(pjoin(FLAGS.data_dir, 'TPM_common_ready_data.csv'), sep='\t', index_col=0) 162 | else: 163 | exit("Usage: load_data(data_type=['filtered', 'TPM', 'RPKM', 'Counts', 'Labels', None],\ 164 | label_col=[int], (optional)transpose=[Boolean (default=None)])") 165 | 166 | 167 | # if not os.path.exists(pjoin(FLAGS.data_dir, data[0])): 168 | # if not os.path.exists(pjoin(FLAGS.data_dir, data[1])): 169 | # if not os.path.exists(pjoin(FLAGS.data_dir, data[2])): 170 | # exit("You should download and place the data in the correct folder.") 171 | # else: 172 | # extract_data(pjoin(FLAGS.data_dir, data[2]), pjoin(FLAGS.data_dir, data[1])) 173 | # if d_type == 'Labels': 174 | # exit("Labels extracted. 
You need to give a dataset first to receive the labels.") 175 | # else: 176 | # if not os.path.exists(pjoin(FLAGS.data_dir, LABELS[1])): 177 | # extract_data(pjoin(FLAGS.data_dir, LABELS[2]), pjoin(FLAGS.data_dir, LABELS[1])) 178 | # 179 | # d, _ = order_labels(pjoin(FLAGS.data_dir, data[1]), pjoin(FLAGS.data_dir, LABELS[1]), 180 | # pjoin(FLAGS.data_dir, data[0]), pjoin(FLAGS.data_dir, LABELS[0])) 181 | # else: 182 | # if d_type == 'Labels': 183 | # exit("You need to give a dataset first to receive the labels.") 184 | # else: 185 | # d, _ = order_labels(pjoin(FLAGS.data_dir, data[1]), pjoin(FLAGS.data_dir, LABELS[1]), 186 | # pjoin(FLAGS.data_dir, data[0]), pjoin(FLAGS.data_dir, LABELS[0])) 187 | # else: 188 | # d = pd.read_csv(pjoin(FLAGS.data_dir, data[0]), sep='\t', index_col=0) 189 | 190 | 191 | # Use recursion to load and return the labels as well 192 | if d_type == 'Labels' or d_type is None: 193 | # Return Label Metadata 194 | 195 | labels = d[[label_col]] 196 | # print(labels) 197 | return labels, label_metadata(label_matrix=d, label_col=label_col) 198 | else: 199 | if transpose: 200 | d = d.transpose() 201 | 202 | labels, (mapped_labels, label_map) = load_allen(label_col=label_col) 203 | 204 | return np.array(d), labels, (mapped_labels, label_map) 205 | 206 | def check_and_load(data): 207 | if not os.path.exists(pjoin(FLAGS.data_dir, data['ordered'])): 208 | if not os.path.exists(pjoin(FLAGS.data_dir, data['original'])): 209 | if not os.path.exists(pjoin(FLAGS.data_dir, data['zipped'])): 210 | exit("You should download and place the data in the correct folder.") 211 | else: 212 | extract_data(pjoin(FLAGS.data_dir, data['zipped']), pjoin(FLAGS.data_dir, data['original'])) 213 | if d_type == 'Labels': 214 | exit("Labels extracted. You need to give a dataset first to receive the labels.") 215 | else: 216 | if not os.path.exists(pjoin(FLAGS.data_dir, LABELS['original'])): 217 | extract_data(pjoin(FLAGS.data_dir, LABELS['zipped']), pjoin(FLAGS.data_dir, LABELS['original'])) 218 | 219 | d, _ = order_labels(pjoin(FLAGS.data_dir, data['original']), pjoin(FLAGS.data_dir, LABELS['original']), 220 | pjoin(FLAGS.data_dir, data['ordered']), pjoin(FLAGS.data_dir, LABELS['ordered'])) 221 | else: 222 | if d_type == 'Labels': 223 | exit("You need to give a dataset first to receive the labels.") 224 | else: 225 | d, _ = order_labels(pjoin(FLAGS.data_dir, data['original']), pjoin(FLAGS.data_dir, LABELS['original']), 226 | pjoin(FLAGS.data_dir, data['ordered']), pjoin(FLAGS.data_dir, LABELS['ordered'])) 227 | else: 228 | d = pd.read_csv(pjoin(FLAGS.data_dir, data['ordered']), sep='\t', index_col=0) 229 | 230 | return d 231 | 232 | 233 | def load_extra(dataset=None, filename=None, transpose=True, label_col=None, sub_labels=False): 234 | if dataset == 'Allen': 235 | # data, labels = order_labels(pjoin(FLAGS.data_dir, filename), pjoin(FLAGS.data_dir, LABELS[1]), sep='\t') 236 | 237 | labels = pd.read_csv(pjoin(FLAGS.data_dir, LABELS['ordered']), sep='\t', index_col=0) 238 | data = pd.read_csv(pjoin(FLAGS.data_dir, filename), sep='\t', index_col=0) 239 | 240 | if transpose: 241 | data = data.transpose() 242 | return np.array(data), labels[[label_col]], label_metadata(label_matrix=labels, label_col=label_col) 243 | elif dataset == 'Linarsson': 244 | data = load_linarsson_data('filtered', transpose=transpose) 245 | labels, meta = load_linarsson_labels(sub_labels) 246 | return data, labels, meta 247 | elif dataset == 'Lin-Allen': 248 | labels = pd.read_csv(pjoin(FLAGS.data_dir, 
'Lin-Allen_comp_labels.csv'), sep='\t', index_col=0) 249 | data = pd.read_csv(pjoin(FLAGS.data_dir, filename), sep='\t', index_col=0) 250 | 251 | if transpose: 252 | data = data.transpose() 253 | return np.array(data), labels[[label_col]], label_metadata(label_matrix=labels, label_col=label_col) 254 | 255 | 256 | -------------------------------------------------------------------------------- /Train_SDAE/tools/evaluate.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from config import FLAGS 8 | # from data import fill_feed_dict as fill_feed_dict 9 | from utils import fill_feed_dict as fill_feed_dict 10 | 11 | from sklearn.metrics import precision_score, confusion_matrix, classification_report 12 | from sklearn.metrics import recall_score, f1_score, roc_curve, accuracy_score 13 | 14 | from tools.visualize import plot_confusion_matrix as pcm 15 | from tools.visualize import plot_roc_curve as roc 16 | 17 | np.set_printoptions(linewidth=200) 18 | 19 | def evaluation(logits, labels): 20 | """Evaluate the quality of the logits at predicting the label. 21 | 22 | Args: 23 | logits: Logits tensor, float - [batch_size, NUM_CLASSES]. 24 | labels: Labels tensor, int32 - [batch_size], with values in the 25 | range [0, NUM_CLASSES). 26 | 27 | Returns: 28 | A scalar int32 tensor with the number of examples (out of batch_size) 29 | that were predicted correctly. 30 | """ 31 | # For a classifier model, we can use the in_top_k Op. 32 | # It returns a bool tensor with shape [batch_size] that is true for 33 | # the examples where the labels was in the top k (here k=1) 34 | # of all logits for that example. 35 | # correct: type = List (of booleans) 36 | correct = tf.nn.in_top_k(logits, labels, 1) 37 | # correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1)) 38 | 39 | # accuracy = tf.reduce_mean(tf.cast(correct, "float")) 40 | y_p = tf.argmax(logits, 1) 41 | # l_p = tf.argmax(labels, 1) 42 | 43 | 44 | # Return the number of true entries. Cast because originally is bool. 
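    # e.g. with batch_size = 3, top-1 predictions [2, 0, 1] and true labels
    # [2, 1, 1]: `correct` is [True, False, True], the summed count below is 2,
    # and y_p holds the argmax predictions [2, 0, 1].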
45 | return tf.reduce_sum(tf.cast(correct, tf.int32)), correct, y_p 46 | 47 | def predict(sdae, data_set, bias_node=False): 48 | with sdae.session.graph.as_default(): 49 | labels_placeholder = tf.placeholder(tf.int32, shape=1,\ 50 | name='labels_placeholder') 51 | examples_placeholder = tf.placeholder(tf.float32,\ 52 | shape=(1, sdae._net_shape[0]),\ 53 | name='input_pl') 54 | 55 | logits = tf.identity(examples_placeholder) 56 | 57 | for layer in sdae.get_layers: 58 | if bias_node: 59 | bias_n = tf.ones(shape=[1, 1], dtype=tf.float32) 60 | logits = tf.concat(1, [bias_n, logits]) 61 | logits = layer.clean_activation(x_in=logits, use_fixed=False) 62 | 63 | predictions = tf.argmax(logits, 1) 64 | 65 | labels = tf.identity(labels_placeholder) 66 | 67 | y_pred = [] 68 | y_true = [] 69 | 70 | for _ in xrange(data_set.num_examples): 71 | feed_dict = fill_feed_dict(data_set, 72 | examples_placeholder, 73 | labels_placeholder, 1) 74 | 75 | y_prediction, y_trues = sdae.session.run([predictions, labels], feed_dict=feed_dict) 76 | y_pred += list(y_prediction) 77 | y_true += list(y_trues) 78 | 79 | # print(y_pred) 80 | return y_pred, y_true 81 | 82 | def do_eval(sess, 83 | eval_correct, 84 | predictions, 85 | examples_placeholder, 86 | labels_placeholder, 87 | label_map, 88 | data_set, 89 | title='Evaluation'): 90 | """Runs one evaluation against the full epoch of data. 91 | Args: 92 | sess: The session in which the model has been trained. 93 | eval_correct: The Tensor that returns the number of correct predictions. 94 | images_placeholder: The images placeholder. 95 | labels_placeholder: The labels placeholder. 96 | data_set: The set of images and labels to evaluate, from 97 | utils.read_data_sets(). 98 | """ 99 | # And run one epoch of eval. 100 | true_count = 0 # Counts the number of correct predictions. 
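    # The set is walked in whole batches: steps_per_epoch * FLAGS.batch_size
    # examples are scored, so a final partial batch
    # (data_set.num_examples % FLAGS.batch_size examples) is not counted.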
101 | y_pred = [] 102 | y_true = [] 103 | steps_per_epoch = data_set.num_examples // FLAGS.batch_size 104 | num_examples = steps_per_epoch * FLAGS.batch_size 105 | 106 | labels = tf.identity(labels_placeholder) 107 | 108 | for _ in xrange(steps_per_epoch): 109 | feed_dict = fill_feed_dict(data_set, 110 | examples_placeholder, 111 | labels_placeholder) 112 | corrects, y_prediction, y_trues = sess.run([eval_correct, predictions,\ 113 | labels], feed_dict=feed_dict) 114 | true_count += corrects 115 | y_pred += list(y_prediction) 116 | y_true += list(y_trues) 117 | 118 | accuracy = true_count / num_examples 119 | print(title + ' - Num examples: %d Num correct: %d Accuracy_score @ 1: %0.08f' % 120 | (num_examples, true_count, accuracy)) 121 | 122 | # print("True output:", y_true) 123 | # print("Pred output:", y_pred) 124 | 125 | print("Precision:") 126 | print("\tNone: ", precision_score(y_true, y_pred, average=None, pos_label=None)) 127 | # print("\tBinary:", precision_score(y_true, y_pred, average='binary')) 128 | print("\tMicro: %0.08f" % precision_score(y_true, y_pred, average='micro', pos_label=None)) 129 | print("\tMacro: %0.08f" % precision_score(y_true, y_pred, average='macro', pos_label=None)) 130 | print("\tWeighted: %0.08f" % precision_score(y_true, y_pred, average='weighted', pos_label=None)) 131 | # print("\tSamples:", sklearn.metrics.precision_score(y_true, y_pred, average='samples')) 132 | # print("\tAccuracy_score: %0.08f" % accuracy_score(y_true, y_pred)) 133 | 134 | print("Recall:") 135 | # print("\tNone: ", recall_score(y_true, y_pred, average=None, pos_label=None)) 136 | # print("\tBinary:", recall_score(y_true, y_pred, average='binary')) 137 | print("\tMicro: %0.08f" % recall_score(y_true, y_pred, average='micro', pos_label=None)) 138 | print("\tMacro: %0.08f" % recall_score(y_true, y_pred, average='macro', pos_label=None)) 139 | print("\tWeighted: %0.08f" % recall_score(y_true, y_pred, average='weighted', pos_label=None)) 140 | # print("\tSamples:", sklearn.metrics.recall_score(y_true, y_pred, average='samples')) 141 | 142 | # print("F1_score:") 143 | # print("\tNone: ", f1_score(y_true, y_pred, average=None, pos_label=None)) 144 | # print("\tBinary:", f1_score(y_true, y_pred, average='binary')) 145 | # print("\tMicro: %0.08f" % f1_score(y_true, y_pred, average='micro', pos_label=None)) 146 | # print("\tMacro: %0.08f" % f1_score(y_true, y_pred, average='macro', pos_label=None)) 147 | print("\nF1 Score (weighted): %0.08f" % f1_score(y_true, y_pred, average='weighted', pos_label=None)) 148 | # print("\tSamples:", sklearn.metrics.f1_score(y_true, y_pred, average='samples')) 149 | 150 | # print("True Length:", len(y_true)) 151 | # print("Prediction Length:", len(y_pred)) 152 | 153 | cm = confusion_matrix(y_true, y_pred) 154 | # print("\nConfusion Matrix") 155 | # print(cm) 156 | 157 | cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 158 | # print("\nNormalized confusion_matrix") 159 | # print(cm_normalized) 160 | 161 | print("") 162 | print(classification_report(y_true, y_pred, target_names=label_map)) 163 | 164 | pcm(cm, target_names=label_map, title=title) 165 | pcm(cm_normalized, target_names=label_map, title=title+"_Normalized") 166 | 167 | roc(y_pred, y_true, n_classes=len(label_map), title=title) 168 | 169 | print("\n=====================================================================================================\n") 170 | 171 | 172 | def do_eval_summary(tag, 173 | sess, 174 | eval_correct, 175 | examples_placeholder, 176 | labels_placeholder, 177 | 
data_set): 178 | true_count = 0 179 | steps_per_epoch = data_set.num_examples // FLAGS.batch_size 180 | num_examples = steps_per_epoch * FLAGS.batch_size 181 | for _ in xrange(steps_per_epoch): 182 | feed_dict = fill_feed_dict(data_set, 183 | examples_placeholder, 184 | labels_placeholder) 185 | true_count += sess.run(eval_correct, feed_dict=feed_dict) 186 | error = 1 - true_count / num_examples 187 | 188 | return sess.run(tf.scalar_summary(tag, tf.identity(error))) 189 | 190 | 191 | -------------------------------------------------------------------------------- /Train_SDAE/tools/evaluate_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn 3 | from scipy.special import expit 4 | from sklearn import ensemble 5 | from sklearn.manifold import TSNE 6 | import time 7 | from os.path import join as pjoin 8 | from tools.config import FLAGS 9 | from tools.visualize import scatter 10 | 11 | def get_activations(exp_data, w, b): 12 | # exp_data = np.transpose(exp_data) 13 | prod = exp_data.dot(w) 14 | prod_with_bias = prod + b 15 | return( expit(prod_with_bias) ) 16 | 17 | # Order of *args: first all the weights and then all the biases 18 | def run_random_forest(exp_data, labels, weights, biases, n_layers=None, bias_node=False): 19 | print("Calculating Random Forests...") 20 | assert len(exp_data) == len(labels) 21 | 22 | # I think they should be already transposed when running the code. Will see 23 | act = exp_data#.T 24 | 25 | # Using ternary operator for shortness 26 | n = n_layers if n_layers else len(weights) 27 | 28 | for i in range(n): 29 | print('Weights and biases for layer: ' + str(i+1)) 30 | # print np.asarray(weights[i]).shape, np.asarray(biases[i]).shape 31 | if bias_node: 32 | act = np.insert(act, 1, np.ones_like(act[:,0]), 1) 33 | act = get_activations(act, weights[i], biases[i]) 34 | 35 | rf = ensemble.RandomForestClassifier(n_estimators=1000, oob_score=True, max_depth=5) 36 | rfit = rf.fit(act, labels) 37 | print('OOB score: %.8f\n' % rfit.oob_score_) 38 | 39 | 40 | def plot_tSNE(data, labels, random_state=7074568, plot_name='tsne-generated_{}.png'): 41 | # Calculate t-SNE projections 42 | x_projection = TSNE(random_state=random_state).fit_transform(data) 43 | 44 | # Form the output file name 45 | plot_name = plot_name if plot_name.find(".") > 0 else plot_name+".png" 46 | plot_name = pjoin(FLAGS.output_dir, plot_name.format(time.strftime("%Y-%m-%d %H:%M:%S"))) 47 | 48 | # Create and save a t-SNE scatter plot 49 | scatter(x_projection, labels, plot_name=plot_name) 50 | 51 | -------------------------------------------------------------------------------- /Train_SDAE/tools/start_tensorboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import shlex, subprocess 4 | 5 | from tensorflow import tensorboard as tb 6 | 7 | from config import FLAGS, home_out 8 | 9 | # Configure path variables 10 | _summary_dir = FLAGS.summary_dir 11 | _tb_pid_file = home_out(".tbpid") 12 | 13 | # Configure environment/network parameters 14 | _tb_path = os.path.join(os.path.dirname(tb.__file__), 'tensorboard.py') 15 | _tb_port = "6006" 16 | _tb_host = "0.0.0.0" 17 | 18 | def start_tb(): 19 | if not os.path.exists(_tb_path): 20 | raise EnvironmentError("tensorboard.py not found!") 21 | 22 | if os.path.exists(_tb_pid_file): 23 | tb_pid = int(open(_tb_pid_file, 'r').readline().strip()) 24 | try: 25 | os.kill(tb_pid, signal.SIGKILL) 26 | except OSError: 27 | pass 
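        # Any previously launched TensorBoard recorded in the .tbpid file has
        # been killed above (or was already gone); drop the stale PID file
        # before starting a fresh instance.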
28 | 29 | os.remove(_tb_pid_file) 30 | 31 | devnull = open(os.devnull, 'wb') 32 | args = shlex.split('nohup ' + FLAGS.python + ' -u ' + _tb_path 33 | + ' --host '+ _tb_host + ' --port ' + _tb_port 34 | + ' --logdir={0}'.format(_summary_dir)) 35 | 36 | p = subprocess.Popen(args, stdout=devnull, stderr=devnull) 37 | 38 | with open(_tb_pid_file, 'w') as f: 39 | f.write(str(p.pid)) 40 | 41 | # if not FLAGS.no_browser: 42 | # subprocess.Popen(['open', 'http://localhost:{0}'.format(_tb_port)]) 43 | 44 | 45 | if __name__ == '__main__': 46 | start_tb() 47 | -------------------------------------------------------------------------------- /Train_SDAE/tools/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | from config import FLAGS 4 | 5 | from sklearn.cross_validation import train_test_split 6 | 7 | class DataSet(object): 8 | def __init__(self, examples, labels=None): 9 | if labels is not None: 10 | assert len(examples) == len(labels), ( 11 | 'examples.shape: %s labels.shape: %s' 12 | % (examples.shape, labels.shape)) 13 | 14 | self._num_examples = examples.shape[0] 15 | self._examples = examples 16 | self._labels = labels 17 | self._epochs_completed = 0 18 | self._index_in_epoch = 0 19 | 20 | @property 21 | def examples(self): 22 | return self._examples 23 | 24 | @property 25 | def labels(self): 26 | return self._labels 27 | 28 | @property 29 | def num_examples(self): 30 | return self._num_examples 31 | 32 | @property 33 | def epochs_completed(self): 34 | return self._epochs_completed 35 | 36 | @property 37 | def index_in_epoch(self): 38 | return self._index_in_epoch 39 | 40 | def next_batch(self, batch_size): 41 | """Return the next `batch_size` examples from this data set.""" 42 | start = self._index_in_epoch 43 | self._index_in_epoch += batch_size 44 | 45 | if self._index_in_epoch > self._num_examples: 46 | # Finished epoch 47 | self._epochs_completed += 1 48 | 49 | # Shuffle the data 50 | perm = np.arange(self._num_examples) 51 | np.random.shuffle(perm) 52 | 53 | self._examples = self._examples[perm] 54 | 55 | if self._labels is not None: 56 | self._labels = self._labels[perm] 57 | 58 | # Start next epoch 59 | start = 0 60 | self._index_in_epoch = batch_size 61 | assert batch_size <= self._num_examples 62 | 63 | end = self._index_in_epoch 64 | 65 | if self._labels is None: 66 | return self._examples[start:end] #self._examples.iloc[start:end] 67 | else: 68 | # return self._examples.iloc[start:end], self._labels.iloc[start:end] 69 | return self._examples[start:end], self._labels[start:end] 70 | 71 | 72 | class DataSetPreTraining(object): 73 | def __init__(self, examples): 74 | self._num_examples = examples.shape[0] 75 | self._examples = examples 76 | 77 | self._examples[self._examples < FLAGS.zero_bound] = FLAGS.zero_bound 78 | self._examples[self._examples > FLAGS.one_bound] = FLAGS.one_bound 79 | 80 | self._epochs_completed = 0 81 | self._index_in_epoch = 0 82 | 83 | @property 84 | def examples(self): 85 | return self._examples 86 | 87 | @property 88 | def num_examples(self): 89 | return self._num_examples 90 | 91 | @property 92 | def num_batches(self): 93 | return self.num_examples / FLAGS.batch_size 94 | 95 | @property 96 | def epochs_completed(self): 97 | return self._epochs_completed 98 | 99 | @property 100 | def index_in_epoch(self): 101 | return self._index_in_epoch 102 | 103 | # """ TODO: Under implementation """ 104 | # def same_batch(self): 105 | # pass 106 | 107 | def next_batch(self, batch_size): 108 | 
"""Return the next `batch_size` examples from this data set.""" 109 | start = self._index_in_epoch 110 | self._index_in_epoch += batch_size 111 | 112 | if self._index_in_epoch > self._num_examples: 113 | # Finished epoch 114 | self._epochs_completed += 1 115 | 116 | # Shuffle the data 117 | perm = np.arange(self._num_examples) 118 | np.random.shuffle(perm) 119 | self._images = self._examples[perm] 120 | 121 | # Start next epoch 122 | start = 0 123 | self._index_in_epoch = batch_size 124 | 125 | # print self._num_examples 126 | assert batch_size <= self._num_examples 127 | 128 | end = self._index_in_epoch 129 | 130 | return self._examples[start:end] 131 | 132 | 133 | def load_data_sets(input_data, labels, split_only=True, valid_set=False): 134 | class DataSets(object): 135 | pass 136 | data_sets = DataSets() 137 | 138 | print("\nSplitting to Train & Test sets for Finetuning") 139 | 140 | if valid_set: 141 | train_examples, test_examples, train_labels, test_labels = \ 142 | train_test_split(input_data, labels, test_size=0.2) 143 | train_examples, validation_examples, train_labels, validation_labels = \ 144 | train_test_split(train_examples, train_labels, test_size=0.05) 145 | data_sets.validation = DataSet(validation_examples, validation_labels) 146 | else: 147 | train_examples, test_examples, train_labels, test_labels = \ 148 | train_test_split(input_data, labels, test_size=0.3) 149 | data_sets.validation = None 150 | 151 | # validation_examples = input_data[:VALIDATION_SIZE] 152 | # train_examples = input_data[VALIDATION_SIZE:] 153 | 154 | data_sets.train = DataSet(train_examples, train_labels) 155 | data_sets.test = DataSet(test_examples, test_labels) 156 | 157 | if not split_only: 158 | data_sets.all = DataSet(input_data, labels) 159 | 160 | return data_sets 161 | 162 | 163 | 164 | def load_data_sets_pretraining(input_data, split_only=True, valid_set=False): 165 | """ Load data-sets for pre-training 166 | Data-sets for pre-training does not include labels. It takes 167 | an input data-set and it splits it in train, test and validation 168 | (optional) sets. Then it returns these subsets as DataSetPreTraining 169 | objects which have the ability to give the data in batches (among 170 | other useful functions). If split_only argument is False then it 171 | also returns the whole input data-set as a DataSetPreTraining object. 172 | 173 | Args: 174 | input_data: The data-set to be split. 175 | split_only: If True it just splits the data-set and returns its 176 | subsets as DataSetPreTraining objects, otherwise it 177 | also returns the data-set as DataSetPreTraining object. 
178 | valid_set: Whether to create a validation set along with test 179 | and train or not (default False) 180 | """ 181 | class DataSets(object): 182 | pass 183 | data_sets = DataSets() 184 | 185 | print("\nSplitting to Train & Test sets for pre-training") 186 | 187 | if valid_set: 188 | train_examples, test_examples = train_test_split(input_data, test_size=0.20) 189 | train_examples, validation_examples = train_test_split(train_examples, test_size=0.05) 190 | data_sets.validation = DataSetPreTraining(validation_examples) 191 | else: 192 | train_examples, test_examples = train_test_split(input_data, test_size=0.3) 193 | data_sets.validation = None 194 | 195 | if not split_only: 196 | data_sets.all = DataSetPreTraining(input_data) 197 | 198 | data_sets.train = DataSetPreTraining(train_examples) 199 | data_sets.test = DataSetPreTraining(test_examples) 200 | 201 | return data_sets 202 | 203 | 204 | ''' 205 | """ TODO: ADD more noise functions such as Gaussian noise etc. """ 206 | def _add_noise(x, ratio, n_type='MN'): 207 | """ Noise adding (or input corruption) 208 | This function adds noise to the given dataset. 209 | 210 | Args: 211 | x : The input dataset for the noise to be applied (numpy array) 212 | ratio: The percentage of the data affected by the noise addition 213 | n_type: The type of noise to be applied. 214 | Choices: MN (masking noise), SP (salt-and-pepper noise) 215 | """ 216 | ''' 217 | 218 | def fill_feed_dict_dae(data_set, input_pl, batch_size=None): 219 | b_size = FLAGS.batch_size if batch_size is None else batch_size 220 | 221 | input_feed = data_set.next_batch(b_size) 222 | feed_dict = { input_pl: input_feed } 223 | 224 | return feed_dict 225 | 226 | 227 | def fill_feed_dict(data_set, input_pl, labels_pl, batch_size=None): 228 | """Fills the feed_dict for training the given step. 229 | A feed_dict takes the form of: 230 | feed_dict = { 231 | : , 232 | .... 233 | } 234 | Args: 235 | data_set: The set of images and labels, from input_data.read_data_sets() 236 | images_pl: The examples placeholder, from placeholder_inputs(). 237 | labels_pl: The labels placeholder, from placeholder_inputs(). 238 | Returns: 239 | feed_dict: The feed dictionary mapping from placeholders to values. 240 | """ 241 | # Create the feed_dict for the placeholders filled with the next 242 | # `batch size ` examples. 243 | b_size = FLAGS.batch_size if batch_size is None else batch_size 244 | 245 | examples_feed, labels_feed = data_set.next_batch(b_size) 246 | 247 | feed_dict = { 248 | input_pl: examples_feed, 249 | labels_pl: labels_feed 250 | } 251 | 252 | return feed_dict 253 | 254 | 255 | def normalize_data(x, transpose=False): 256 | # Normalization across the whole matrix 257 | # x_max = np.max(x) 258 | # x_min = np.min(x) 259 | # x_norm = (x - x_min) / np.float32(x_max - x_min) 260 | 261 | 262 | # Normalization across the features 263 | x_norm = [] 264 | if transpose: 265 | x = np.transpose(x) 266 | print("\nData Transposed.") 267 | 268 | print "\nNormalizing", len(x), "Features..." 269 | for i in range(len(x)): 270 | x_norm.append((x[i] - np.min(x[i])) / np.float32(np.max(x[i]) - np.min(x[i]))) 271 | if np.isnan(x_norm[i]).any(): 272 | print("NAN at:", i) 273 | 274 | """ OR (norm='l1' or 'l2' or 'max') 275 | from sklearn.preprocessing import normalize 276 | x_norm = normalize(input_data, axis=??, norm='??') 277 | """ 278 | print("Normalization: Done. 
Transposing...") 279 | return np.asarray(np.transpose(x_norm)) 280 | 281 | 282 | def label_metadata(label_matrix, label_col): 283 | # Check whether the column value is given as index (number) or name (string) 284 | try: 285 | label_col = int(label_col) 286 | 287 | # If given as number, take the name of the column out of it 288 | label_col = label_matrix.columns[label_col] 289 | except ValueError: 290 | pass 291 | 292 | import pandas as pd 293 | # Get the unique classes in the given column, and how many of them are there 294 | unique_classes = pd.unique(label_matrix[label_col].ravel()) 295 | #num_classes = unique_classes.shape[0] 296 | 297 | # Map the unique n classes with a number from 0 to n 298 | label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))}) 299 | 300 | # Replace the given column's values with the mapped equivalent 301 | mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist()) 302 | 303 | # Return the mapped labels as numpy list and the label map (unique classes and number can be obtained from map) 304 | return np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],)), np.asarray(label_map) #, unique_classes, num_classes 305 | 306 | 307 | def write_csv(filename, data, sep='\t'): 308 | with open(filename, 'w') as fp: 309 | a = csv.writer(fp, delimiter='\t') 310 | a.writerows(data) 311 | 312 | 313 | 314 | -------------------------------------------------------------------------------- /Train_SDAE/tools/visualize.py: -------------------------------------------------------------------------------- 1 | #import matplotlib.mlab as mlab 2 | import matplotlib.pyplot as plt 3 | import matplotlib.patheffects as PathEffects 4 | import seaborn as sns 5 | import numpy as np 6 | from os.path import join as pjoin 7 | from config import FLAGS 8 | from sklearn.metrics import confusion_matrix, roc_curve, auc 9 | from scipy import interp 10 | 11 | ''' 12 | interpolation options: 13 | [None, 'none', 'nearest', 'bilinear', 'bicubic', 'spline16', 14 | 'spline36', 'hanning', 'hamming', 'hermite', 'kaiser', 'quadric', 15 | 'catrom', 'gaussian', 'bessel', 'mitchell', 'sinc', 'lanczos'] 16 | ''' 17 | 18 | def scatter(x, y, plot_name): 19 | """ Used to plot t-SNE projections """ 20 | 21 | num_colors = len(np.unique(y)) 22 | # We choose a color palette with seaborn. 23 | palette = np.array(sns.color_palette("hls", num_colors)) 24 | # We create a scatter plot. 25 | f = plt.figure(figsize=(8, 8)) 26 | ax = plt.subplot(aspect='equal') 27 | sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40, 28 | c=palette[y.astype(np.int)]) 29 | plt.xlim(-25, 25) 30 | plt.ylim(-25, 25) 31 | ax.axis('off') 32 | ax.axis('tight') 33 | # We add the labels for each digit. 34 | txts = [] 35 | for i in range(num_colors): 36 | # Position of each label. 
307 | def write_csv(filename, data, sep='\t'):
308 |     with open(filename, 'w') as fp:
309 |         a = csv.writer(fp, delimiter=sep)
310 |         a.writerows(data)
311 | 
312 | 
313 | 
314 | 
--------------------------------------------------------------------------------
/Train_SDAE/tools/visualize.py:
--------------------------------------------------------------------------------
1 | #import matplotlib.mlab as mlab
2 | import matplotlib.pyplot as plt
3 | import matplotlib.patheffects as PathEffects
4 | import seaborn as sns
5 | import numpy as np
6 | from os.path import join as pjoin
7 | from config import FLAGS
8 | from sklearn.metrics import confusion_matrix, roc_curve, auc
9 | from scipy import interp
10 | 
11 | '''
12 | interpolation options:
13 | [None, 'none', 'nearest', 'bilinear', 'bicubic', 'spline16',
14 | 'spline36', 'hanning', 'hamming', 'hermite', 'kaiser', 'quadric',
15 | 'catrom', 'gaussian', 'bessel', 'mitchell', 'sinc', 'lanczos']
16 | '''
17 | 
18 | def scatter(x, y, plot_name):
19 |     """ Used to plot t-SNE projections """
20 | 
21 |     num_colors = len(np.unique(y))
22 |     # We choose a color palette with seaborn.
23 |     palette = np.array(sns.color_palette("hls", num_colors))
24 |     # We create a scatter plot.
25 |     f = plt.figure(figsize=(8, 8))
26 |     ax = plt.subplot(aspect='equal')
27 |     sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40,
28 |                     c=palette[y.astype(np.int)])
29 |     plt.xlim(-25, 25)
30 |     plt.ylim(-25, 25)
31 |     ax.axis('off')
32 |     ax.axis('tight')
33 |     # We add a label for each class.
34 |     txts = []
35 |     for i in range(num_colors):
36 |         # Position of each label.
37 |         xtext, ytext = np.median(x[y == i, :], axis=0)
38 |         # if np.isnan(xtext) or np.isnan(ytext):
39 |         #     break
40 |         txt = ax.text(xtext, ytext, str(i), fontsize=24)
41 |         txt.set_path_effects([
42 |             PathEffects.Stroke(linewidth=5, foreground="w"),
43 |             PathEffects.Normal()])
44 |         txts.append(txt)
45 | 
46 |     plt.savefig(plot_name, dpi=120)
47 |     plt.close()
48 | 
49 | 
50 | def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.BuGn):
51 |     imgplot = plt.imshow(cm, interpolation='nearest', cmap=cmap)
52 |     plt.grid(False)
53 |     plt.colorbar(imgplot)
54 |     plt.title(title)
55 |     tick_marks = np.arange(len(target_names))
56 |     plt.xticks(tick_marks, target_names, rotation=90)
57 |     plt.yticks(tick_marks, target_names)
58 |     plt.tight_layout()
59 |     plt.ylabel('True label')
60 |     plt.xlabel('Predicted label')
61 |     plt.savefig(pjoin(FLAGS.output_dir, title.replace(' ', '_') + '_CM.png'))
62 |     plt.close()
63 | 
64 | 
65 | def plot_roc_curve(y_pred, y_true, n_classes, title='ROC_Curve'):
66 |     # Compute ROC curve and ROC area for each class
67 |     fpr = dict()
68 |     tpr = dict()
69 |     thresholds = dict()
70 |     roc_auc = dict()
71 | 
72 |     for i in range(n_classes):
73 |         fpr[i], tpr[i], thresholds[i] = roc_curve(y_true, y_pred, pos_label=i, drop_intermediate=False)
74 |         roc_auc[i] = auc(fpr[i], tpr[i])
75 | 
76 |     # Compute micro-average ROC curve and ROC area
77 |     # fpr["micro"], tpr["micro"], _ = roc_curve(np.asarray(y_true).ravel(), np.asarray(y_pred).ravel(), pos_label=0, drop_intermediate=True)
78 |     # roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
79 | 
80 |     # Aggregate all false positive rates
81 |     all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
82 | 
83 |     # print("Thresholds:")
84 |     # Interpolate all ROC curves at these points
85 |     mean_tpr = np.zeros_like(all_fpr)
86 |     for i in range(n_classes):
87 |         mean_tpr += interp(all_fpr, fpr[i], tpr[i])
88 |         # print("Class_{0}: {1}".format(i, thresholds[i]))
89 | 
90 |     # Average it and compute AUC
91 |     mean_tpr /= n_classes
92 | 
93 |     fpr["macro"] = all_fpr
94 |     tpr["macro"] = mean_tpr
95 |     roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
96 | 
97 | 
98 |     # Plot all ROC curves
99 |     fig = plt.figure()
100 |     ax = fig.add_subplot(111)
101 | 
102 |     # plt.plot(fpr["micro"], tpr["micro"],
103 |     #          label='micro-average ROC curve (area = {0:0.2f})'
104 |     #                ''.format(roc_auc["micro"]),
105 |     #          linewidth=3, ls='--', color='red')
106 | 
107 |     plt.plot(fpr["macro"], tpr["macro"],
108 |              label='macro-average ROC curve (area = {0:0.2f})'
109 |                    ''.format(roc_auc["macro"]),
110 |              linewidth=3, ls='--', color='green')
111 | 
112 |     for i in range(n_classes):
113 |         plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
114 |                                        ''.format(i, roc_auc[i]))
115 | 
116 |     plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
117 |     plt.xlim([0.0, 1.0])
118 |     plt.ylim([0.0, 1.05])
119 |     plt.xlabel('False Positive Rate')
120 |     plt.ylabel('True Positive Rate')
121 |     plt.title('Multi-class Receiver Operating Characteristic')
122 |     lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
123 | 
124 |     plt.savefig(pjoin(FLAGS.output_dir, title.replace(' ', '_') + '_ROC.png'), bbox_extra_artists=(lgd,), bbox_inches='tight')
125 |     plt.close()
126 | 
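# A compact, self-contained illustration of the macro-averaging step used in
# plot_roc_curve(): each class's TPR curve is interpolated onto the pooled FPR
# grid and the curves are averaged. The label/score arrays below are made up,
# and a single shared score column is used, mirroring the function above.
def _macro_average_roc_sketch():
    import numpy as np
    from sklearn.metrics import roc_curve, auc

    y_true = np.array([0, 0, 1, 1, 2, 2])
    scores = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])
    n_classes = 3

    fpr, tpr = {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true, scores, pos_label=i, drop_intermediate=False)

    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.mean([np.interp(all_fpr, fpr[i], tpr[i]) for i in range(n_classes)], axis=0)
    return auc(all_fpr, mean_tpr)     # macro-average AUC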
127 | 
128 | def hist_comparison(data1, data2):
129 |     f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
130 |     f.suptitle('Histogram Before and After Normalization')
131 |     ax1.hist(data1, 10, facecolor='green', alpha=0.75)
132 |     ax1.set_xlabel("Values")
133 |     ax1.set_ylabel("# of Examples")
134 |     ax1.grid(True)
135 |     ax2.hist(data2, 10, facecolor='green', alpha=0.75)
136 |     ax2.set_xlabel("Values")
137 |     ax2.grid(True)
138 | 
139 |     f.savefig(pjoin(FLAGS.output_dir, 'hist_comparison.png'))
140 |     # plt.show()
141 |     plt.close()
142 | 
143 | 
144 | def make_heatmap(data, name):
145 |     f = plt.figure()
146 |     ax1 = f.add_axes([0.1,0.1,0.8,0.8])
147 |     ax1.grid(False)
148 |     imgplot = ax1.imshow(data, interpolation="none")
149 |     imgplot.set_cmap('seismic')
150 |     f.colorbar(imgplot)
151 |     f.savefig(pjoin(FLAGS.output_dir, name + '.png'))
152 |     plt.close()
153 | 
154 | def make_2d_hist(data, name):
155 |     f = plt.figure()
156 |     X,Y = np.meshgrid(range(data.shape[0]), range(data.shape[1]))
157 |     im = plt.pcolormesh(X,Y,data.transpose(), cmap='seismic')
158 |     plt.colorbar(im, orientation='vertical')
159 |     # plt.hexbin(data,data)
160 |     # plt.show()
161 |     f.savefig(pjoin(FLAGS.output_dir, name + '.png'))
162 |     plt.close()
163 | 
164 | # def make_2d_hexbin(data, name):
165 | #     f = plt.figure()
166 | #     X,Y = np.meshgrid(range(data.shape[0]), range(data.shape[1]))
167 | #     plt.hexbin(X, data)
168 | #     # plt.show()
169 | #     f.savefig(pjoin(FLAGS.output_dir, name + '.png'))
170 | 
171 | def heatmap_comparison(data1, label1, data2, label2, data3, label3):
172 |     interpolation = 'none'
173 | 
174 |     fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
175 |     fig.suptitle('Heatmap Comparison of Normal and Noisy Data')
176 |     ax1.imshow(data1, interpolation=interpolation)
177 |     ax1.set_title(label1)
178 |     ax1.set_ylabel("Examples")
179 |     ax1.set_xlabel("Features")
180 |     ax1.set_aspect('equal')
181 | 
182 |     ax2.imshow(data2, interpolation=interpolation)
183 |     ax2.set_title(label2)
184 |     ax2.set_xlabel("Features")
185 |     ax2.set_aspect('equal')
186 | 
187 |     ax3.imshow(data3, interpolation=interpolation)
188 |     ax3.set_title(label3)
189 |     ax3.set_xlabel("Features")
190 |     ax3.set_aspect('equal')
191 | 
192 |     cax = fig.add_axes([0, 0, .1, .1])
193 |     cax.get_xaxis().set_visible(False)
194 |     cax.get_yaxis().set_visible(False)
195 |     cax.patch.set_alpha(0.5)
196 |     cax.set_frame_on(True)
197 |     # plt.colorbar(ax1, ax2, orientation='vertical')
198 |     plt.show()
199 |     plt.close()
200 | #
201 | #     fig = plt.figure(figsize=(6, 3.2))
202 | #
203 | #     ax = fig.add_subplot(111)
204 | #     ax.set_title('colorMap')
205 | #     plt.imshow(data1)
206 | #     ax.set_aspect('equal')
207 | #
208 | #     cax = fig.add_axes([0.12, 0.1, 0.78, 0.8])
209 | #     cax.get_xaxis().set_visible(False)
210 | #     cax.get_yaxis().set_visible(False)
211 | #     cax.patch.set_alpha(0)
212 | #     cax.set_frame_on(False)
213 | #     plt.colorbar(orientation='vertical')
214 | #     plt.show()
215 | 
--------------------------------------------------------------------------------