├── .gitignore ├── Evaluation ├── evaluate_model.R └── evaluate_model_old.R ├── LICENSE ├── README.md └── Train_SDAE ├── dae.py ├── run.py ├── stacked_dae.py ├── test_dae.py └── tools ├── ADASYN.py ├── __init__.py ├── config.py ├── data_handler.py ├── evaluate.py ├── evaluate_model.py ├── start_tensorboard.py ├── utils.py └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /Evaluation/evaluate_model.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library("randomForest")) 2 | library("Rtsne") 3 | 4 | # # Function definitions 5 | 6 | sgm <- function(x){ 7 | # Sigmoid function 8 | return(1/(1+exp(-x))) 9 | } 10 | 11 | get_activations <- function(exp_data, w, b){ 12 | # Propagate inputs through to the hidden layer 13 | # Linear transform 14 | print(dim(w)) 15 | print(dim(exp_data)) 16 | lin <- t(w) %*% as.matrix(exp_data) 17 | # Add bias (a bit ugly) 18 | bia <- lin 19 | for(i in 1:nrow(lin)){ 20 | bia[i,] <- lin[i,] + b[[i]] 21 | } 22 | act <- t(sgm(bia)) 23 | return(act) 24 | } 25 | 26 | node.act.per.type <- function(act, node, m){ 27 | lev <- levels(coi) 28 | boxes <- NULL 29 | for (ctype in lev){ 30 | box <- t(act[which(m==ctype), node]) 31 | boxes[[ctype]] <- box 32 | } 33 | boxplot(boxes, las=2, main=paste("Node", node), ylim=c(0,1)) 34 | } 35 | 36 | type.act.per.node <- function(act, m, filename){ 37 | par(mfcol=c(3,1)) 38 | for(cell in levels(coi)){ 39 | boxplot(act[which(coi==cell),], main=cell, las=2, names=paste0("Node",1:ncol(act)), ylim=c(0,1)) 40 | } 41 | par(mfrow=c(1,1)) 42 | } 43 | 44 | # # Define colors and such for the metadata 45 | def_colors <- function(meta){ 46 | # print(meta) 47 | # Now 1st column is the former 2nd column. 
So we use this to take tha names 48 | typeNames <<- levels(meta[, colnames(meta)[1]]) 49 | # print(typeNames) 50 | 51 | ## COLORS : red=552, blue=26, black=24, green=254, yellow=652 --> change-to yellow2=654 52 | ## COLORS : orange=498 --> change-to darkorange1=91, brown=32 --> change-to chocolate4=56 53 | ## COLORS : purple=547, grey39=300, violetred=641, darkgreen=81, cyan=68, magenta=450 54 | ## COLORS : goldenrod4=151, hotpink=367, darkolivegreen2=87, midnightblue=477, lightcoral=404 55 | ## COLORS : darkslategrey=113, 56 | 57 | distinct_color_pool <- c("red","blue","black","green","yellow2","darkorange1", 58 | "chocolate4","purple","grey39","violetred","darkgreen", 59 | "cyan","magenta","goldenrod4","hotpink","darkolivegreen2", 60 | "midnightblue","midnightblue","darkslategrey") 61 | # typeColors <<- rainbow(length(typeNames)) 62 | typeColors <<- distinct_color_pool[1:length(typeNames)] 63 | # print(typeColors) 64 | names(typeColors) <<- typeNames 65 | # print(typeColors) 66 | 67 | # Take the column of interest (coi) and assign example names to the labels 68 | coi <<- meta[, colnames(meta)[1]] 69 | # print(coi) 70 | names(coi) <<- 1:nrow(meta) 71 | # print(coi) 72 | } 73 | 74 | # # Handle several analysis functions 75 | do_analysis <- function(act, w, b, outfile_pref, bias_node=FALSE){ 76 | for(i in 1:length(w)){ 77 | if(bias_node == TRUE){ 78 | act <- cbind(rep(1, nrow(act)), act) 79 | } 80 | act <- get_activations(t(act), w[[i]], b[[i]]) 81 | # print(act) 82 | nondup <- act[which(!duplicated(act)),] 83 | print(dim(act)) 84 | print(dim(nondup)) 85 | 86 | colrs <- typeColors[coi[1:nrow(act)]] 87 | plot_pca(nondup, colrs, paste(outfile_pref, i, sep='_')) 88 | 89 | colrs <- typeColors[coi[1:nrow(nondup)]] 90 | plot_tsne(nondup, colrs, paste(outfile_pref, i, sep='_')) 91 | 92 | node_profiles(act, paste(outfile_pref, i, sep='_')) 93 | cell_profiles(act, paste(outfile_pref, i, sep='_')) 94 | calc_rf(act) 95 | } 96 | } 97 | 98 | # # PCA on activations 99 | plot_pca <- function(act, colrs, outfile_pref){ 100 | pcafile <- paste(outfile_pref, "PCA.pdf", sep="_") 101 | 102 | p <- prcomp(act) 103 | 104 | pdf(file=pcafile, paper="a4r") 105 | # par mar(Bottom, Left, Top, Right) 106 | layout(matrix(c(1,2,3,3), ncol=2, byrow=TRUE), heights=c(4, 1)) 107 | plot(p$x, col=colrs, pch=20) 108 | plot(p$x[,2:3], col=colrs, pch=20) 109 | par(mai=c(0,0,0,0)) 110 | plot.new() 111 | legend("center", bty="n", legend=names(typeColors), col=typeColors, pch=rep(20,length(typeColors)), ncol=as.integer((length(typeColors)/10)+0.5), cex=0.8, pt.cex=0.8) 112 | dev.off() 113 | } 114 | 115 | # # Rtsne 116 | plot_tsne <- function(act, colrs, outfile_pref){ 117 | tsnefile <- paste(outfile_pref, "tSNE.pdf", sep="_") 118 | 119 | # nondup <- act[which(!duplicated(act)),] 120 | r <- Rtsne(act, perplexity=10) 121 | 122 | pdf(file=tsnefile, paper="a4r") 123 | layout(matrix(c(1,2), ncol=1), heights=c(4, 1)) 124 | plot(r$Y, pch=20, col=colrs, xlab="", ylab="") 125 | par(mai=c(0,0,0,0)) 126 | plot.new() 127 | legend("center", bty="n", legend=names(typeColors), col=typeColors, pch=rep(20,length(typeColors)), ncol=as.integer((length(typeColors)/10)+0.5), cex=0.7, pt.cex=0.7) 128 | dev.off() 129 | } 130 | 131 | # # Look at the nodes in order of decreasing standard deviation 132 | node_profiles <- function(act, outfile_pref){ 133 | filename <- paste(outfile_pref, "node_profiles.pdf", sep="_") 134 | 135 | pdf(filename, paper="a4") 136 | layout(matrix(c(1,2,3), nrow=1, ncol=3,byrow=TRUE)) 137 | par(mar=c(15.0, 2.3, 2.6, 2.1)) 138 | 139 | 
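  # Visit hidden nodes in order of decreasing standard deviation of their
  # activations, so the most variable nodes are plotted first;
  # node.act.per.type() then draws one boxplot of the node's activation per
  # cell type, using the global 'coi' labels.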
for(node in order(apply(act, 2, sd),decreasing=TRUE)){ 140 | node.act.per.type(act, node, coi) 141 | } 142 | dev.off() 143 | } 144 | 145 | # # Or per cell type 146 | cell_profiles <- function(act, outfile_pref){ 147 | filename <- paste(outfile_pref, "cell_profiles.pdf", sep="_") 148 | 149 | pdf(filename, paper="a4") 150 | par(mar=c(4.5, 2.3, 1.7, 0.1)) 151 | type.act.per.node(act, coi) 152 | dev.off() 153 | } 154 | 155 | # # Check predictivity 156 | calc_rf <- function(act){ 157 | rf <- randomForest(x=act, y=as.factor(coi), importance=TRUE) 158 | print(paste("RF estimated error rate", tail(rf$err.rate, n=1)[,1], sep=":")) 159 | } 160 | -------------------------------------------------------------------------------- /Evaluation/evaluate_model_old.R: -------------------------------------------------------------------------------- 1 | suppressPackageStartupMessages(library("randomForest")) 2 | library("Rtsne") 3 | 4 | ## Function definitions 5 | 6 | sgm <- function(x){ 7 | # Sigmoid function 8 | return(1/(1+exp(-x))) 9 | } 10 | 11 | get_activations <- function(exp_data, w, b){ 12 | # Propagate inputs through to the hidden layer 13 | # Linear transform 14 | lin <- t(w) %*% as.matrix(exp_data) 15 | # Add bias (a bit ugly) 16 | bia <- lin 17 | for(i in 1:nrow(lin)){ 18 | bia[i,] <- lin[i,] + b[i,] 19 | } 20 | act <- t(sgm(bia)) 21 | return(act) 22 | } 23 | 24 | node.act.per.type <- function(act, node, m){ 25 | shortNames <- c("Astro","Endo","GABA","Glut","Microglia","Oligo","OligoPC","Uncl") 26 | boxplot(act[which(m=="Astrocyte"),node], act[which(m=="Endothelial Cell"),node], 27 | act[which(m=="GABA-ergic Neuron"),node],act[which(m=="Glutamatergic Neuron"),node], 28 | act[which(m=="Microglia"),node], act[which(m=="Oligodendrocyte"),node], 29 | act[which(m=="Oligodendrocyte Precursor Cell"),node], act[which(m=="Unclassified"),node], 30 | names=shortNames,main=paste("Node",node),las=2,cex=0.5) 31 | } 32 | 33 | type.act.per.node <- function(act, m){ 34 | par(mfrow=c(4,2)) 35 | for(cell in levels(btype)){ 36 | boxplot(act[which(btype==cell),],main=cell,las=2,names=paste0("Node",1:ncol(act))) 37 | } 38 | par(mfrow=c(1,1)) 39 | } 40 | 41 | 42 | args <- commandArgs(trailingOnly = TRUE) 43 | numLayers <- (length(args) - 2)/2 44 | print(paste("Number of layers:", numLayers)) 45 | 46 | # Read expression data. (Currently used only to get gene names.) 
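# Expected invocation, inferred from the argument parsing above (file names
# below are placeholders only):
#   Rscript evaluate_model_old.R <expression.tsv> <metadata.tsv> \
#     <layer1_weights> <layer1_biases> [<layer2_weights> <layer2_biases> ...]
# i.e. args[1] is the expression matrix, args[2] the metadata, followed by one
# weights/biases file pair per trained layer.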
47 | print("Reading expression data...") 48 | exp_data <- read.delim(args[1],check.names=FALSE,row.names=1) 49 | # Read metadata (clustering results) 50 | print("Reading metadata...") 51 | meta <- read.delim(args[2],check.names=FALSE,row.names=1) 52 | # Check for same ordering 53 | stopifnot(identical(colnames(exp_data), rownames(meta))) 54 | 55 | # Propagate activity through the network 56 | # Activation of visible layer is the actual expression data 57 | act <- t(exp_data) 58 | for(i in 1:numLayers){ 59 | # Read weights and bias for the layer in question 60 | print(paste("Reading weights for layer", i)) 61 | w <- read.delim(args[2*i+1],header=FALSE) 62 | print(paste("Reading biases for layer", i)) 63 | b <- read.delim(args[2*i+2],header=FALSE) 64 | act <- get_activations(t(act), w, b) 65 | } 66 | 67 | # Define colors and such for the metadata 68 | typeNames <- levels(meta$broad_type) 69 | typeCols <- c("red","blue","black","green","yellow","orange","brown","purple") 70 | names(typeCols) <- typeNames 71 | btype <- meta$broad_type 72 | names(btype) <- rownames(meta) 73 | 74 | outfile_pref <- strsplit(basename(args[1]),"\\.")[[1]][1] 75 | print(outfile_pref) 76 | 77 | # PCA on activations 78 | pcafile <- paste(outfile_pref, "PCA.pdf", sep="_") 79 | pdf(pcafile) 80 | par(mfrow=c(1,2)) 81 | p <- prcomp(act) 82 | plot(p$x,col=typeCols[btype[rownames(act)]],pch=20) 83 | plot(p$x[,2:3],col=typeCols[btype[rownames(act)]],pch=20) 84 | dev.off() 85 | 86 | # Rtsne 87 | #nondup <- act[which(!duplicated(act)),] 88 | #tsnefile <- paste(outfile_pref, "tSNE.pdf", sep="_") 89 | #pdf(tsnefile) 90 | #r <- Rtsne(nondup) 91 | #plot(r$Y, col=typeCols[btype[rownames(act)]],pch=20) 92 | #dev.off() 93 | 94 | nondup <- act[which(!duplicated(act)),] 95 | tsnefile <- paste(outfile_pref, "tSNE.pdf", sep="_") 96 | pdf(tsnefile) 97 | r <- Rtsne(nondup, perplexity=10) 98 | plot(r$Y, col=typeCols[btype[rownames(nondup)]],pch=20) 99 | dev.off() 100 | 101 | 102 | # Look at the nodes in order of decreasing standard deviation 103 | pdf(paste(outfile_pref, "node_profiles.pdf", sep="_")) 104 | par(mfrow=c(2,4)) 105 | for(node in order(apply(act, 2, sd),decreasing=TRUE)){ 106 | node.act.per.type(act, node, btype) 107 | } 108 | dev.off() 109 | 110 | # Or per cell type 111 | pdf(paste(outfile_pref, "cell_profiles.pdf", sep="_")) 112 | type.act.per.node(act, btype) 113 | dev.off() 114 | 115 | # Check predictivity 116 | rf <- randomForest(x=act, y=as.factor(btype), importance=TRUE) 117 | print(paste("RF estimated error rate", tail(rf$err.rate, n=1)[,1], sep=":")) 118 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StackedDAE 2 | Stacked Denoising AutoEncoder based on TensorFlow 3 | 4 | This project is intended to be a Bioinformatics tool. However, this repository hosts the project's code, which is not strictly bound to biology, so it could be used for other purposes with little effort (on the other hand, it is not generalized to fit every occasion, so a bit of adaptation is required). 5 | 6 | Also, in our project we try to find a good setup for the algorithm, so there are a lot of options (with more to come) implemented in the code: Masking and Salt-and-pepper noise, with or without Emphasis, and Sigmoid or Tanh activation functions, to name a few. 7 | 8 | -------------------------------------------------------------------------------- /Train_SDAE/dae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tools.config import FLAGS 4 | 5 | class DAE_Layer(object): 6 | 7 | def __init__(self, in_data=None, prev_layer_size=None, next_layer_size=None, nth_layer=None, sess=None, last_layer=True): 8 | self._is_last = last_layer 9 | self._layer = nth_layer 10 | 11 | self._prev_layer_size = prev_layer_size + 1 if FLAGS.bias_node else prev_layer_size 12 | self._next_layer_size = next_layer_size 13 | self._shape = [self._prev_layer_size, self._next_layer_size] 14 | 15 | self._x = in_data 16 | 17 | self._l_rate = self._get_l_rate 18 | 19 | self._noise = [None, None] if self._is_last else self._get_noise 20 | 21 | self.vars_to_init = self._setup_variables() 22 | 23 | 24 | def _setup_variables(self): 25 | with tf.name_scope("Initialize_Variables"): 26 | self._w = self._init_w_or_b(shape=self._shape, trainable=True, name='weights')#_{0}'.format(self._layer)) 27 | # lmt = tf.mul(4.0, tf.sqrt(6.0 / (self._shape[0] + self._shape[1]))) 28 | # self._w = tf.Variable(tf.random_uniform(self._shape, -1*lmt, lmt), trainable=True, name='weights') 29 | self._b_y = self._init_w_or_b(shape=[self._next_layer_size], trainable=True, is_bias=True, name='prev_biases') 30 | 31 | vars_to_init = [self._w, self._b_y] 32 | if not self._is_last: 33 | self._fixed_w = tf.Variable(tf.identity(self._w.initialized_value()), trainable=False, name="weights_fixed") 34 | self._fixed_b = tf.Variable(tf.identity(self._b_y.initialized_value()), trainable=False, name="biases_fixed") 35 | self._b_z = self._init_w_or_b(shape=[self._prev_layer_size], trainable=True, is_bias=True, name='next_biases') 36 | vars_to_init.append(self._fixed_w) 37 | vars_to_init.append(self._fixed_b) 38 | vars_to_init.append(self._b_z) 39 | 40 | return vars_to_init 41 | 42 | 43 | """ TODO: TRY initialization for different functions (e.g.
tanh) """ 44 | def _init_w_or_b(self, shape, trainable=True, name=None, is_bias=False, method='sigmoid'): 45 | # with tf.name_scope("dae_{0}_{1}".format(self._layer, name)): 46 | if is_bias: 47 | return tf.Variable(tf.zeros(shape), trainable=trainable, name=name) 48 | 49 | if method=='sigmoid': 50 | # Upper and Lower limit for the weights 51 | lmt = tf.mul(4.0, tf.sqrt(6.0 / (shape[0] + shape[1]))) 52 | return tf.Variable(tf.random_uniform(shape, -1*lmt, lmt), trainable=trainable, name=name) 53 | 54 | 55 | def clean_activation(self, x_in=None, use_fixed=True): 56 | if x_in is None: 57 | x = self._x 58 | else: 59 | x = x_in 60 | if use_fixed: 61 | return self._activate(x, self._fixed_w, self._fixed_b, name='Latent_layer_next') 62 | else: 63 | return self._activate(x, self._w, self._b_y, name='Latent_layer_next') 64 | 65 | 66 | def encode(self, x_in=None, noise=None): 67 | if x_in is None: 68 | x = self._x 69 | else: 70 | x = x_in 71 | 72 | if noise is None: 73 | ratio = self._noise[0] 74 | ntype = self._noise[1] 75 | else: 76 | ratio = noise[0] 77 | ntype = noise[1] 78 | 79 | self._x_tilde, self._noise_map = self._corrupt(x, ratio=ratio, n_type=ntype) 80 | with tf.name_scope("Encoder"): 81 | self._y = self._activate(self._x_tilde, self._w, self._b_y, name='Latent_layer_next') 82 | return self._y 83 | 84 | def decode(self): 85 | # self._y = self.encode() 86 | with tf.name_scope("Decoder"): 87 | y = self.encode() 88 | if self._is_last: 89 | exit("This is the last layer. Currently the reconstruction of this layer cannot be done.") 90 | self._z = self._activate(y, self._w, self._b_z, transpose_w=True, name='Reconstr_layer_{0}'.format(self._layer)) 91 | return self._z 92 | 93 | @property 94 | def get_loss(self): 95 | z = self.decode() 96 | noise_map = None 97 | 98 | if FLAGS.emphasis: 99 | noise_map = self._noise_map 100 | 101 | loss = self._loss_x_entropy(x=self._x, z=z, noise=noise_map) 102 | 103 | return loss 104 | 105 | @property 106 | def get_w_all_b(self): 107 | return [self._w, self._b_y, self._b_z] 108 | 109 | @property 110 | def get_w_b(self): 111 | return [self._w, self._b_y] 112 | 113 | @property 114 | def get_w(self): 115 | return self._w 116 | 117 | @property 118 | def get_fixed_w(self): 119 | return self._fixed_w 120 | 121 | @property 122 | def get_b(self): 123 | return self._b_y 124 | 125 | @property 126 | def get_fixed_b(self): 127 | return self._fixed_b 128 | 129 | @property 130 | def get_b_recon(self): 131 | return self._b_z 132 | 133 | @property 134 | def get_representation_y(self): 135 | return self._y 136 | 137 | @property 138 | def get_reconstruction_z(self): 139 | return self._z 140 | 141 | @property 142 | def which(self): 143 | return self._layer - 1 144 | 145 | @staticmethod 146 | def _activate(x, w, b, transpose_w=False, name=None): 147 | """ TODO: TRY different activation functions (e.g. tanh, sigmoid...) """ 148 | return tf.sigmoid(tf.nn.bias_add(tf.matmul(x, w, transpose_b=transpose_w), b), name=name) 149 | 150 | @property 151 | def _get_noise(self): 152 | assert self._layer >= 0 153 | 154 | try: 155 | return getattr(FLAGS, "noise_{0}".format(self._layer)) 156 | except AttributeError: 157 | print "Noise out of bounds. 
Using default noise for this Layer (Layer {0})".format(self._layer) 158 | return FLAGS.default_noise 159 | 160 | @property 161 | def _get_l_rate(self): 162 | return getattr(FLAGS, "unsupervised_learning_rate") 163 | 164 | @property 165 | def _get_emph_params(self): 166 | if FLAGS.emphasis_type == 'Full': 167 | return 1, 0 168 | elif FLAGS.emphasis_type == 'Double': 169 | return 1, 0.5 170 | else: 171 | print("Unspecified/Wrong Emphasis type. Default Full [0-1] is used.") 172 | return 1, 0 173 | 174 | def _loss_x_entropy(self, x, z, noise=None): 175 | with tf.name_scope("xentropy_loss"): 176 | z_clipped = tf.clip_by_value(z, FLAGS.zero_bound, FLAGS.one_bound) 177 | z_minus_1_clipped = tf.clip_by_value((1.0 - z), FLAGS.zero_bound, FLAGS.one_bound) 178 | x_clipped = tf.clip_by_value(x, FLAGS.zero_bound, FLAGS.one_bound) 179 | x_minus_1_clipped = tf.clip_by_value((1.0 - x), FLAGS.zero_bound, FLAGS.one_bound) 180 | 181 | # cross_entropy = x * log(z) + (1 - x) * log(1 - z) 182 | 183 | cross_entropy = tf.add(tf.mul(tf.log(z_clipped), x_clipped), 184 | tf.mul(tf.log(z_minus_1_clipped), x_minus_1_clipped), name='X-Entr') 185 | 186 | if noise: 187 | with tf.name_scope("Given_Emphasis"): 188 | a, b = self._get_emph_params 189 | corrupted = tf.select(noise, cross_entropy, tf.zeros_like(cross_entropy), name='Corrupted_Emphasis') 190 | 191 | # OR -- tf.select(tf.logical_not(noisy_points), cross_entropy, tf.zeros_like(cross_entropy), name='Uncorrupted_Emphasis') 192 | uncorrupted = tf.select(noise, tf.zeros_like(cross_entropy), cross_entropy, name='Uncorrupted_Emphasis') 193 | 194 | loss = a * (-1 * tf.reduce_sum(corrupted, 1)) + b * (-1 * tf.reduce_sum(uncorrupted, 1)) 195 | else: 196 | # Sum the cost for each example 197 | loss = -1 * tf.reduce_sum(cross_entropy, 1) 198 | 199 | # Reduce mean to find the overall cost of the loss 200 | cross_entropy_mean = tf.reduce_mean(loss, name='xentropy_mean') 201 | 202 | return cross_entropy_mean 203 | 204 | 205 | # @property 206 | # def get_cost(self): 207 | # z = self.get_reconstruction_z 208 | # noise_map = None 209 | # 210 | # if FLAGS.emphasis: 211 | # noise_map = self._noise_map 212 | # 213 | # cost = self._loss_x_entropy(x=self._x, z=z, noise=noise_map) 214 | # 215 | # return cost 216 | 217 | 218 | def _corrupt(self, x, ratio, n_type='MN'): 219 | with tf.name_scope("Corruption"): 220 | """ Noise adding (or input corruption) 221 | This function adds noise to the given data. 222 | 223 | Args: 224 | x : The input data for the noise to be applied 225 | ratio: The percentage of the data affected by the noise addition 226 | n_type: The type of noise to be applied. 227 | Choices: MN (masking noise), SP (salt-and-pepper noise) 228 | """ 229 | 230 | # Safety check. If unspecified noise type given, use Masking noise instead. 231 | if n_type != 'MN' and n_type != 'SP' and n_type != 'TFDO': 232 | n_type = 'MN' 233 | print("Unknown noise type. Masking noise will be used instead.") 234 | 235 | 236 | # if there is no noise to be added there is no need to proceed further 237 | if ratio == 0.0: 238 | return x_tilde, None 239 | 240 | if n_type == 'TFDO': 241 | x_tilde = tf.nn.dropout(x, keep_prob= 1 - ratio) 242 | # points_to_alter = x_tilde == 0. 243 | # print points_to_alter 244 | # x_tilde = tf.select(points_to_alter, tf.add(tf.zeros_like(x_tilde, dtype=tf.float32), 245 | # FLAGS.zero_bound), x_tilde, name='X_tilde') 246 | # x_tilde[x_tilde == 0.] 
= tf.constant(FLAGS.zero_bound) 247 | else: 248 | # It makes a copy of the data, otherwise 'target_feed' will also be affected 249 | x_tilde = tf.identity(x, name='X_tilde') 250 | shape = tf.Tensor.get_shape(x_tilde) 251 | # Creating and applying random noise to the data. (Masking noise) 252 | points_to_alter = tf.random_uniform(shape=shape, dtype=tf.float32) < ratio 253 | 254 | if n_type == 'MN': 255 | x_tilde = tf.select(points_to_alter, tf.add(tf.zeros_like(x_tilde, dtype=tf.float32), 256 | FLAGS.zero_bound), x_tilde, name='X_tilde') 257 | 258 | elif n_type == 'SP': 259 | coin_flip = np.asarray([np.random.choice([FLAGS.zero_bound, FLAGS.one_bound]) for _ in range(shape[0]) for _ in range(shape[1])]).reshape(shape) 260 | x_tilde = tf.select(points_to_alter, tf.to_float(coin_flip), x_tilde, name='X_tilde') 261 | 262 | 263 | # Also returns the 'points_to_alter' in case of applied Emphasis 264 | if not FLAGS.emphasis or n_type == 'TFDO': 265 | points_to_alter = None 266 | 267 | return x_tilde, points_to_alter 268 | 269 | -------------------------------------------------------------------------------- /Train_SDAE/run.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import shutil 5 | import time 6 | import stacked_dae as SDAE 7 | 8 | from os.path import join as pjoin 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from tools.config import FLAGS, home_out 13 | from tools.start_tensorboard import start_tb 14 | from tools.data_handler import load_data, load_linarsson_labels, load_extra 15 | 16 | from tools.utils import load_data_sets_pretraining, load_data_sets 17 | from tools.utils import normalize_data, label_metadata, write_csv 18 | from tools.ADASYN import Adasyn, all_indices 19 | from tools.evaluate_model import run_random_forest as run_rf 20 | from tools.evaluate_model import plot_tSNE 21 | from tools.evaluate import predict 22 | 23 | import rpy2.robjects as robjects 24 | from rpy2.robjects.packages import importr 25 | from rpy2.robjects import numpy2ri 26 | from rpy2.robjects import pandas2ri 27 | from tensorflow.python.framework.errors import FailedPreconditionError 28 | 29 | from scipy import stats, integrate 30 | import seaborn as sns 31 | from rpy2.rinterface._rinterface import RRuntimeError 32 | sns.set(color_codes=True) 33 | 34 | # Initialize R - Python connection 35 | pandas2ri.activate() 36 | numpy2ri.activate() 37 | r = robjects.r 38 | r_source = r['source'] 39 | r_source('../Evaluation/evaluate_model.R', **{'print.eval': True}) 40 | 41 | 42 | np.set_printoptions(threshold=np.nan) 43 | 44 | # Assign config variables 45 | _data_dir = FLAGS.data_dir 46 | _output_dir = FLAGS.output_dir 47 | _summary_dir = FLAGS.summary_dir 48 | _chkpt_dir = FLAGS.chkpt_dir 49 | 50 | 51 | def main(): 52 | """ 53 | TODO: Break to 2 or 3 functions 54 | for better comprehension. 55 | """ 56 | 57 | # Initialize the directory environment 58 | initialize() 59 | 60 | # Start TensorBoard 61 | start_tb() 62 | 63 | # Set Hyper-parameters 64 | bias_node = FLAGS.bias_node 65 | nHLay = FLAGS.num_hidden_layers 66 | nHUnits = [getattr(FLAGS, "hidden{0}_units".format(j + 1))\ 67 | for j in xrange(nHLay)] 68 | 69 | if FLAGS.use_balanced: 70 | transp = True 71 | else: 72 | transp = False 73 | 74 | 75 | # ...... Read/Upload/Process the Data ...... # 76 | 77 | # Capture time for logging loading duration 78 | start_time = time.time() 79 | 80 | # Load data (Allen dataset). 
Label_col {9: types, 7: subtypes} 81 | # datafile, (mapped_labels, label_map) = load_data('TPM', label_col=9,\ 82 | # transpose=True) 83 | 84 | # Load data (Linnarsson dataset) 85 | datafile, labels, meta = load_data(FLAGS.dataset, d_type='filtered',\ 86 | label_col=1, transpose=transp) 87 | 88 | # datafile_orig, labels, meta = load_data(FLAGS.dataset, d_type='filtered',\ 89 | # label_col=7, transpose=transp) 90 | 91 | print("Data Loaded. Duration:", time.time() - start_time) 92 | 93 | 94 | # ...... Receive/Set Metadata (Labels) ...... # 95 | 96 | mapped_labels_df, label_map = meta 97 | 98 | mapped_labels = np.reshape(mapped_labels_df.values,\ 99 | (mapped_labels_df.shape[0],)) 100 | 101 | num_classes = label_map.shape[0] 102 | 103 | # Print class statistics using ADASYN's function all_indices() 104 | print("\nClass Statistics:") 105 | 106 | for i in xrange(num_classes): 107 | print("{: >30}\t".format(label_map[i,0]),\ 108 | len(all_indices(i, mapped_labels.tolist()))) 109 | 110 | 111 | # ...... Class Balancing ...... # 112 | 113 | balanced_data = None 114 | recr_labels = None 115 | 116 | # "transp" is True if the flag "use_balanced" is True, False otherwise 117 | if transp: 118 | a = Adasyn(datafile, mapped_labels, label_map[:,1], beta=1) 119 | 120 | # Balance the data and collect them 121 | balanced_data, mapped_labels = a.balance_all() 122 | 123 | recr_labels = pd.DataFrame(data=mapped_labels) 124 | recr_labels = recr_labels.replace(label_map[:,1].tolist(),\ 125 | label_map[:,0].tolist()) 126 | 127 | # Control the transposition of the data if we use ADASYN or not 128 | data = balanced_data if transp else datafile 129 | 130 | # Save some space 131 | del(balanced_data) 132 | 133 | 134 | # ...... Data Normalization ...... # 135 | 136 | # Capture time for logging processing duration 137 | start_time = time.time() 138 | norm_data = normalize_data(data, transpose=transp) 139 | 140 | # Normalize the unbalanced data (experimenting) 141 | if transp: 142 | norm_orig = normalize_data(datafile, transpose=transp) 143 | else: 144 | norm_orig = norm_data 145 | 146 | # Save some space 147 | del(datafile) 148 | 149 | print("Data Normalized. Duration:", time.time() - start_time) 150 | 151 | 152 | # Get the number of existed features 153 | # (e.g. genes), in the data-set 154 | num_features = norm_data.shape[1] 155 | 156 | # Create the shape of the AutoEncoder 157 | sdae_shape = [num_features] + nHUnits + [num_classes] 158 | print(sdae_shape) 159 | 160 | 161 | # ...... Pre-training Phase ...... 
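# (In the pre-training phase below, SDAE.pretrain_sdae() trains the hidden
# layers greedily, one denoising autoencoder at a time, each taking as input
# the clean activations of the layer beneath it; see stacked_dae.py.)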
# 162 | 163 | # Get data-sets (train, test) for pretraining in a proper way 164 | data = load_data_sets_pretraining(norm_data, split_only=False) 165 | 166 | # Run pretraining step 167 | # TODO: Change function name to "fit()" 168 | sdae = SDAE.pretrain_sdae(input_x=data, shape=sdae_shape) 169 | 170 | # Save some space 171 | del(data) 172 | 173 | 174 | # Load another dataset to test it on the created model 175 | 176 | # sub_labels, _ = load_linarsson_labels(sub_labels=True) 177 | # data_an, labels_an, meta = load_extra('Allen',\ 178 | # 'TPM_common_ready_data.csv',\ 179 | # transpose=True, label_col=7) 180 | 181 | data_an, labels_an, meta = load_extra('Lin-Allen',\ 182 | 'Lin-Allen_compendium.csv',\ 183 | transpose=True, label_col=0) 184 | 185 | # Data Normalization 186 | data_an = normalize_data(data_an, transpose=False) 187 | data_an = np.transpose(data_an) 188 | 189 | # Get the labels 190 | mapped_an_df, l_map = meta 191 | mapped_an_labs = np.reshape(mapped_an_df.values,\ 192 | (mapped_an_df.shape[0],)) 193 | print(l_map) 194 | 195 | # Create comprehensive plots/graphs 196 | try: 197 | analyze(sdae, data_an, labels_an,\ 198 | bias_node=bias_node, prefix='Foreign_Pretraining') 199 | analyze(sdae, norm_orig, labels,\ 200 | bias_node=bias_node, prefix='Pretraining') 201 | except: 202 | pass 203 | # analyze(sdae, datafile_norm, recr_labels,\ 204 | # prefix='recr_Pretraining') 205 | # analyze(sdae, datafile_norm, sub_labels,\ 206 | # mapped_labels, prefix='recr_Pretraining') 207 | 208 | 209 | # ...... Fine-tuning Phase ...... # 210 | 211 | # Get data-sets (train, test) for finetuning in a proper way 212 | data = load_data_sets(norm_data, mapped_labels) 213 | 214 | # print("\nTotal Number of Examples:",\ 215 | # data.train.num_examples + data.test.num_examples) 216 | 217 | # Run finetuning step 218 | # TODO: Change function name to "finetune()" or similar 219 | sdae = SDAE.finetune_sdae(sdae=sdae, input_x=data,\ 220 | n_classes=num_classes,\ 221 | label_map=label_map[:,0]) 222 | 223 | # Save some space 224 | del(data) 225 | 226 | # Evaluate the results on a totally different data-set 227 | foreign_data = load_data_sets(data_an, mapped_an_labs, split_only=False) 228 | 229 | # TODO: make the "predict" function part of the Stacked_DAE class 230 | p, t = predict(sdae, foreign_data.all, bias_node=bias_node) 231 | p = pd.DataFrame(data=p).replace(l_map[:,1].tolist(), l_map[:,0].tolist()) 232 | t = pd.DataFrame(data=t).replace(l_map[:,1].tolist(), l_map[:,0].tolist()) 233 | print(p, t) 234 | p.to_csv(pjoin(FLAGS.output_dir, 'Predictions_of_Foreign.txt'), sep='\t') 235 | t.to_csv(pjoin(FLAGS.output_dir, 'True_labels_of_Foreign.txt'), sep='\t') 236 | 237 | # Save some space 238 | del(foreign_data) 239 | del(norm_data) 240 | 241 | # Create comprehensive plots/graphs 242 | # analyze(sdae, datafile_norm, recr_labels,\ 243 | # mapped_labels, prefix='recr_Finetuning') 244 | try: 245 | analyze(sdae, data_an, labels_an, mapped_labels,\ 246 | bias_node=bias_node, prefix='Foreign_Finetuning') 247 | analyze(sdae, norm_orig, labels, mapped_labels,\ 248 | bias_node=bias_node, prefix='Finetuning') 249 | except: 250 | pass 251 | 252 | # Print the used set up 253 | print_setup() 254 | 255 | # ...... The End ...... # 256 | 257 | 258 | def _check_and_clean_dir(d): 259 | """ 260 | Clears the given directory. 261 | """ 262 | if os.path.exists(d): 263 | shutil.rmtree(d) 264 | os.mkdir(d) 265 | 266 | 267 | def initialize(): 268 | """ 269 | Performs initialization of the directory environment. 
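    Creates the home, data and output directories if they are missing. If the
    output directory already contains files, the user is asked before it is
    wiped; the summary and checkpoint directories are always cleared and the
    per-layer checkpoint sub-directories are recreated.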
270 | """ 271 | home = home_out('') 272 | 273 | # Make sure core directories exist 274 | if not os.path.exists(home): 275 | os.makedirs(home) 276 | 277 | if not os.path.exists(_data_dir): 278 | os.mkdir(_data_dir) 279 | 280 | if not os.path.exists(_output_dir): 281 | os.makedirs(_output_dir) 282 | 283 | elif os.listdir(_output_dir): 284 | 285 | # If the output folder is not empty, Prompt before delete contents. 286 | var = raw_input("{0} {1}"\ 287 | .format("Output folder is not empty. Clean it?",\ 288 | "(This will delete every file in it.) y/N: ")) 289 | 290 | if var == 'y' or var == 'Y' or var == '1': 291 | _check_and_clean_dir(_output_dir) 292 | else: 293 | exit("Exiting... Please save your former \ 294 | output data and restart SDAE.") 295 | else: 296 | _check_and_clean_dir(_output_dir) 297 | 298 | # Clean the rest directories 299 | _check_and_clean_dir(_summary_dir) 300 | _check_and_clean_dir(_chkpt_dir) 301 | 302 | # Create checkpoint directories (depricated) 303 | os.mkdir(os.path.join(_chkpt_dir, '1')) 304 | os.mkdir(os.path.join(_chkpt_dir, '2')) 305 | os.mkdir(os.path.join(_chkpt_dir, '3')) 306 | os.mkdir(os.path.join(_chkpt_dir, 'fine_tuning')) 307 | 308 | 309 | def analyze(sdae, datafile_norm,\ 310 | labels, mapped_labels=None,\ 311 | bias_node=False, prefix=None): 312 | 313 | """ 314 | Speeks to R, and submits it analysis jobs. 315 | """ 316 | 317 | # Get some R functions on the Python environment 318 | def_colors = robjects.globalenv['def_colors'] 319 | do_analysis = robjects.globalenv['do_analysis'] 320 | 321 | # labels.reset_index(level=0, inplace=True) 322 | def_colors(labels) 323 | act = np.float32(datafile_norm) 324 | 325 | try: 326 | do_analysis(act, sdae.get_weights, sdae.get_biases,\ 327 | pjoin(FLAGS.output_dir, "{}_R_Layer_".format(prefix)),\ 328 | bias_node=bias_node) 329 | except RRuntimeError as e: 330 | pass 331 | 332 | # for layer in sdae.get_layers: 333 | # fixed = False if layer.which > sdae.nHLayers - 1 else True 334 | # 335 | # try: 336 | # act = sdae.get_activation(act, layer.which, use_fixed=fixed) 337 | # print("Analysis for layer {}:".format(layer.which + 1)) 338 | # temp = pd.DataFrame(data=act) 339 | # do_analysis(temp, pjoin(FLAGS.output_dir,\ 340 | # "{}_Layer_{}"\ 341 | # .format(prefix, layer.which))) 342 | # 343 | # # if not fixed: 344 | # # weights = sdae.get_weights[layer.which] 345 | # # for node in weights.transpose(): 346 | # # sns.distplot(node, kde=False,\ 347 | # fit=stats.gamma, rug=True); 348 | # # sns.plt.show() 349 | # try: 350 | # plot_tSNE(act, mapped_labels,\ 351 | # plot_name="Pyhton_{}_tSNE_layer_{}"\ 352 | # .format(prefix, layer.which)) 353 | # except IndexError as e: 354 | # pass 355 | # except FailedPreconditionError as e: 356 | # break 357 | 358 | 359 | def print_setup(): 360 | nHLay = FLAGS.num_hidden_layers 361 | nHUnits = [getattr(FLAGS, "hidden{0}_units"\ 362 | .format(j + 1)) for j in xrange(nHLay)] 363 | l_rates = [getattr(FLAGS, "pre_layer{}_learning_rate"\ 364 | .format(i)) for i in xrange(1,nHLay+1)] 365 | noise_ratios = [getattr(FLAGS, "noise_{0}"\ 366 | .format(i)) for i in xrange(1,nHLay+1)] 367 | 368 | print("\nConfiguration:") 369 | print("\n{: >45}\t".format("Dataset:"), FLAGS.dataset) 370 | print("\n{: >45}\t".format("Use Bias Node:"), FLAGS.bias_node) 371 | print("{: >45}\t".format("# Hidden Layers:"), nHLay) 372 | print("{: >45}\t".format("# Hidden Units:"), nHUnits) 373 | print("{: >45}\t".format("Noise Ratio (per layer):"),\ 374 | [row[0] for row in noise_ratios]) 375 | print("{: >45}\t".format("Noise 
Type (MN, SP, TFDO):"),\ 376 | [row[1] for row in noise_ratios]) 377 | 378 | if FLAGS.emphasis: 379 | print("{: >45}\t"\ 380 | .format("Emphasis (Double, Full, No):"),\ 381 | FLAGS.emphasis_type) 382 | else: 383 | print("{: >45}\t"\ 384 | .format("Emphasis (Double, Full, No):"), "No") 385 | 386 | print("{: >45}\t"\ 387 | .format("Unsupervised Learning Rate (per layer?):"),\ 388 | l_rates) 389 | 390 | print("{: >45}\t"\ 391 | .format("Supervised Learning Rate:"),\ 392 | FLAGS.supervised_learning_rate) 393 | 394 | print("{: >45}\t".format("Batch size:"),\ 395 | FLAGS.batch_size) 396 | 397 | print("{: >45}\t"\ 398 | .format("# Pretraining epochs:"),\ 399 | FLAGS.pretraining_epochs) 400 | 401 | print("{: >45}\t".format("# Finetuning epochs:"),\ 402 | FLAGS.finetuning_epochs) 403 | # Activation Function (Sigmoid, Tanh, ReLU) 404 | # Weight Initialization (Sigmoid, Tanh, ReLU) 405 | # Loss Function (X-Entropy, sum of sq. error) 406 | 407 | 408 | if __name__ == '__main__': 409 | total_time = time.time() 410 | main() 411 | print("\n{}".format(time.strftime("%Y-%m-%d %H:%M:%S"))) 412 | print("Total time:", time.time() - total_time) 413 | 414 | -------------------------------------------------------------------------------- /Train_SDAE/stacked_dae.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import time 6 | import sklearn 7 | 8 | from sklearn.metrics import precision_score, confusion_matrix 9 | from sklearn.metrics import recall_score, f1_score, roc_curve 10 | 11 | from dae import DAE_Layer 12 | from os.path import join as pjoin 13 | 14 | #from utils import load_data_sets_pretraining, write_csv 15 | from tools.utils import fill_feed_dict, fill_feed_dict_dae 16 | from tools.evaluate import do_eval_summary, evaluation, do_eval 17 | from tools.config import FLAGS 18 | from tools.visualize import make_heatmap 19 | from tensorflow.python.framework.errors import FailedPreconditionError 20 | 21 | 22 | class Stacked_DAE(object): 23 | 24 | def __init__(self, net_shape, session=None, selfish_layers=False): 25 | """ Stack De-noising Autoencoder (SDAE) initialization 26 | 27 | Args: 28 | net_shape: The network architecture of the SDAE 29 | session : The tensorflow session 30 | selfish_layers: Whether the layers are going to be trained individually 31 | or dependent to the direct output of the previous layer 32 | (Theoretically: using it is faster, but memory costly) 33 | Tips: 34 | Using selfish_layers needs some extra handling. 35 | * Feed each individual De-noising Autoencoder (DAE) directly. 36 | (e.g. feed_dict = {sdae.get_layers[i]._x : input_data}) 37 | * Reassign/Reload the input data-set with the data-set for the next 38 | layer, obtained by using the genrate_next_dataset() function. 39 | (e.g. 
in this case load_data_sets_pretraining(next_dataset, split_only=False)) 40 | """ 41 | self._sess = session 42 | self._net_shape = net_shape 43 | self.nHLayers = len(self._net_shape) - 2 44 | self._selfish_layers = selfish_layers 45 | self.loss_summaries = None 46 | 47 | if self._selfish_layers: 48 | self._x = None 49 | self._y_dataset = {} 50 | else: 51 | self._x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, self._net_shape[0]), name='dae_input_layer') 52 | 53 | self._dae_layers = [] 54 | self._weights = [] 55 | self._biases = [] 56 | self.weights = [] 57 | self.biases = [] 58 | self._create_network() 59 | 60 | def _create_network(self): 61 | is_last_layer = False 62 | for layer in xrange(self.nHLayers + 1): 63 | with tf.name_scope("Layer_{0}".format(layer)): 64 | if self._selfish_layers: 65 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, self._net_shape[layer]), name='dae_input_from_layer_{0}'.format(layer)) 66 | self._y_dataset[layer] = [] 67 | else: 68 | if layer == 0: 69 | x = self._x 70 | else: 71 | x = self._dae_layers[layer-1].clean_activation() 72 | # x = self._dae_layers[layer-1].get_representation_y 73 | 74 | new_x = tf.identity(x) 75 | 76 | if layer == self.nHLayers: 77 | is_last_layer = True 78 | 79 | if FLAGS.bias_node and layer < self.nHLayers: 80 | # Add bias node (experimental) 81 | bias_node = tf.ones(shape=[FLAGS.batch_size, 1], dtype=tf.float32) 82 | new_x = tf.concat(1, [bias_node, x]) 83 | 84 | dae_layer = DAE_Layer(in_data=new_x, prev_layer_size=self._net_shape[layer], 85 | next_layer_size=self._net_shape[layer+1], nth_layer=layer+1, 86 | last_layer=is_last_layer) 87 | 88 | self._dae_layers.append(dae_layer) 89 | 90 | @property 91 | def session(self): 92 | return self._sess 93 | 94 | @property 95 | def get_layers(self): 96 | return self._dae_layers 97 | 98 | @property 99 | def get_weights(self): 100 | # if len(self.weights) != self.nHLayers + 1: 101 | self.weights = [] 102 | for n in xrange(self.nHLayers + 1): 103 | if self.get_layers[n].get_w: 104 | try: 105 | self.weights.append(self.session.run(self.get_layers[n].get_w)) 106 | except FailedPreconditionError: 107 | break 108 | else: 109 | break 110 | 111 | return self.weights 112 | 113 | @property 114 | def get_biases(self): 115 | # if len(self.biases) != self.nHLayers + 1: 116 | self.biases = [] 117 | for n in xrange(self.nHLayers + 1): 118 | if self.get_layers[n].get_b: 119 | try: 120 | self.biases.append(self.session.run(self.get_layers[n].get_b)) 121 | except FailedPreconditionError: 122 | break 123 | else: 124 | break 125 | 126 | return self.biases 127 | 128 | def get_activation(self, x, layer, use_fixed=True): 129 | return self.session.run(self.get_layers[layer].clean_activation(x_in=x, use_fixed=use_fixed)) 130 | # return self.session.run(tf.sigmoid(tf.nn.bias_add(tf.matmul(x, self.get_weights[layer]), self.get_biases[layer]), name='activate')) 131 | 132 | def train(self, cost, layer=None): 133 | # with tf.name_scope("Training"): 134 | # Add a scalar summary for the snapshot loss. 135 | self.loss_summaries = tf.scalar_summary(cost.op.name, cost) 136 | 137 | if layer is None: 138 | lr = FLAGS.supervised_learning_rate 139 | else: 140 | lr = self.get_layers[layer]._l_rate 141 | 142 | # Create the gradient descent optimizer with the given learning rate. 143 | optimizer = tf.train.GradientDescentOptimizer(lr) 144 | 145 | # Create a variable to track the global step. 
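        # (The step counter is passed to optimizer.minimize() below, which
        # increments it once per completed training step.)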
146 | global_step = tf.Variable(0, trainable=False, name='global_step') 147 | 148 | # Use the optimizer to apply the gradients that minimize the loss 149 | # (and also increment the global step counter) as a single training step. 150 | train_op = optimizer.minimize(cost, global_step=global_step) 151 | return train_op, global_step 152 | 153 | def calc_last_x(self, X, bias_node=False): 154 | tmp = X 155 | for layer in self.get_layers: 156 | if bias_node: 157 | bias_n = tf.ones(shape=[FLAGS.batch_size, 1], dtype=tf.float32) 158 | tmp = tf.concat(1, [bias_n, tmp]) 159 | tmp = layer.clean_activation(x_in=tmp, use_fixed=False) 160 | # print(tmp, self._net_shape[-2], self._net_shape[-1]) 161 | # dae_layer = DAE_Layer(in_data=tmp, prev_layer_size=self._net_shape[-2], 162 | # next_layer_size=self._net_shape[-1], nth_layer=len(self._net_shape)-1, 163 | # last_layer=True) 164 | # 165 | # self._dae_layers.append(dae_layer) 166 | # tmp = self.get_layers[-1].clean_activation(x_in=tmp, use_fixed=False) 167 | 168 | return tmp 169 | 170 | def add_final_layer(self, input_x, bias_node=False): 171 | last_x = self.calc_last_x(input_x, bias_node=bias_node) 172 | print "Last layer added:", last_x.get_shape() 173 | return last_x 174 | 175 | # def finetune_net(self): 176 | # last_output = self._x 177 | # 178 | # for layer in xrange(self.nHLayers + 1): 179 | # w = self.get_layers[layer] 180 | 181 | def genrate_next_dataset(self, from_dataset, layer): 182 | """ Generate next data-set 183 | Note: This function has a meaning only if selfish layers are in use. 184 | It takes as input the data-set and transforms it using the previously 185 | trained layer in order to obtain it's output. The output of that layer 186 | is saved as a data-set to be used as input for the next one. 187 | 188 | Args: 189 | from_dataset: The data-set you want to transform (usually 190 | the one that the previous layer is trained on) 191 | layer : The layer to be used for the data transformation 192 | Returns: 193 | numpy array: The new data-set to be used for the next layer 194 | """ 195 | if self._selfish_layers: 196 | for _ in xrange(from_dataset.num_batches): 197 | feed_dict = fill_feed_dict_dae(from_dataset, self.get_layers[layer]._x) 198 | 199 | y = self.session.run(self.get_layers[layer].clean_activation(), feed_dict=feed_dict) 200 | for j in xrange(np.asarray(y).shape[0]): 201 | self._y_dataset[layer].append(y[j]) 202 | 203 | return np.asarray(self._y_dataset[layer]) 204 | else: 205 | print "Note: This function has a meaning only if selfish layers are in use." 
206 | return None 207 | 208 | def pretrain_sdae(input_x, shape): 209 | with tf.Graph().as_default():# as g: 210 | sess = tf.Session() 211 | 212 | sdae = Stacked_DAE(net_shape=shape, session=sess, selfish_layers=False) 213 | 214 | for layer in sdae.get_layers[:-1]: 215 | with tf.variable_scope("pretrain_{0}".format(layer.which)): 216 | cost = layer.get_loss 217 | train_op, global_step = sdae.train(cost, layer=layer.which) 218 | 219 | summary_dir = pjoin(FLAGS.summary_dir, 'pretraining_{0}'.format(layer.which)) 220 | summary_writer = tf.train.SummaryWriter(summary_dir, graph_def=sess.graph_def, flush_secs=FLAGS.flush_secs) 221 | summary_vars = [layer.get_w_b[0], layer.get_w_b[1]] 222 | 223 | hist_summarries = [tf.histogram_summary(v.op.name, v) for v in summary_vars] 224 | hist_summarries.append(sdae.loss_summaries) 225 | summary_op = tf.merge_summary(hist_summarries) 226 | 227 | ''' 228 | You can get all the trainable variables using tf.trainable_variables(), 229 | and exclude the variables which should be restored from the pretrained model. 230 | Then you can initialize the other variables. 231 | ''' 232 | 233 | layer.vars_to_init.append(global_step) 234 | sess.run(tf.initialize_variables(layer.vars_to_init)) 235 | 236 | print("\n\n") 237 | print "| Layer | Epoch | Step | Loss |" 238 | 239 | for step in xrange(FLAGS.pretraining_epochs * input_x.train.num_examples): 240 | feed_dict = fill_feed_dict_dae(input_x.train, sdae._x) 241 | 242 | loss, _ = sess.run([cost, train_op], feed_dict=feed_dict) 243 | 244 | if step % 1000 == 0: 245 | summary_str = sess.run(summary_op, feed_dict=feed_dict) 246 | summary_writer.add_summary(summary_str, step) 247 | 248 | output = "| Layer {0} | Epoch {1} | {2:>6} | {3:10.4f} |"\ 249 | .format(layer.which, step // input_x.train.num_examples + 1, step, loss) 250 | print output 251 | 252 | # Note: Use this style if you are using the shelfish_layer choice. 253 | # This way you keep the activated data to be fed to the next layer. 254 | # next_dataset = sdae.genrate_next_dataset(from_dataset=input_x.all, layer=layer.which) 255 | # input_x = load_data_sets_pretraining(next_dataset, split_only=False) 256 | 257 | # Save Weights and Biases for all layers 258 | for n in xrange(len(shape) - 2): 259 | w = sdae.get_layers[n].get_w 260 | b = sdae.get_layers[n].get_b 261 | W, B = sess.run([w, b]) 262 | 263 | np.savetxt(pjoin(FLAGS.output_dir, 'Layer_' + str(n) + '_Weights.txt'), np.asarray(W), delimiter='\t') 264 | np.savetxt(pjoin(FLAGS.output_dir, 'Layer_' + str(n) + '_Biases.txt'), np.asarray(B), delimiter='\t') 265 | make_heatmap(W, 'weights_'+ str(n)) 266 | 267 | print "\nPretraining Finished...\n" 268 | return sdae 269 | 270 | 271 | 272 | def finetune_sdae(sdae, input_x, n_classes, label_map): 273 | print "Starting Fine-tuning..." 
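    # Fine-tuning reuses the pre-trained stack: a final layer is added on top
    # (add_final_layer), the whole network is trained on the labelled data
    # with a softmax cross-entropy loss (loss_supervised), and the fine-tuned
    # weights and biases are written out per layer at the end.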
274 | sess = sdae.session 275 | with sess.graph.as_default(): 276 | 277 | n_features = sdae._net_shape[0] 278 | 279 | x_pl = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, n_features), name='input_pl') 280 | labels_pl = tf.placeholder(tf.int32, shape=FLAGS.batch_size, name='labels_pl') 281 | labels = tf.identity(labels_pl) 282 | 283 | # Get the supervised fine tuning net 284 | logits = sdae.add_final_layer(x_pl, bias_node=FLAGS.bias_node) 285 | # logits = sdae.finetune_net(input_x) 286 | loss = loss_supervised(logits, labels_pl, n_classes) 287 | 288 | train_op, _ = sdae.train(loss) 289 | eval_correct, corr, y_pred = evaluation(logits, labels_pl) 290 | 291 | hist_summaries = [layer.get_w for layer in sdae.get_layers] 292 | hist_summaries.extend([layer.get_b for layer in sdae.get_layers]) 293 | 294 | hist_summaries = [tf.histogram_summary(v.op.name + "_fine_tuning", v) for v in hist_summaries] 295 | 296 | summary_op = tf.merge_summary(hist_summaries) 297 | 298 | summary_writer = tf.train.SummaryWriter(pjoin(FLAGS.summary_dir, 'fine_tuning'), 299 | graph_def=sess.graph_def, 300 | flush_secs=FLAGS.flush_secs) 301 | 302 | sess.run(tf.initialize_all_variables()) 303 | 304 | steps = FLAGS.finetuning_epochs * input_x.train.num_examples 305 | for step in xrange(steps): 306 | start_time = time.time() 307 | 308 | feed_dict = fill_feed_dict(input_x.train, x_pl, labels_pl) 309 | 310 | _, loss_value, ev_corr, c, y_true = sess.run([train_op, loss, eval_correct, corr, labels], feed_dict=feed_dict) 311 | 312 | duration = time.time() - start_time 313 | 314 | # Write the summaries and print an overview fairly often. 315 | if step % 1000 == 0: 316 | # Print status to stdout. 317 | print "\nLoss: ", loss_value 318 | # print "Eval corr:", ev_corr 319 | # print "Correct:", c 320 | # print "Y_pred:", y_pred 321 | # print "Label_pred:", y_true 322 | 323 | # y_true = np.argmax(labels_pl, 0) 324 | 325 | 326 | print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)) 327 | 328 | print 'Evaluation Sum:', ev_corr, '/', len(c) 329 | # print('Evaluation Corrects:', eval_corr) 330 | # print('Logits:', lgts) 331 | print "---------------" 332 | 333 | # Update the events file. 
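                # (Event files go to <summary_dir>/fine_tuning and can be
                # inspected with TensorBoard while training runs.)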
334 | summary_str = sess.run(summary_op, feed_dict=feed_dict) 335 | summary_writer.add_summary(summary_str, step) 336 | 337 | if (step + 1) % 1000 == 0 or (step + 1) == steps: 338 | train_sum = do_eval_summary("training_error", 339 | sess, 340 | eval_correct, 341 | x_pl, 342 | labels_pl, 343 | input_x.train) 344 | 345 | if input_x.validation is not None: 346 | val_sum = do_eval_summary("validation_error", 347 | sess, 348 | eval_correct, 349 | x_pl, 350 | labels_pl, 351 | input_x.validation) 352 | 353 | test_sum = do_eval_summary("test_error", 354 | sess, 355 | eval_correct, 356 | x_pl, 357 | labels_pl, 358 | input_x.test) 359 | 360 | summary_writer.add_summary(train_sum, step) 361 | if input_x.validation is not None: 362 | summary_writer.add_summary(val_sum, step) 363 | summary_writer.add_summary(test_sum, step) 364 | 365 | for n in xrange(len(sdae._net_shape) - 1): 366 | w = sdae.get_layers[n].get_w 367 | b = sdae.get_layers[n].get_b 368 | W, B = sess.run([w, b]) 369 | 370 | np.savetxt(pjoin(FLAGS.output_dir, 'Finetuned_Layer_' + str(n) + '_Weights.txt'), np.asarray(W), delimiter='\t') 371 | np.savetxt(pjoin(FLAGS.output_dir, 'Finetuned_Layer_' + str(n) + '_Biases.txt'), np.asarray(B), delimiter='\t') 372 | make_heatmap(W, 'Finetuned_weights_'+ str(n)) 373 | 374 | do_eval(sess, eval_correct, y_pred, x_pl, labels_pl, label_map, input_x.train, title='Final_Train') 375 | do_eval(sess, eval_correct, y_pred, x_pl, labels_pl, label_map, input_x.test, title='Final_Test') 376 | if input_x.validation is not None: 377 | do_eval(sess, eval_correct, y_pred, x_pl, labels_pl, label_map, input_x.validation, title='Final_Validation') 378 | 379 | print "Fine-tuning Finished..." 380 | return sdae 381 | 382 | 383 | def loss_supervised(logits, labels, num_classes): 384 | """Calculates the loss from the logits and the labels. 385 | 386 | Args: 387 | logits: Logits tensor, float - [batch_size, NUM_CLASSES]. 388 | labels: Labels tensor, int32 - [batch_size]. 389 | 390 | Returns: 391 | loss: Loss tensor of type float. 392 | """ 393 | 394 | # Convert from sparse integer labels in the range [0, NUM_CLASSSES) 395 | # to 1-hot dense float vectors (that is we will have batch_size vectors, 396 | # each with NUM_CLASSES values, all of which are 0.0 except there will 397 | # be a 1.0 in the entry corresponding to the label). 
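    # For example, with num_classes = 3 and labels = [0, 2, 1] the dense target
    # built below is [[1,0,0], [0,0,1], [0,1,0]]. A NumPy equivalent of the same
    # one-hot construction, for illustration only (`labels_list` is assumed to
    # be a plain Python list of int labels and is not part of this graph):
    '''
    onehot = np.zeros((len(labels_list), num_classes))
    onehot[np.arange(len(labels_list)), labels_list] = 1.0
    '''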
398 | batch_size = tf.size(labels) 399 | labels = tf.expand_dims(labels, 1) 400 | 401 | indices = tf.expand_dims(tf.range(0, batch_size), 1) 402 | concated = tf.concat(1, [indices, labels]) 403 | onehot_labels = tf.sparse_to_dense(concated, tf.pack([batch_size, num_classes]), 1.0, 0.0) 404 | 405 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, onehot_labels, name='xentropy') 406 | 407 | loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') 408 | return loss 409 | -------------------------------------------------------------------------------- /Train_SDAE/test_dae.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import os 4 | import shutil 5 | import sys 6 | import time 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from os.path import join as pjoin 11 | from tools.config import FLAGS, home_out 12 | from tools.dae import DAE_Layer 13 | from tools.start_tensorboard import start 14 | from tools.data_handler import load_data 15 | from tools.utils import fill_feed_dict_dae 16 | from tools.utils import load_data_sets_pretraining 17 | from tools.utils import normalize_data, label_metadata 18 | from tools.visualize import hist_comparison 19 | 20 | _data_dir = FLAGS.data_dir 21 | _output_dir = FLAGS.output_dir 22 | _summary_dir = FLAGS.summary_dir 23 | _chkpt_dir = FLAGS.chkpt_dir 24 | 25 | def _check_and_clean_dir(d): 26 | if os.path.exists(d): 27 | shutil.rmtree(d) 28 | os.mkdir(d) 29 | 30 | 31 | def main(): 32 | home = home_out('') 33 | if not os.path.exists(home): 34 | os.makedirs(home) 35 | if not os.path.exists(_data_dir): 36 | os.mkdir(_data_dir) 37 | # os.makedirs also an option 38 | 39 | 40 | if not os.path.exists(_output_dir): 41 | os.mkdir(_output_dir) 42 | elif os.listdir(_output_dir): 43 | var = raw_input("Output folder is not empty. Clean it? (This will delete every file in it.) y/N: ") 44 | if var == 'y' or var == 'Y' or var == '1': 45 | _check_and_clean_dir(_output_dir) 46 | else: 47 | exit("Exiting... Please save your former output data and restart SDAE.") 48 | else: 49 | _check_and_clean_dir(_output_dir) 50 | 51 | _check_and_clean_dir(_summary_dir) 52 | _check_and_clean_dir(_chkpt_dir) 53 | 54 | os.mkdir(os.path.join(_chkpt_dir, '1')) 55 | os.mkdir(os.path.join(_chkpt_dir, '2')) 56 | os.mkdir(os.path.join(_chkpt_dir, '3')) 57 | os.mkdir(os.path.join(_chkpt_dir, 'fine_tuning')) 58 | 59 | start() 60 | 61 | start_time = time.time() 62 | datafile = load_data('RPKM', transpose=False) 63 | labelfile = load_data('Labels') 64 | print("Data Loaded. Duration:", time.time() - start_time) 65 | 66 | # Data Normalization 67 | datafile_norm = normalize_data(datafile, transpose=False) 68 | 69 | # Get data-sets (train, test) in a proper way 70 | data = load_data_sets_pretraining(datafile_norm, split_only=False) 71 | 72 | # Get Label Metadata 73 | mapped_labels, label_map = label_metadata(label_matrix=labelfile, label_col=7) 74 | num_classes = label_map.shape[0] 75 | 76 | nHLay = FLAGS.num_hidden_layers 77 | nHUnits = [getattr(FLAGS, "hidden{0}_units".format(j + 1)) for j in xrange(nHLay)] 78 | 79 | # Get the number of existed features (e.g. 
genes) in the data-set 80 | num_features = datafile_norm.shape[1] 81 | # Create the shape of the AutoEncoder 82 | sdae_shape = [num_features] + nHUnits + [num_classes] 83 | 84 | with tf.Graph().as_default() as g: 85 | sess = tf.Session() 86 | 87 | y_all = {} 88 | for layer in xrange(3): 89 | y_all[layer] = [] 90 | 91 | if layer == 0: 92 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, num_features), name='dae_input_from_layer_{0}'.format(layer)) 93 | dae = DAE_Layer(in_data=x, prev_layer_size=num_features, next_layer_size=FLAGS.hidden1_units, nth_layer=layer+1, last_layer=False) 94 | elif layer == 1: 95 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, FLAGS.hidden1_units), name='dae_input_from_layer_{0}'.format(layer)) 96 | dae = DAE_Layer(in_data=x, prev_layer_size=FLAGS.hidden1_units, next_layer_size=FLAGS.hidden2_units, nth_layer=layer+1, last_layer=False) 97 | else: 98 | x = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, FLAGS.hidden2_units), name='dae_input_from_layer_{0}'.format(layer)) 99 | dae = DAE_Layer(in_data=x, prev_layer_size=FLAGS.hidden2_units, next_layer_size=num_classes, nth_layer=layer+1, last_layer=False)# or True 100 | 101 | cost = dae.get_loss 102 | 103 | with tf.variable_scope("pretrain_{0}".format(layer+1)): 104 | train_op, global_step, loss_summaries = train(cost) 105 | 106 | summary_dir = pjoin(FLAGS.summary_dir, 'pretraining_{0}'.format(layer+1)) 107 | summary_writer = tf.train.SummaryWriter(summary_dir, graph_def=sess.graph_def, flush_secs=FLAGS.flush_secs) 108 | summary_vars = [dae.get_w_b[0], dae.get_w_b[1]] 109 | 110 | hist_summarries = [tf.histogram_summary(v.op.name, v) for v in summary_vars] 111 | hist_summarries.append(loss_summaries) 112 | summary_op = tf.merge_summary(hist_summarries) 113 | 114 | dae.vars_to_init.append(global_step) 115 | sess.run(tf.initialize_variables(dae.vars_to_init)) 116 | 117 | print "| Layer | Epoch | Cost | Step |" 118 | print data.train.num_examples 119 | print data.all.num_examples 120 | 121 | for step in xrange(FLAGS.pretraining_epochs):# * data.train.num_examples): 122 | # for i in xrange(data.train.num_examples): 123 | 124 | feed_dict = fill_feed_dict_dae(data.train, x) 125 | 126 | # if layer == 0: 127 | c, _, y, z, w, b_in, b_out = sess.run([cost, train_op, dae.get_representation_y, dae.get_reconstruction_z, dae.get_w_b[0], dae.get_w_b[1], dae.get_b_recon], feed_dict=feed_dict) 128 | # else: 129 | # c, _, gs, y, z, w, b_in, b_out = sess.run([cost, train_op, g_step, dae.get_representation_y, dae.get_reconstruction_z, dae.get_w_and_biases[0], dae.get_w_and_biases[1], dae.get_w_and_biases[2]], feed_dict=fill_feed_dict_dae(data.train, x)) 130 | 131 | if step % 1 == 0: 132 | print '| ', layer+1, ' | ', step // data.train.num_examples + 1, ' | ', c, ' | ', step, ' |' 133 | summary_str = sess.run(summary_op, feed_dict=feed_dict) 134 | summary_writer.add_summary(summary_str, step) 135 | 136 | # 137 | # print np.asarray(y_all).shape 138 | # print np.asarray(y_all[layer]).shape 139 | 140 | for _ in xrange(data.all.num_batches): 141 | feed_dict = fill_feed_dict_dae(data.all, x) 142 | 143 | y = sess.run(dae.get_representation_y, feed_dict=feed_dict) 144 | for j in xrange(np.asarray(y).shape[0]): 145 | y_all[layer].append(y[j]) 146 | 147 | print np.asarray(y_all[layer]).shape 148 | data = load_data_sets_pretraining(np.asarray(y_all[layer]), split_only=False) 149 | 150 | print "Finished..." 
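# Usage sketch for the train() helper defined below: `layer` is the DAE_Layer
# whose _l_rate supplies the per-layer pretraining learning rate, while None
# falls back to FLAGS.supervised_learning_rate. Kept inside a string so it
# stays inert; `dae` and `cost` refer to the objects built in main() above.
'''
with tf.variable_scope("pretrain_example"):
    train_op, global_step, loss_summaries = train(dae, cost)
    sess.run(tf.initialize_variables([global_step]))
'''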
151 | 152 | def train(layer, cost): 153 | # with tf.name_scope("Training"): 154 | # Add a scalar summary for the snapshot loss. 155 | loss_summaries = tf.scalar_summary(cost.op.name, cost) 156 | 157 | if layer is None: 158 | lr = FLAGS.supervised_learning_rate 159 | else: 160 | lr = layer._l_rate 161 | 162 | # Create the gradient descent optimizer with the given learning rate. 163 | optimizer = tf.train.GradientDescentOptimizer(lr) 164 | 165 | # Create a variable to track the global step. 166 | global_step = tf.Variable(0, trainable=False, name='global_step') 167 | 168 | # Use the optimizer to apply the gradients that minimize the loss 169 | # (and also increment the global step counter) as a single training step. 170 | train_op = optimizer.minimize(cost, global_step=global_step) 171 | return train_op, global_step, loss_summaries 172 | 173 | if __name__ == '__main__': 174 | main() 175 | 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /Train_SDAE/tools/ADASYN.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import NearestNeighbors 2 | from random import choice 3 | 4 | ''' 5 | Created on 14-jun.-2013 6 | @author: Olivier.Janssens 7 | ''' 8 | 9 | ''' 10 | Modified on 24-March-2016 11 | @author: Anastasios Glaros 12 | ''' 13 | 14 | import numpy as np 15 | import random 16 | 17 | class Adasyn(object): 18 | def __init__(self, data, labels, classes, K=5, beta=1): 19 | self.X = data 20 | self.K = K 21 | self.beta = beta 22 | self.new_X, self.new_y = [], [] 23 | self.d, self.G = [], [] 24 | 25 | try: 26 | assert not isinstance(classes, list) 27 | self.classes = classes.tolist() 28 | except AssertionError as e: 29 | self.classs = classes 30 | 31 | try: 32 | assert not isinstance(labels, list) 33 | self.y = labels.tolist() 34 | except AssertionError as e: 35 | self.y = labels 36 | 37 | temp = [] 38 | for i in xrange(len(self.classes)): 39 | temp.append(len(all_indices(i, self.y))) 40 | 41 | self.majority_class = self.classes[temp.index(max(temp))] #np.where(np.asarray(temp)==max(temp))[0][0]] 42 | 43 | 44 | def balance_all(self): 45 | classes = np.copy(self.classes).tolist() 46 | classes.remove(self.majority_class) 47 | 48 | print "Classes:", classes 49 | 50 | # Loop for all the classes except the majority 51 | for class_i in classes: 52 | print "\nFor class: ", class_i 53 | ms, ml = self.get_class_count(self.X, self.y, class_i, self.majority_class) 54 | 55 | d = self.get_d(self.X, self.y, ms, ml) 56 | G = self.get_G(self.X, self.y, ms, ml, self.beta) 57 | 58 | rlist = self.get_Ris(self.X, self.y, class_i, self.K) 59 | # print("ms, ml, d, G, len(rlist): ", ms, ml, d, G, len(rlist)) 60 | 61 | new_X, new_y = self.generate_samples(rlist, self.X, self.y, G, class_i, self.K) 62 | print "Length of original_X, new_X:", ms, len(new_X) 63 | # print("shape of new_X, new_y:", new_X.shape, new_y.shape) 64 | self.new_X.append(new_X) 65 | self.new_y.append(new_y) 66 | 67 | return self.join_all_together() 68 | # X, y = self.join_with_the_rest(self.X, self.y, newX, newy, self.classes, class_i) 69 | 70 | def save_data(self, data_filename, label_filename): 71 | from tools.utils import write_csv 72 | import csv 73 | print(type(self.new_X), "saving...") 74 | with open(data_filename, "wb") as f: 75 | writer = csv.writer(f, delimiter='\t') 76 | writer.writerows(self.new_X) 77 | 78 | print("Saved.") 79 | # write_csv(data_filename, self.new_X) 80 | del(self.new_X) 81 | 82 | # with 
open(label_filename, "wb") as f: 83 | # writer = csv.writer(f, delimiter='\t') 84 | # writer.writerows(self.new_y) 85 | write_csv(label_filename, self.new_y) 86 | del(self.new_y) 87 | 88 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 89 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 90 | # @return ms: The amount of samples in the minority group 91 | # @return ms: The amount of samples in the majority group 92 | def get_class_count(self, X, y, minorityclass, majorityclass): 93 | indicesZero = all_indices(minorityclass, y) 94 | indicesOne = all_indices(majorityclass, y) 95 | 96 | if len(indicesZero) > len(indicesOne): 97 | ms = len(indicesOne) 98 | ml = len(indicesZero) 99 | else: 100 | ms = len(indicesZero) 101 | ml = len(indicesOne) 102 | return ms,ml 103 | 104 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 105 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 106 | # @param ms: The amount of samples in the minority group 107 | # @param ms: The amount of samples in the majority group 108 | # @return: The ratio between the minority and majority group 109 | def get_d(self, X,y,ms,ml): 110 | 111 | return float(ms)/float(ml) 112 | 113 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 114 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 115 | # @param ms: The amount of samples in the minority group 116 | # @param ms: The amount of samples in the majority group 117 | # @return: the G value, which indicates how many samples should be generated in total, this can be tuned with beta 118 | def get_G(self, X,y,ms,ml,beta): 119 | return (ml-ms)*beta 120 | 121 | 122 | # @param: X The datapoints e.g.: [f1, f2, ... ,fn] 123 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 124 | # @param: minorityclass: The minority class 125 | # @param: K: The amount of neighbours for Knn 126 | # @return: rlist: List of r values 127 | def get_Ris(self, X,y, minorityclass=0, K=5): 128 | indicesMinority = all_indices(minorityclass,y) 129 | ymin = np.array(y)[indicesMinority] 130 | Xmin = np.array(X)[indicesMinority] 131 | neigh = NearestNeighbors(n_neighbors=30,algorithm = 'ball_tree') 132 | neigh.fit(X) 133 | 134 | # print "Shapes:", Xmin[0].shape, Xmin[0].reshape(1,-1).shape 135 | 136 | rlist = [0]*len(ymin) 137 | normalizedrlist = [0]*len(ymin) 138 | 139 | classes = np.copy(self.classes).tolist() 140 | classes.remove(minorityclass) 141 | 142 | for i in xrange(len(ymin)): 143 | indices = neigh.kneighbors(Xmin[i].reshape(1,-1), K, False) 144 | 145 | #print ">", len(all_indices_multi(classes, np.array(y)[indices].tolist()[0])) 146 | rlist[i] = float(len(all_indices_multi(classes, np.array(y)[indices].tolist()[0]))) / K 147 | 148 | 149 | normConst = sum(rlist) 150 | 151 | try: 152 | for j in xrange(len(rlist)): 153 | normalizedrlist[j] = (rlist[j]/normConst) 154 | except ZeroDivisionError as e: 155 | normalizedrlist = rlist 156 | print(rlist) 157 | 158 | return normalizedrlist 159 | 160 | 161 | # @param: rlist: List of r values 162 | # @param: X The datapoints e.g.: [f1, f2, ... 
,fn] 163 | # @param: y the classlabels e.g: [0,1,1,1,0,...,Cn] 164 | # @return: the G value, which indicates how many samples should be generated in total, this can be tuned with beta 165 | # @param: minorityclass: The minority class 166 | # @param: K: The amount of neighbours for Knn 167 | # @return: The synthetic data samples 168 | def generate_samples(self, rlist,X,y,G,minorityclasslabel,K): 169 | syntheticdata = [] 170 | 171 | indicesMinority = all_indices(minorityclasslabel,y) 172 | ymin = np.array(y)[indicesMinority] 173 | Xmin = np.array(X)[indicesMinority] 174 | 175 | # print "Xmin shape: ", Xmin.shape, ", len of ymin:", len(ymin) 176 | 177 | neigh = NearestNeighbors(n_neighbors=30,algorithm = 'ball_tree') 178 | neigh.fit(Xmin) 179 | gsum=0 180 | for k in xrange(len(ymin)): 181 | g = int(np.round(rlist[k]*G)) 182 | #print g, "= int round ", rlist[k], "*", G 183 | gsum += g 184 | for l in xrange(g): 185 | ind = random.choice(neigh.kneighbors(Xmin[k].reshape(1,-1),K,False)[0]) 186 | s = Xmin[k] + (Xmin[ind]-Xmin[k]) * random.random() 187 | syntheticdata.append(s) 188 | 189 | # print "synthetic shape: ", np.asarray(syntheticdata).shape, ", gsum:", gsum 190 | 191 | try: 192 | new_data = np.concatenate((syntheticdata, Xmin),axis=0) 193 | new_y = [minorityclasslabel] * len(new_data) 194 | except ValueError as e: 195 | new_data = Xmin 196 | new_y = ymin 197 | 198 | return new_data, new_y 199 | 200 | def join_all_together(self): 201 | X_all, y_all = [], [] 202 | classes = np.copy(self.classes).tolist() 203 | classes.remove(self.majority_class) 204 | print "\nJoining Original and Synthetic datasets..." 205 | # Loop for all classes except 1 (the majority class) 206 | for i, class_i in zip(xrange(len(self.classes) - 1), classes): 207 | classes_no_minor = np.copy(self.classes).tolist() 208 | classes_no_minor.remove(class_i) 209 | # print i, class_i, classes_no_minor 210 | 211 | if i == 0: 212 | indicesMajority = all_indices_multi(classes_no_minor, self.y) 213 | ymaj = np.array(self.y)[indicesMajority] 214 | Xmaj = np.array(self.X)[indicesMajority] 215 | # print "Indices_Majority:", len(indicesMajority), "len ymaj:", len(ymaj), "len Xmaj:", len(Xmaj), "len self.new_X:", len(self.new_X) 216 | 217 | # X_all = np.concatenate((Xmaj, self.new_X[i]), axis=0) 218 | # y_all = np.concatenate((ymaj, self.new_y[i]), axis=0) 219 | else: 220 | indicesMajority = all_indices_multi(classes_no_minor, y_all.tolist()) 221 | ymaj = y_all[indicesMajority] 222 | Xmaj = X_all[indicesMajority] 223 | # print "Indices_Majority:", len(indicesMajority), "len ymaj:", len(ymaj), "len Xmaj:", len(Xmaj), "len self.new_X:", len(self.new_X) 224 | 225 | # X_all = np.concatenate((X_all, np.concatenate((Xmaj, self.new_X[i]), axis=0)), axis=0) 226 | # y_all = np.concatenate((y_all, np.concatenate((ymaj, self.new_y[i]), axis=0)), axis=0) 227 | 228 | X_all = np.concatenate((Xmaj, self.new_X[i]), axis=0) 229 | y_all = np.concatenate((ymaj, self.new_y[i]), axis=0) 230 | print "Joined. 
Length of X_all and y_all:", len(X_all), len(y_all) 231 | 232 | return X_all, y_all 233 | 234 | def join_with_the_rest(self, X,y,newData,newy,classes, minorityclass): 235 | classes.remove(minorityclass) 236 | indicesMajority = all_indices_multi(classes, y) 237 | ymaj = np.array(y)[indicesMajority] 238 | Xmaj = np.array(X)[indicesMajority] 239 | 240 | return np.concatenate((Xmaj,newData),axis=0), np.concatenate((ymaj,newy),axis=0) 241 | 242 | def joinwithmajorityClass(self, X,y,newData,newy,majorityclasslabel): 243 | indicesMajority = all_indices(majorityclasslabel,y) 244 | ymaj = np.array(y)[indicesMajority] 245 | Xmaj = np.array(X)[indicesMajority] 246 | 247 | return np.concatenate((Xmaj,newData),axis=0),np.concatenate((ymaj,newy),axis=0) 248 | 249 | # @param value: The classlabel 250 | # @param qlist: The list in which to search 251 | # @return: the indices of the values that are equal to the classlabel 252 | def all_indices(value, qlist): 253 | indices = [] 254 | idx = -1 255 | while True: 256 | try: 257 | idx = qlist.index(value, idx+1) 258 | indices.append(idx) 259 | except ValueError: 260 | break 261 | return indices 262 | 263 | 264 | # @param values: The classlabels except the minority's class 265 | # @param qlist: The list in which to search 266 | # @return: the indices of the values that are equal to the classlabel 267 | def all_indices_multi(values, qlist): 268 | indices = [] 269 | for i in xrange(len(values)): 270 | idx = -1 271 | flag = True 272 | while flag: 273 | try: 274 | idx = qlist.index(values[i], idx+1) 275 | indices.append(idx) 276 | except ValueError: 277 | flag = False 278 | return indices 279 | 280 | -------------------------------------------------------------------------------- /Train_SDAE/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glrs/StackedDAE/c21e851dc13e11f201ce7289e854c05956637986/Train_SDAE/tools/__init__.py -------------------------------------------------------------------------------- /Train_SDAE/tools/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import os 3 | from os.path import join as pjoin 4 | 5 | import sys 6 | 7 | import tensorflow as tf 8 | 9 | WEB_OUT = '/var/www/html/' 10 | 11 | def home_out(path): 12 | return pjoin(os.environ['HOME'], 'tmp_StackedDAE', 'Allan', path) 13 | 14 | def web_out(path): 15 | # Just a quick manual flag for changes between local and remote VMs 16 | if False: 17 | return pjoin(WEB_OUT, 'StackedDAE', path) 18 | else: 19 | return home_out(path) 20 | 21 | 22 | flags = tf.app.flags 23 | FLAGS = flags.FLAGS 24 | 25 | # Data Management 26 | flags.DEFINE_string('dataset', 'Linarsson', 'Choose which dataset you want to use') 27 | flags.DEFINE_boolean('use_balanced', False, 'Use balanced data or not. 
If not existed they will be created.') 28 | 29 | # Autoencoder Architecture Specific Flags 30 | flags.DEFINE_boolean('bias_node', False, 'Whether to use or not a bias node in the network') 31 | 32 | flags.DEFINE_integer('num_hidden_layers', 3, 'Number of hidden layers') 33 | 34 | flags.DEFINE_integer('hidden1_units', 50, 'Number of units in hidden layer 1.') # 2000 35 | flags.DEFINE_integer('hidden2_units', 25, 'Number of units in hidden layer 2.') 36 | flags.DEFINE_integer('hidden3_units', 15, 'Number of units in hidden layer 3.') 37 | 38 | # flags.DEFINE_integer('example_features', EXAMPLE_FEATURES, 'Total number of features (genes)') # image_pixels 39 | # flags.DEFINE_integer('num_classes', 10, 'Number of classes') 40 | 41 | flags.DEFINE_float('unsupervised_learning_rate', 0.0001, 'Unsupervised initial learning rate.') 42 | flags.DEFINE_float('supervised_learning_rate', 0.01, 'Supervised initial learning rate.') 43 | 44 | flags.DEFINE_float('pre_layer1_learning_rate', 0.0001, 'Initial learning rate.') 45 | flags.DEFINE_float('pre_layer2_learning_rate', 0.0001, 'Initial learning rate.') 46 | flags.DEFINE_float('pre_layer3_learning_rate', 0.0001, 'Initial learning rate.') 47 | 48 | flags.DEFINE_boolean('emphasis', False, 'Whether to use Emphasis or Not') 49 | flags.DEFINE_string('emphasis_type', 'Double', 'Type of Emphasis for the Cross Entropy. [Double, Full]') 50 | 51 | flags.DEFINE_float('default_noise', [0.0, 'MN'], 'Default Noise ratio and type to apply on the data') 52 | 53 | flags.DEFINE_float('noise_1', [0.10, 'TFDO'], 'Noise ratio to apply on the data, and the type of noise') 54 | flags.DEFINE_float('noise_2', [0.10, 'TFDO'], 'Noise ratio to apply on the data, and the type of noise') 55 | flags.DEFINE_float('noise_3', [0.10, 'TFDO'], 'Noise ratio to apply on the data, and the type of noise') 56 | 57 | """ TODO: ADD a flag for activation function (sigmoid, tanh, etc.) """ 58 | 59 | # Constants 60 | # flags.DEFINE_integer('seed', 1234, 'Random seed') 61 | 62 | flags.DEFINE_integer('batch_size', 9, 'Batch size. 
Must divide evenly into the dataset sizes.') # 100 63 | 64 | flags.DEFINE_integer('pretraining_epochs', 50, 'Number of training epochs for pretraining layers') # 60 65 | flags.DEFINE_integer('finetuning_epochs', 50, 'Number of training epochs for fine tuning supervised step') 66 | 67 | flags.DEFINE_float('zero_bound', 1.0e-9, 'Value to use as buffer to avoid numerical issues at 0') 68 | flags.DEFINE_float('one_bound', 1.0 - 1.0e-9, 'Value to use as buffer to avoid numerical issues at 1') 69 | 70 | flags.DEFINE_float('flush_secs', 120, 'Number of seconds to flush summaries') # 120 71 | 72 | # Directories 73 | flags.DEFINE_string('data_dir', home_out('data'), 'Directory to put the training data.') 74 | 75 | flags.DEFINE_string('output_dir', web_out('output'), 'Directory to put the output data.') 76 | 77 | flags.DEFINE_string('summary_dir', home_out('summaries'), 'Directory to put the summary data') 78 | 79 | flags.DEFINE_string('chkpt_dir', home_out('chkpts'), 'Directory to put the model checkpoints') 80 | 81 | # TensorBoard 82 | # flags.DEFINE_boolean('no_browser', True, 'Whether to start browser for TensorBoard') 83 | 84 | # Python 85 | flags.DEFINE_string('python', sys.executable, 'Path to python executable') 86 | 87 | 88 | -------------------------------------------------------------------------------- /Train_SDAE/tools/data_handler.py: -------------------------------------------------------------------------------- 1 | """ Data Handler for Allan's Data-set """ 2 | 3 | import pandas as pd 4 | import os 5 | import gzip 6 | import numpy as np 7 | 8 | from os.path import join as pjoin 9 | from config import FLAGS 10 | 11 | # TODO: Use Dictionary instead! 12 | TPM = {'filtered':'TPM_common_ready_data.csv', 'ordered':'TPM_ready_data.csv', 'original':'GSE71585_RefSeq_TPM.csv', 'zipped':'GSE71585_RefSeq_TPM.csv.gz'} 13 | RPKM = {'ordered':'RPKM_ready_data.csv', 'original':'GSE71585_RefSeq_RPKM.csv', 'zipped':'GSE71585_RefSeq_RPKM.csv.gz'} 14 | COUNTS = {'ordered':'Counts_ready_data.csv', 'original':'GSE71585_RefSeq_counts.csv', 'zipped':'GSE71585_RefSeq_counts.csv.gz'} 15 | LABELS = {'ordered':'Labels_inOrder.csv', 'original':'GSE71585_Clustering_Results.csv', 'zipped':'GSE71585_Clustering_Results.csv.gz'} 16 | 17 | # TPM = ['TPM_ready_data.csv', 'GSE71585_RefSeq_TPM.csv', 'GSE71585_RefSeq_TPM.csv.gz'] 18 | # LABELS = ['Labels_inOrder.csv', 'GSE71585_Clustering_Results.csv', 'GSE71585_Clustering_Results.csv.gz'] 19 | #'counts_ordered_nonzero_zeroone.tsv', 20 | #'metadata_ordered_subset.tsv', 21 | 22 | LINARSSON = {'filtered':'Linarsson_common_data.txt', 'normal':'expression_mRNA_17-Aug-2014.txt'} 23 | 24 | def extract_data(in_f, out_f): 25 | print("Extracting", in_f) 26 | in_file = gzip.open(in_f, 'rb') 27 | out_file = open(out_f, 'wb') 28 | out_file.write(in_file.read()) 29 | in_file.close() 30 | out_file.close() 31 | 32 | 33 | def order_labels(data_in, label_in, data_out=None, label_out=None, sep=','): 34 | print("Ordering Data with Labels...") 35 | 36 | labels = pd.read_csv(label_in, index_col=0) 37 | data = pd.read_csv(data_in, index_col=0, sep=sep) 38 | 39 | common_labels = labels.index.intersection(data.columns) 40 | # common_labels2 = data.columns.intersection(labels.index) 41 | 42 | # data_nonzero = data.loc[(data > 0).any(axis=1)].dropna() 43 | data_nonzero = data[(data.sum(axis=1) > 0)].dropna() 44 | data_nonzero = data_nonzero[common_labels] 45 | 46 | """ Better here with non_zero than above? 
""" 47 | common_labels2 = data_nonzero.columns.intersection(labels.index) 48 | label_sub = labels.loc[common_labels2] 49 | label_sub.index.names = labels.index.names 50 | 51 | label_sub_sort = label_sub.sort_index(0) 52 | data_sub_sort = data_nonzero.reindex_axis(sorted(data_nonzero.columns), axis=1) 53 | 54 | # Check that it worked 55 | assert(data_sub_sort.columns == label_sub_sort.index).all() 56 | 57 | if data_out is not None and label_out is not None: 58 | data_sub_sort.to_csv(data_out, sep="\t") 59 | label_sub_sort.to_csv(label_out, sep="\t") 60 | 61 | return data_sub_sort, label_sub_sort 62 | 63 | 64 | def label_metadata(label_matrix, label_col): 65 | # Check whether the column value is given as index (number) or name (string) 66 | try: 67 | label_col = int(label_col) 68 | 69 | # If given as number, take the name of the column out of it 70 | label_col = label_matrix.columns[label_col] 71 | except ValueError: 72 | pass 73 | 74 | # Get the unique classes in the given column, and how many of them are there 75 | unique_classes = pd.unique(label_matrix[label_col].ravel()) 76 | 77 | # Map the unique n classes with a number from 0 to n 78 | label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))}) 79 | 80 | # Replace the given column values with the mapped equivalent 81 | mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist()) 82 | # print("label_matrix", label_matrix) 83 | # print("mapped_labels", mapped_labels) 84 | 85 | # Return the mapped labels as ndarray and the label map (unique classes and number can be obtained from map) 86 | # np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],)) 87 | # Return the mapped labels as DataFrame and the label map (unique classes and number can be obtained from map) 88 | return mapped_labels[[label_col]], np.asarray(label_map) #, unique_classes, num_classes 89 | 90 | 91 | def sort_labels(data_in): 92 | d = pd.read_csv(data_in, sep='\t', index_col=0) 93 | return d.sort_index(0) 94 | 95 | 96 | def load_linarsson_data(d_type, transpose=False): 97 | print("Counts file is loading...") 98 | if d_type == 'filtered': 99 | data = LINARSSON['filtered'] 100 | else: 101 | data = LINARSSON['normal'] 102 | 103 | # data = pd.read_csv(pjoin(FLAGS.data_dir, 'expression_mRNA_17-Aug-2014.txt'), skiprows=[0,1,2,3,4,5,6,8,9,10], header=0, sep='\t', index_col=0) 104 | data = pd.read_csv(pjoin(FLAGS.data_dir, data), skiprows=[0,1,2,3,4,5,6,8,9,10], header=0, sep='\t', index_col=0) 105 | data.drop(data.columns[0], axis=1,inplace=True) 106 | 107 | if transpose: 108 | data = data.transpose() 109 | 110 | return np.array(data) 111 | 112 | def load_linarsson_labels(sub_labels=False): 113 | print("Label file is loading...") 114 | rows_to_skip = [0,1,2,3,4,5,6,8] if sub_labels else 7 115 | labels = pd.read_csv(pjoin(FLAGS.data_dir, LINARSSON['normal']), skiprows=rows_to_skip, nrows=2, header=None, sep='\t', index_col=False) 116 | # sub_labels = pd.read_csv(pjoin(FLAGS.data_dir, "expression_mRNA_17-Aug-2014.txt"), skiprows=[0,1,2,3,4,5,6,8], nrows=1, sep='\t', index_col=1) 117 | 118 | labels = labels.transpose() 119 | labels.columns= labels.iloc[1] 120 | labels.drop(labels.index[[0, 1]], inplace=True) 121 | labels.set_index(labels.columns.values[0], inplace=True) 122 | 123 | return labels, label_metadata(label_matrix=labels, label_col=0) 124 | 125 | def load_data(dataset=None, d_type=None, label_col=None, transpose=None):#, sub_labels=False): 126 | if dataset == 'Linarsson': 127 | 
data = load_linarsson_data(d_type, transpose=transpose) 128 | 129 | if label_col == 1: 130 | sub_labels = False 131 | elif label_col == 2: 132 | sub_labels = True 133 | else: 134 | exit("Error: Options for Linarsson Label columns are 1 or 2.") 135 | 136 | labels, meta = load_linarsson_labels(sub_labels) 137 | return data, labels, meta 138 | elif dataset == 'Allen': 139 | return load_allen(d_type=d_type, label_col=label_col, transpose=transpose) 140 | else: 141 | exit("Usage: load_data(dataset=['Linarsson', 'Allen'],\ 142 | data_type=['filtered', 'TPM', 'RPKM', 'Counts', 'Labels', None],\ 143 | label_col=[int], (optional)transpose=[Boolean (default=None)])") 144 | 145 | # def load_data(d_type=None, label_col=None, transpose=False): 146 | def load_allen(d_type=None, label_col=None, transpose=False): 147 | if d_type == 'TPM': 148 | d = check_and_load(TPM) 149 | print("TPM file is loading...") 150 | elif d_type == 'RPKM': 151 | d = check_and_load(RPKM) 152 | print("RPKM file is loading...") 153 | elif d_type == 'Counts': 154 | d = check_and_load(COUNTS) 155 | print("Counts file is loading...") 156 | elif d_type == 'Labels' or d_type is None and label_col is not None: 157 | d = check_and_load(LABELS) 158 | print("Label file is loading...") 159 | elif d_type == 'filtered': 160 | d = pd.read_csv(pjoin(FLAGS.data_dir, TPM['filtered']), sep='\t', index_col=0) 161 | # d = pd.read_csv(pjoin(FLAGS.data_dir, 'TPM_common_ready_data.csv'), sep='\t', index_col=0) 162 | else: 163 | exit("Usage: load_data(data_type=['filtered', 'TPM', 'RPKM', 'Counts', 'Labels', None],\ 164 | label_col=[int], (optional)transpose=[Boolean (default=None)])") 165 | 166 | 167 | # if not os.path.exists(pjoin(FLAGS.data_dir, data[0])): 168 | # if not os.path.exists(pjoin(FLAGS.data_dir, data[1])): 169 | # if not os.path.exists(pjoin(FLAGS.data_dir, data[2])): 170 | # exit("You should download and place the data in the correct folder.") 171 | # else: 172 | # extract_data(pjoin(FLAGS.data_dir, data[2]), pjoin(FLAGS.data_dir, data[1])) 173 | # if d_type == 'Labels': 174 | # exit("Labels extracted. 
You need to give a dataset first to receive the labels.") 175 | # else: 176 | # if not os.path.exists(pjoin(FLAGS.data_dir, LABELS[1])): 177 | # extract_data(pjoin(FLAGS.data_dir, LABELS[2]), pjoin(FLAGS.data_dir, LABELS[1])) 178 | # 179 | # d, _ = order_labels(pjoin(FLAGS.data_dir, data[1]), pjoin(FLAGS.data_dir, LABELS[1]), 180 | # pjoin(FLAGS.data_dir, data[0]), pjoin(FLAGS.data_dir, LABELS[0])) 181 | # else: 182 | # if d_type == 'Labels': 183 | # exit("You need to give a dataset first to receive the labels.") 184 | # else: 185 | # d, _ = order_labels(pjoin(FLAGS.data_dir, data[1]), pjoin(FLAGS.data_dir, LABELS[1]), 186 | # pjoin(FLAGS.data_dir, data[0]), pjoin(FLAGS.data_dir, LABELS[0])) 187 | # else: 188 | # d = pd.read_csv(pjoin(FLAGS.data_dir, data[0]), sep='\t', index_col=0) 189 | 190 | 191 | # Use recursion to load and return the labels as well 192 | if d_type == 'Labels' or d_type is None: 193 | # Return Label Metadata 194 | 195 | labels = d[[label_col]] 196 | # print(labels) 197 | return labels, label_metadata(label_matrix=d, label_col=label_col) 198 | else: 199 | if transpose: 200 | d = d.transpose() 201 | 202 | labels, (mapped_labels, label_map) = load_allen(label_col=label_col) 203 | 204 | return np.array(d), labels, (mapped_labels, label_map) 205 | 206 | def check_and_load(data): 207 | if not os.path.exists(pjoin(FLAGS.data_dir, data['ordered'])): 208 | if not os.path.exists(pjoin(FLAGS.data_dir, data['original'])): 209 | if not os.path.exists(pjoin(FLAGS.data_dir, data['zipped'])): 210 | exit("You should download and place the data in the correct folder.") 211 | else: 212 | extract_data(pjoin(FLAGS.data_dir, data['zipped']), pjoin(FLAGS.data_dir, data['original'])) 213 | if d_type == 'Labels': 214 | exit("Labels extracted. You need to give a dataset first to receive the labels.") 215 | else: 216 | if not os.path.exists(pjoin(FLAGS.data_dir, LABELS['original'])): 217 | extract_data(pjoin(FLAGS.data_dir, LABELS['zipped']), pjoin(FLAGS.data_dir, LABELS['original'])) 218 | 219 | d, _ = order_labels(pjoin(FLAGS.data_dir, data['original']), pjoin(FLAGS.data_dir, LABELS['original']), 220 | pjoin(FLAGS.data_dir, data['ordered']), pjoin(FLAGS.data_dir, LABELS['ordered'])) 221 | else: 222 | if d_type == 'Labels': 223 | exit("You need to give a dataset first to receive the labels.") 224 | else: 225 | d, _ = order_labels(pjoin(FLAGS.data_dir, data['original']), pjoin(FLAGS.data_dir, LABELS['original']), 226 | pjoin(FLAGS.data_dir, data['ordered']), pjoin(FLAGS.data_dir, LABELS['ordered'])) 227 | else: 228 | d = pd.read_csv(pjoin(FLAGS.data_dir, data['ordered']), sep='\t', index_col=0) 229 | 230 | return d 231 | 232 | 233 | def load_extra(dataset=None, filename=None, transpose=True, label_col=None, sub_labels=False): 234 | if dataset == 'Allen': 235 | # data, labels = order_labels(pjoin(FLAGS.data_dir, filename), pjoin(FLAGS.data_dir, LABELS[1]), sep='\t') 236 | 237 | labels = pd.read_csv(pjoin(FLAGS.data_dir, LABELS['ordered']), sep='\t', index_col=0) 238 | data = pd.read_csv(pjoin(FLAGS.data_dir, filename), sep='\t', index_col=0) 239 | 240 | if transpose: 241 | data = data.transpose() 242 | return np.array(data), labels[[label_col]], label_metadata(label_matrix=labels, label_col=label_col) 243 | elif dataset == 'Linarsson': 244 | data = load_linarsson_data('filtered', transpose=transpose) 245 | labels, meta = load_linarsson_labels(sub_labels) 246 | return data, labels, meta 247 | elif dataset == 'Lin-Allen': 248 | labels = pd.read_csv(pjoin(FLAGS.data_dir, 
'Lin-Allen_comp_labels.csv'), sep='\t', index_col=0) 249 | data = pd.read_csv(pjoin(FLAGS.data_dir, filename), sep='\t', index_col=0) 250 | 251 | if transpose: 252 | data = data.transpose() 253 | return np.array(data), labels[[label_col]], label_metadata(label_matrix=labels, label_col=label_col) 254 | 255 | 256 | -------------------------------------------------------------------------------- /Train_SDAE/tools/evaluate.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from config import FLAGS 8 | # from data import fill_feed_dict as fill_feed_dict 9 | from utils import fill_feed_dict as fill_feed_dict 10 | 11 | from sklearn.metrics import precision_score, confusion_matrix, classification_report 12 | from sklearn.metrics import recall_score, f1_score, roc_curve, accuracy_score 13 | 14 | from tools.visualize import plot_confusion_matrix as pcm 15 | from tools.visualize import plot_roc_curve as roc 16 | 17 | np.set_printoptions(linewidth=200) 18 | 19 | def evaluation(logits, labels): 20 | """Evaluate the quality of the logits at predicting the label. 21 | 22 | Args: 23 | logits: Logits tensor, float - [batch_size, NUM_CLASSES]. 24 | labels: Labels tensor, int32 - [batch_size], with values in the 25 | range [0, NUM_CLASSES). 26 | 27 | Returns: 28 | A scalar int32 tensor with the number of examples (out of batch_size) 29 | that were predicted correctly. 30 | """ 31 | # For a classifier model, we can use the in_top_k Op. 32 | # It returns a bool tensor with shape [batch_size] that is true for 33 | # the examples where the labels was in the top k (here k=1) 34 | # of all logits for that example. 35 | # correct: type = List (of booleans) 36 | correct = tf.nn.in_top_k(logits, labels, 1) 37 | # correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1)) 38 | 39 | # accuracy = tf.reduce_mean(tf.cast(correct, "float")) 40 | y_p = tf.argmax(logits, 1) 41 | # l_p = tf.argmax(labels, 1) 42 | 43 | 44 | # Return the number of true entries. Cast because originally is bool. 
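    # e.g. with batch_size = 3, top-1 predictions [2, 0, 1] and true labels
    # [2, 1, 1]: `correct` is [True, False, True], the summed count below is 2,
    # and y_p holds the argmax predictions [2, 0, 1].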
45 | return tf.reduce_sum(tf.cast(correct, tf.int32)), correct, y_p 46 | 47 | def predict(sdae, data_set, bias_node=False): 48 | with sdae.session.graph.as_default(): 49 | labels_placeholder = tf.placeholder(tf.int32, shape=1,\ 50 | name='labels_placeholder') 51 | examples_placeholder = tf.placeholder(tf.float32,\ 52 | shape=(1, sdae._net_shape[0]),\ 53 | name='input_pl') 54 | 55 | logits = tf.identity(examples_placeholder) 56 | 57 | for layer in sdae.get_layers: 58 | if bias_node: 59 | bias_n = tf.ones(shape=[1, 1], dtype=tf.float32) 60 | logits = tf.concat(1, [bias_n, logits]) 61 | logits = layer.clean_activation(x_in=logits, use_fixed=False) 62 | 63 | predictions = tf.argmax(logits, 1) 64 | 65 | labels = tf.identity(labels_placeholder) 66 | 67 | y_pred = [] 68 | y_true = [] 69 | 70 | for _ in xrange(data_set.num_examples): 71 | feed_dict = fill_feed_dict(data_set, 72 | examples_placeholder, 73 | labels_placeholder, 1) 74 | 75 | y_prediction, y_trues = sdae.session.run([predictions, labels], feed_dict=feed_dict) 76 | y_pred += list(y_prediction) 77 | y_true += list(y_trues) 78 | 79 | # print(y_pred) 80 | return y_pred, y_true 81 | 82 | def do_eval(sess, 83 | eval_correct, 84 | predictions, 85 | examples_placeholder, 86 | labels_placeholder, 87 | label_map, 88 | data_set, 89 | title='Evaluation'): 90 | """Runs one evaluation against the full epoch of data. 91 | Args: 92 | sess: The session in which the model has been trained. 93 | eval_correct: The Tensor that returns the number of correct predictions. 94 | images_placeholder: The images placeholder. 95 | labels_placeholder: The labels placeholder. 96 | data_set: The set of images and labels to evaluate, from 97 | utils.read_data_sets(). 98 | """ 99 | # And run one epoch of eval. 100 | true_count = 0 # Counts the number of correct predictions. 
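    # The set is walked in whole batches: steps_per_epoch * FLAGS.batch_size
    # examples are scored, so a final partial batch
    # (data_set.num_examples % FLAGS.batch_size examples) is not counted.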
101 | y_pred = [] 102 | y_true = [] 103 | steps_per_epoch = data_set.num_examples // FLAGS.batch_size 104 | num_examples = steps_per_epoch * FLAGS.batch_size 105 | 106 | labels = tf.identity(labels_placeholder) 107 | 108 | for _ in xrange(steps_per_epoch): 109 | feed_dict = fill_feed_dict(data_set, 110 | examples_placeholder, 111 | labels_placeholder) 112 | corrects, y_prediction, y_trues = sess.run([eval_correct, predictions,\ 113 | labels], feed_dict=feed_dict) 114 | true_count += corrects 115 | y_pred += list(y_prediction) 116 | y_true += list(y_trues) 117 | 118 | accuracy = true_count / num_examples 119 | print(title + ' - Num examples: %d Num correct: %d Accuracy_score @ 1: %0.08f' % 120 | (num_examples, true_count, accuracy)) 121 | 122 | # print("True output:", y_true) 123 | # print("Pred output:", y_pred) 124 | 125 | print("Precision:") 126 | print("\tNone: ", precision_score(y_true, y_pred, average=None, pos_label=None)) 127 | # print("\tBinary:", precision_score(y_true, y_pred, average='binary')) 128 | print("\tMicro: %0.08f" % precision_score(y_true, y_pred, average='micro', pos_label=None)) 129 | print("\tMacro: %0.08f" % precision_score(y_true, y_pred, average='macro', pos_label=None)) 130 | print("\tWeighted: %0.08f" % precision_score(y_true, y_pred, average='weighted', pos_label=None)) 131 | # print("\tSamples:", sklearn.metrics.precision_score(y_true, y_pred, average='samples')) 132 | # print("\tAccuracy_score: %0.08f" % accuracy_score(y_true, y_pred)) 133 | 134 | print("Recall:") 135 | # print("\tNone: ", recall_score(y_true, y_pred, average=None, pos_label=None)) 136 | # print("\tBinary:", recall_score(y_true, y_pred, average='binary')) 137 | print("\tMicro: %0.08f" % recall_score(y_true, y_pred, average='micro', pos_label=None)) 138 | print("\tMacro: %0.08f" % recall_score(y_true, y_pred, average='macro', pos_label=None)) 139 | print("\tWeighted: %0.08f" % recall_score(y_true, y_pred, average='weighted', pos_label=None)) 140 | # print("\tSamples:", sklearn.metrics.recall_score(y_true, y_pred, average='samples')) 141 | 142 | # print("F1_score:") 143 | # print("\tNone: ", f1_score(y_true, y_pred, average=None, pos_label=None)) 144 | # print("\tBinary:", f1_score(y_true, y_pred, average='binary')) 145 | # print("\tMicro: %0.08f" % f1_score(y_true, y_pred, average='micro', pos_label=None)) 146 | # print("\tMacro: %0.08f" % f1_score(y_true, y_pred, average='macro', pos_label=None)) 147 | print("\nF1 Score (weighted): %0.08f" % f1_score(y_true, y_pred, average='weighted', pos_label=None)) 148 | # print("\tSamples:", sklearn.metrics.f1_score(y_true, y_pred, average='samples')) 149 | 150 | # print("True Length:", len(y_true)) 151 | # print("Prediction Length:", len(y_pred)) 152 | 153 | cm = confusion_matrix(y_true, y_pred) 154 | # print("\nConfusion Matrix") 155 | # print(cm) 156 | 157 | cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 158 | # print("\nNormalized confusion_matrix") 159 | # print(cm_normalized) 160 | 161 | print("") 162 | print(classification_report(y_true, y_pred, target_names=label_map)) 163 | 164 | pcm(cm, target_names=label_map, title=title) 165 | pcm(cm_normalized, target_names=label_map, title=title+"_Normalized") 166 | 167 | roc(y_pred, y_true, n_classes=len(label_map), title=title) 168 | 169 | print("\n=====================================================================================================\n") 170 | 171 | 172 | def do_eval_summary(tag, 173 | sess, 174 | eval_correct, 175 | examples_placeholder, 176 | labels_placeholder, 177 | 
data_set): 178 | true_count = 0 179 | steps_per_epoch = data_set.num_examples // FLAGS.batch_size 180 | num_examples = steps_per_epoch * FLAGS.batch_size 181 | for _ in xrange(steps_per_epoch): 182 | feed_dict = fill_feed_dict(data_set, 183 | examples_placeholder, 184 | labels_placeholder) 185 | true_count += sess.run(eval_correct, feed_dict=feed_dict) 186 | error = 1 - true_count / num_examples 187 | 188 | return sess.run(tf.scalar_summary(tag, tf.identity(error))) 189 | 190 | 191 | -------------------------------------------------------------------------------- /Train_SDAE/tools/evaluate_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn 3 | from scipy.special import expit 4 | from sklearn import ensemble 5 | from sklearn.manifold import TSNE 6 | import time 7 | from os.path import join as pjoin 8 | from tools.config import FLAGS 9 | from tools.visualize import scatter 10 | 11 | def get_activations(exp_data, w, b): 12 | # exp_data = np.transpose(exp_data) 13 | prod = exp_data.dot(w) 14 | prod_with_bias = prod + b 15 | return( expit(prod_with_bias) ) 16 | 17 | # Order of *args: first all the weights and then all the biases 18 | def run_random_forest(exp_data, labels, weights, biases, n_layers=None, bias_node=False): 19 | print("Calculating Random Forests...") 20 | assert len(exp_data) == len(labels) 21 | 22 | # I think they should be already transposed when running the code. Will see 23 | act = exp_data#.T 24 | 25 | # Using ternary operator for shortness 26 | n = n_layers if n_layers else len(weights) 27 | 28 | for i in range(n): 29 | print('Weights and biases for layer: ' + str(i+1)) 30 | # print np.asarray(weights[i]).shape, np.asarray(biases[i]).shape 31 | if bias_node: 32 | act = np.insert(act, 1, np.ones_like(act[:,0]), 1) 33 | act = get_activations(act, weights[i], biases[i]) 34 | 35 | rf = ensemble.RandomForestClassifier(n_estimators=1000, oob_score=True, max_depth=5) 36 | rfit = rf.fit(act, labels) 37 | print('OOB score: %.8f\n' % rfit.oob_score_) 38 | 39 | 40 | def plot_tSNE(data, labels, random_state=7074568, plot_name='tsne-generated_{}.png'): 41 | # Calculate t-SNE projections 42 | x_projection = TSNE(random_state=random_state).fit_transform(data) 43 | 44 | # Form the output file name 45 | plot_name = plot_name if plot_name.find(".") > 0 else plot_name+".png" 46 | plot_name = pjoin(FLAGS.output_dir, plot_name.format(time.strftime("%Y-%m-%d %H:%M:%S"))) 47 | 48 | # Create and save a t-SNE scatter plot 49 | scatter(x_projection, labels, plot_name=plot_name) 50 | 51 | -------------------------------------------------------------------------------- /Train_SDAE/tools/start_tensorboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import shlex, subprocess 4 | 5 | from tensorflow import tensorboard as tb 6 | 7 | from config import FLAGS, home_out 8 | 9 | # Configure path variables 10 | _summary_dir = FLAGS.summary_dir 11 | _tb_pid_file = home_out(".tbpid") 12 | 13 | # Configure environment/network parameters 14 | _tb_path = os.path.join(os.path.dirname(tb.__file__), 'tensorboard.py') 15 | _tb_port = "6006" 16 | _tb_host = "0.0.0.0" 17 | 18 | def start_tb(): 19 | if not os.path.exists(_tb_path): 20 | raise EnvironmentError("tensorboard.py not found!") 21 | 22 | if os.path.exists(_tb_pid_file): 23 | tb_pid = int(open(_tb_pid_file, 'r').readline().strip()) 24 | try: 25 | os.kill(tb_pid, signal.SIGKILL) 26 | except OSError: 27 | pass 
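        # Any previously launched TensorBoard recorded in the .tbpid file has
        # been killed above (or was already gone); drop the stale PID file
        # before starting a fresh instance.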
28 | 29 | os.remove(_tb_pid_file) 30 | 31 | devnull = open(os.devnull, 'wb') 32 | args = shlex.split('nohup ' + FLAGS.python + ' -u ' + _tb_path 33 | + ' --host '+ _tb_host + ' --port ' + _tb_port 34 | + ' --logdir={0}'.format(_summary_dir)) 35 | 36 | p = subprocess.Popen(args, stdout=devnull, stderr=devnull) 37 | 38 | with open(_tb_pid_file, 'w') as f: 39 | f.write(str(p.pid)) 40 | 41 | # if not FLAGS.no_browser: 42 | # subprocess.Popen(['open', 'http://localhost:{0}'.format(_tb_port)]) 43 | 44 | 45 | if __name__ == '__main__': 46 | start_tb() 47 | -------------------------------------------------------------------------------- /Train_SDAE/tools/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | from config import FLAGS 4 | 5 | from sklearn.cross_validation import train_test_split 6 | 7 | class DataSet(object): 8 | def __init__(self, examples, labels=None): 9 | if labels is not None: 10 | assert len(examples) == len(labels), ( 11 | 'examples.shape: %s labels.shape: %s' 12 | % (examples.shape, labels.shape)) 13 | 14 | self._num_examples = examples.shape[0] 15 | self._examples = examples 16 | self._labels = labels 17 | self._epochs_completed = 0 18 | self._index_in_epoch = 0 19 | 20 | @property 21 | def examples(self): 22 | return self._examples 23 | 24 | @property 25 | def labels(self): 26 | return self._labels 27 | 28 | @property 29 | def num_examples(self): 30 | return self._num_examples 31 | 32 | @property 33 | def epochs_completed(self): 34 | return self._epochs_completed 35 | 36 | @property 37 | def index_in_epoch(self): 38 | return self._index_in_epoch 39 | 40 | def next_batch(self, batch_size): 41 | """Return the next `batch_size` examples from this data set.""" 42 | start = self._index_in_epoch 43 | self._index_in_epoch += batch_size 44 | 45 | if self._index_in_epoch > self._num_examples: 46 | # Finished epoch 47 | self._epochs_completed += 1 48 | 49 | # Shuffle the data 50 | perm = np.arange(self._num_examples) 51 | np.random.shuffle(perm) 52 | 53 | self._examples = self._examples[perm] 54 | 55 | if self._labels is not None: 56 | self._labels = self._labels[perm] 57 | 58 | # Start next epoch 59 | start = 0 60 | self._index_in_epoch = batch_size 61 | assert batch_size <= self._num_examples 62 | 63 | end = self._index_in_epoch 64 | 65 | if self._labels is None: 66 | return self._examples[start:end] #self._examples.iloc[start:end] 67 | else: 68 | # return self._examples.iloc[start:end], self._labels.iloc[start:end] 69 | return self._examples[start:end], self._labels[start:end] 70 | 71 | 72 | class DataSetPreTraining(object): 73 | def __init__(self, examples): 74 | self._num_examples = examples.shape[0] 75 | self._examples = examples 76 | 77 | self._examples[self._examples < FLAGS.zero_bound] = FLAGS.zero_bound 78 | self._examples[self._examples > FLAGS.one_bound] = FLAGS.one_bound 79 | 80 | self._epochs_completed = 0 81 | self._index_in_epoch = 0 82 | 83 | @property 84 | def examples(self): 85 | return self._examples 86 | 87 | @property 88 | def num_examples(self): 89 | return self._num_examples 90 | 91 | @property 92 | def num_batches(self): 93 | return self.num_examples / FLAGS.batch_size 94 | 95 | @property 96 | def epochs_completed(self): 97 | return self._epochs_completed 98 | 99 | @property 100 | def index_in_epoch(self): 101 | return self._index_in_epoch 102 | 103 | # """ TODO: Under implementation """ 104 | # def same_batch(self): 105 | # pass 106 | 107 | def next_batch(self, batch_size): 108 | 
"""Return the next `batch_size` examples from this data set.""" 109 | start = self._index_in_epoch 110 | self._index_in_epoch += batch_size 111 | 112 | if self._index_in_epoch > self._num_examples: 113 | # Finished epoch 114 | self._epochs_completed += 1 115 | 116 | # Shuffle the data 117 | perm = np.arange(self._num_examples) 118 | np.random.shuffle(perm) 119 | self._images = self._examples[perm] 120 | 121 | # Start next epoch 122 | start = 0 123 | self._index_in_epoch = batch_size 124 | 125 | # print self._num_examples 126 | assert batch_size <= self._num_examples 127 | 128 | end = self._index_in_epoch 129 | 130 | return self._examples[start:end] 131 | 132 | 133 | def load_data_sets(input_data, labels, split_only=True, valid_set=False): 134 | class DataSets(object): 135 | pass 136 | data_sets = DataSets() 137 | 138 | print("\nSplitting to Train & Test sets for Finetuning") 139 | 140 | if valid_set: 141 | train_examples, test_examples, train_labels, test_labels = \ 142 | train_test_split(input_data, labels, test_size=0.2) 143 | train_examples, validation_examples, train_labels, validation_labels = \ 144 | train_test_split(train_examples, train_labels, test_size=0.05) 145 | data_sets.validation = DataSet(validation_examples, validation_labels) 146 | else: 147 | train_examples, test_examples, train_labels, test_labels = \ 148 | train_test_split(input_data, labels, test_size=0.3) 149 | data_sets.validation = None 150 | 151 | # validation_examples = input_data[:VALIDATION_SIZE] 152 | # train_examples = input_data[VALIDATION_SIZE:] 153 | 154 | data_sets.train = DataSet(train_examples, train_labels) 155 | data_sets.test = DataSet(test_examples, test_labels) 156 | 157 | if not split_only: 158 | data_sets.all = DataSet(input_data, labels) 159 | 160 | return data_sets 161 | 162 | 163 | 164 | def load_data_sets_pretraining(input_data, split_only=True, valid_set=False): 165 | """ Load data-sets for pre-training 166 | Data-sets for pre-training does not include labels. It takes 167 | an input data-set and it splits it in train, test and validation 168 | (optional) sets. Then it returns these subsets as DataSetPreTraining 169 | objects which have the ability to give the data in batches (among 170 | other useful functions). If split_only argument is False then it 171 | also returns the whole input data-set as a DataSetPreTraining object. 172 | 173 | Args: 174 | input_data: The data-set to be split. 175 | split_only: If True it just splits the data-set and returns its 176 | subsets as DataSetPreTraining objects, otherwise it 177 | also returns the data-set as DataSetPreTraining object. 
178 | valid_set: Whether to create a validation set along with test 179 | and train or not (default False) 180 | """ 181 | class DataSets(object): 182 | pass 183 | data_sets = DataSets() 184 | 185 | print("\nSplitting to Train & Test sets for pre-training") 186 | 187 | if valid_set: 188 | train_examples, test_examples = train_test_split(input_data, test_size=0.20) 189 | train_examples, validation_examples = train_test_split(train_examples, test_size=0.05) 190 | data_sets.validation = DataSetPreTraining(validation_examples) 191 | else: 192 | train_examples, test_examples = train_test_split(input_data, test_size=0.3) 193 | data_sets.validation = None 194 | 195 | if not split_only: 196 | data_sets.all = DataSetPreTraining(input_data) 197 | 198 | data_sets.train = DataSetPreTraining(train_examples) 199 | data_sets.test = DataSetPreTraining(test_examples) 200 | 201 | return data_sets 202 | 203 | 204 | ''' 205 | """ TODO: ADD more noise functions such as Gaussian noise etc. """ 206 | def _add_noise(x, ratio, n_type='MN'): 207 | """ Noise adding (or input corruption) 208 | This function adds noise to the given dataset. 209 | 210 | Args: 211 | x : The input dataset for the noise to be applied (numpy array) 212 | ratio: The percentage of the data affected by the noise addition 213 | n_type: The type of noise to be applied. 214 | Choices: MN (masking noise), SP (salt-and-pepper noise) 215 | """ 216 | ''' 217 | 218 | def fill_feed_dict_dae(data_set, input_pl, batch_size=None): 219 | b_size = FLAGS.batch_size if batch_size is None else batch_size 220 | 221 | input_feed = data_set.next_batch(b_size) 222 | feed_dict = { input_pl: input_feed } 223 | 224 | return feed_dict 225 | 226 | 227 | def fill_feed_dict(data_set, input_pl, labels_pl, batch_size=None): 228 | """Fills the feed_dict for training the given step. 229 | A feed_dict takes the form of: 230 | feed_dict = { 231 | : , 232 | .... 233 | } 234 | Args: 235 | data_set: The set of images and labels, from input_data.read_data_sets() 236 | images_pl: The examples placeholder, from placeholder_inputs(). 237 | labels_pl: The labels placeholder, from placeholder_inputs(). 238 | Returns: 239 | feed_dict: The feed dictionary mapping from placeholders to values. 240 | """ 241 | # Create the feed_dict for the placeholders filled with the next 242 | # `batch size ` examples. 243 | b_size = FLAGS.batch_size if batch_size is None else batch_size 244 | 245 | examples_feed, labels_feed = data_set.next_batch(b_size) 246 | 247 | feed_dict = { 248 | input_pl: examples_feed, 249 | labels_pl: labels_feed 250 | } 251 | 252 | return feed_dict 253 | 254 | 255 | def normalize_data(x, transpose=False): 256 | # Normalization across the whole matrix 257 | # x_max = np.max(x) 258 | # x_min = np.min(x) 259 | # x_norm = (x - x_min) / np.float32(x_max - x_min) 260 | 261 | 262 | # Normalization across the features 263 | x_norm = [] 264 | if transpose: 265 | x = np.transpose(x) 266 | print("\nData Transposed.") 267 | 268 | print "\nNormalizing", len(x), "Features..." 269 | for i in range(len(x)): 270 | x_norm.append((x[i] - np.min(x[i])) / np.float32(np.max(x[i]) - np.min(x[i]))) 271 | if np.isnan(x_norm[i]).any(): 272 | print("NAN at:", i) 273 | 274 | """ OR (norm='l1' or 'l2' or 'max') 275 | from sklearn.preprocessing import normalize 276 | x_norm = normalize(input_data, axis=??, norm='??') 277 | """ 278 | print("Normalization: Done. 
Transposing...") 279 | return np.asarray(np.transpose(x_norm)) 280 | 281 | 282 | def label_metadata(label_matrix, label_col): 283 | # Check whether the column value is given as index (number) or name (string) 284 | try: 285 | label_col = int(label_col) 286 | 287 | # If given as number, take the name of the column out of it 288 | label_col = label_matrix.columns[label_col] 289 | except ValueError: 290 | pass 291 | 292 | import pandas as pd 293 | # Get the unique classes in the given column, and how many of them are there 294 | unique_classes = pd.unique(label_matrix[label_col].ravel()) 295 | #num_classes = unique_classes.shape[0] 296 | 297 | # Map the unique n classes with a number from 0 to n 298 | label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))}) 299 | 300 | # Replace the given column's values with the mapped equivalent 301 | mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist()) 302 | 303 | # Return the mapped labels as numpy list and the label map (unique classes and number can be obtained from map) 304 | return np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],)), np.asarray(label_map) #, unique_classes, num_classes 305 | 306 | 307 | def write_csv(filename, data, sep='\t'): 308 | with open(filename, 'w') as fp: 309 | a = csv.writer(fp, delimiter='\t') 310 | a.writerows(data) 311 | 312 | 313 | 314 | -------------------------------------------------------------------------------- /Train_SDAE/tools/visualize.py: -------------------------------------------------------------------------------- 1 | #import matplotlib.mlab as mlab 2 | import matplotlib.pyplot as plt 3 | import matplotlib.patheffects as PathEffects 4 | import seaborn as sns 5 | import numpy as np 6 | from os.path import join as pjoin 7 | from config import FLAGS 8 | from sklearn.metrics import confusion_matrix, roc_curve, auc 9 | from scipy import interp 10 | 11 | ''' 12 | interpolation options: 13 | [None, 'none', 'nearest', 'bilinear', 'bicubic', 'spline16', 14 | 'spline36', 'hanning', 'hamming', 'hermite', 'kaiser', 'quadric', 15 | 'catrom', 'gaussian', 'bessel', 'mitchell', 'sinc', 'lanczos'] 16 | ''' 17 | 18 | def scatter(x, y, plot_name): 19 | """ Used to plot t-SNE projections """ 20 | 21 | num_colors = len(np.unique(y)) 22 | # We choose a color palette with seaborn. 23 | palette = np.array(sns.color_palette("hls", num_colors)) 24 | # We create a scatter plot. 25 | f = plt.figure(figsize=(8, 8)) 26 | ax = plt.subplot(aspect='equal') 27 | sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40, 28 | c=palette[y.astype(np.int)]) 29 | plt.xlim(-25, 25) 30 | plt.ylim(-25, 25) 31 | ax.axis('off') 32 | ax.axis('tight') 33 | # We add the labels for each digit. 34 | txts = [] 35 | for i in range(num_colors): 36 | # Position of each label. 
307 | def write_csv(filename, data, sep='\t'):
308 |     with open(filename, 'w') as fp:
309 |         a = csv.writer(fp, delimiter=sep)
310 |         a.writerows(data)
311 | 
312 | 
313 | 
314 | 
--------------------------------------------------------------------------------
/Train_SDAE/tools/visualize.py:
--------------------------------------------------------------------------------
1 | #import matplotlib.mlab as mlab
2 | import matplotlib.pyplot as plt
3 | import matplotlib.patheffects as PathEffects
4 | import seaborn as sns
5 | import numpy as np
6 | from os.path import join as pjoin
7 | from config import FLAGS
8 | from sklearn.metrics import confusion_matrix, roc_curve, auc
9 | from scipy import interp
10 | 
11 | '''
12 | interpolation options:
13 | [None, 'none', 'nearest', 'bilinear', 'bicubic', 'spline16',
14 | 'spline36', 'hanning', 'hamming', 'hermite', 'kaiser', 'quadric',
15 | 'catrom', 'gaussian', 'bessel', 'mitchell', 'sinc', 'lanczos']
16 | '''
17 | 
18 | def scatter(x, y, plot_name):
19 |     """ Used to plot t-SNE projections """
20 | 
21 |     num_colors = len(np.unique(y))
22 |     # We choose a color palette with seaborn.
23 |     palette = np.array(sns.color_palette("hls", num_colors))
24 |     # We create a scatter plot.
25 |     f = plt.figure(figsize=(8, 8))
26 |     ax = plt.subplot(aspect='equal')
27 |     sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40,
28 |                     c=palette[y.astype(np.int)])
29 |     plt.xlim(-25, 25)
30 |     plt.ylim(-25, 25)
31 |     ax.axis('off')
32 |     ax.axis('tight')
33 |     # We add a label for each class.
34 |     txts = []
35 |     for i in range(num_colors):
36 |         # Position of each label.
37 |         xtext, ytext = np.median(x[y == i, :], axis=0)
38 |         # if np.isnan(xtext) or np.isnan(ytext):
39 |         #     break
40 |         txt = ax.text(xtext, ytext, str(i), fontsize=24)
41 |         txt.set_path_effects([
42 |             PathEffects.Stroke(linewidth=5, foreground="w"),
43 |             PathEffects.Normal()])
44 |         txts.append(txt)
45 | 
46 |     plt.savefig(plot_name, dpi=120)
47 |     plt.close()
48 | 
49 | 
50 | def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.BuGn):
51 |     imgplot = plt.imshow(cm, interpolation='nearest', cmap=cmap)
52 |     plt.grid(False)
53 |     plt.colorbar(imgplot)
54 |     plt.title(title)
55 |     tick_marks = np.arange(len(target_names))
56 |     plt.xticks(tick_marks, target_names, rotation=90)
57 |     plt.yticks(tick_marks, target_names)
58 |     plt.tight_layout()
59 |     plt.ylabel('True label')
60 |     plt.xlabel('Predicted label')
61 |     plt.savefig(pjoin(FLAGS.output_dir, title.replace(' ', '_') + '_CM.png'))
62 |     plt.close()
63 | 
64 | 
65 | def plot_roc_curve(y_pred, y_true, n_classes, title='ROC_Curve'):
66 |     # Compute ROC curve and ROC area for each class
67 |     fpr = dict()
68 |     tpr = dict()
69 |     thresholds = dict()
70 |     roc_auc = dict()
71 | 
72 |     for i in range(n_classes):
73 |         fpr[i], tpr[i], thresholds[i] = roc_curve(y_true, y_pred, pos_label=i, drop_intermediate=False)
74 |         roc_auc[i] = auc(fpr[i], tpr[i])
75 | 
76 |     # Compute micro-average ROC curve and ROC area
77 |     # fpr["micro"], tpr["micro"], _ = roc_curve(np.asarray(y_true).ravel(), np.asarray(y_pred).ravel(), pos_label=0, drop_intermediate=True)
78 |     # roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
79 | 
80 |     # Aggregate all false positive rates
81 |     all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
82 | 
83 |     # print("Thresholds:")
84 |     # Interpolate all ROC curves at these points
85 |     mean_tpr = np.zeros_like(all_fpr)
86 |     for i in range(n_classes):
87 |         mean_tpr += interp(all_fpr, fpr[i], tpr[i])
88 |         # print("Class_{0}: {1}".format(i, thresholds[i]))
89 | 
90 |     # Average it and compute AUC
91 |     mean_tpr /= n_classes
92 | 
93 |     fpr["macro"] = all_fpr
94 |     tpr["macro"] = mean_tpr
95 |     roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
96 | 
97 | 
98 |     # Plot all ROC curves
99 |     fig = plt.figure()
100 |     ax = fig.add_subplot(111)
101 | 
102 |     # plt.plot(fpr["micro"], tpr["micro"],
103 |     #          label='micro-average ROC curve (area = {0:0.2f})'
104 |     #                ''.format(roc_auc["micro"]),
105 |     #          linewidth=3, ls='--', color='red')
106 | 
107 |     plt.plot(fpr["macro"], tpr["macro"],
108 |              label='macro-average ROC curve (area = {0:0.2f})'
109 |                    ''.format(roc_auc["macro"]),
110 |              linewidth=3, ls='--', color='green')
111 | 
112 |     for i in range(n_classes):
113 |         plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
114 |                                        ''.format(i, roc_auc[i]))
115 | 
116 |     plt.plot([0, 1], [0, 1], 'k--', linewidth=2)
117 |     plt.xlim([0.0, 1.0])
118 |     plt.ylim([0.0, 1.05])
119 |     plt.xlabel('False Positive Rate')
120 |     plt.ylabel('True Positive Rate')
121 |     plt.title('Multi-class Receiver Operating Characteristic')
122 |     lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
123 | 
124 |     plt.savefig(pjoin(FLAGS.output_dir, title.replace(' ', '_') + '_ROC.png'), bbox_extra_artists=(lgd,), bbox_inches='tight')
125 |     plt.close()
126 | 
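# A compact, self-contained illustration of the macro-averaging step used in
# plot_roc_curve(): each class's TPR curve is interpolated onto the pooled FPR
# grid and the curves are averaged. The label/score arrays below are made up,
# and a single shared score column is used, mirroring the function above.
def _macro_average_roc_sketch():
    import numpy as np
    from sklearn.metrics import roc_curve, auc

    y_true = np.array([0, 0, 1, 1, 2, 2])
    scores = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])
    n_classes = 3

    fpr, tpr = {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true, scores, pos_label=i, drop_intermediate=False)

    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.mean([np.interp(all_fpr, fpr[i], tpr[i]) for i in range(n_classes)], axis=0)
    return auc(all_fpr, mean_tpr)     # macro-average AUC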
127 | 
128 | def hist_comparison(data1, data2):
129 |     f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
130 |     f.suptitle('Histogram Before and After Normalization')
131 |     ax1.hist(data1, 10, facecolor='green', alpha=0.75)
132 |     ax1.set_xlabel("Values")
133 |     ax1.set_ylabel("# of Examples")
134 |     ax1.grid(True)
135 |     ax2.hist(data2, 10, facecolor='green', alpha=0.75)
136 |     ax2.set_xlabel("Values")
137 |     ax2.grid(True)
138 | 
139 |     f.savefig(pjoin(FLAGS.output_dir, 'hist_comparison.png'))
140 |     # plt.show()
141 |     plt.close()
142 | 
143 | 
144 | def make_heatmap(data, name):
145 |     f = plt.figure()
146 |     ax1 = f.add_axes([0.1,0.1,0.8,0.8])
147 |     ax1.grid(False)
148 |     imgplot = ax1.imshow(data, interpolation="none")
149 |     imgplot.set_cmap('seismic')
150 |     f.colorbar(imgplot)
151 |     f.savefig(pjoin(FLAGS.output_dir, name + '.png'))
152 |     plt.close()
153 | 
154 | def make_2d_hist(data, name):
155 |     f = plt.figure()
156 |     X,Y = np.meshgrid(range(data.shape[0]), range(data.shape[1]))
157 |     im = plt.pcolormesh(X,Y,data.transpose(), cmap='seismic')
158 |     plt.colorbar(im, orientation='vertical')
159 |     # plt.hexbin(data,data)
160 |     # plt.show()
161 |     f.savefig(pjoin(FLAGS.output_dir, name + '.png'))
162 |     plt.close()
163 | 
164 | # def make_2d_hexbin(data, name):
165 | #     f = plt.figure()
166 | #     X,Y = np.meshgrid(range(data.shape[0]), range(data.shape[1]))
167 | #     plt.hexbin(X, data)
168 | #     # plt.show()
169 | #     f.savefig(pjoin(FLAGS.output_dir, name + '.png'))
170 | 
171 | def heatmap_comparison(data1, label1, data2, label2, data3, label3):
172 |     interpolation = 'none'
173 | 
174 |     fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
175 |     fig.suptitle('Heatmap Comparison of Normal and Noisy Data')
176 |     ax1.imshow(data1, interpolation=interpolation)
177 |     ax1.set_title(label1)
178 |     ax1.set_ylabel("Examples")
179 |     ax1.set_xlabel("Features")
180 |     ax1.set_aspect('equal')
181 | 
182 |     ax2.imshow(data2, interpolation=interpolation)
183 |     ax2.set_title(label2)
184 |     ax2.set_xlabel("Features")
185 |     ax2.set_aspect('equal')
186 | 
187 |     ax3.imshow(data3, interpolation=interpolation)
188 |     ax3.set_title(label3)
189 |     ax3.set_xlabel("Features")
190 |     ax3.set_aspect('equal')
191 | 
192 |     cax = fig.add_axes([0, 0, .1, .1])
193 |     cax.get_xaxis().set_visible(False)
194 |     cax.get_yaxis().set_visible(False)
195 |     cax.patch.set_alpha(0.5)
196 |     cax.set_frame_on(True)
197 |     # plt.colorbar(ax1, ax2, orientation='vertical')
198 |     plt.show()
199 |     plt.close()
200 | #
201 | #     fig = plt.figure(figsize=(6, 3.2))
202 | #
203 | #     ax = fig.add_subplot(111)
204 | #     ax.set_title('colorMap')
205 | #     plt.imshow(data1)
206 | #     ax.set_aspect('equal')
207 | #
208 | #     cax = fig.add_axes([0.12, 0.1, 0.78, 0.8])
209 | #     cax.get_xaxis().set_visible(False)
210 | #     cax.get_yaxis().set_visible(False)
211 | #     cax.patch.set_alpha(0)
212 | #     cax.set_frame_on(False)
213 | #     plt.colorbar(orientation='vertical')
214 | #     plt.show()
215 | 
--------------------------------------------------------------------------------