├── .gitignore
├── LICENSE
├── README.md
├── batch_generator.py
└── denoising_autoencoder.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Dana Hughes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DeepEmbeddedClustering
TensorFlow implementation of Deep Embedded Clustering (DEC) for unsupervised learning
--------------------------------------------------------------------------------
/batch_generator.py:
--------------------------------------------------------------------------------
## batch_generator.py
##
## A simple generator that collects samples as key / value pairs and produces
## fixed-size numpy batches from them.
##
## History:
##   1.0   29-Jun-2016   Initial version
##   1.1   12-Aug-2016   Changed input / output to key / value pairs
##                       Changed class from Dataset to Batch
##   1.2   30-Sep-2016   Changed class from Batch to BatchGenerator
##                       Added option to split batch generator into multiple
##                       batch generators

import random
import numpy as np


class BatchGenerator:
    """
    Object which produces batches from a provided dataset.
    """

    def __init__(self, shape_dict):
        """
        Set up a new generator for producing batches.

        shape_dict - Maps each data key to the shape of a single sample,
                     e.g. {'input': (784,), 'output': (10,)}
        """

        self._shape_dict = shape_dict

        self._data = []
        self._data_keys = list(shape_dict.keys())
        self._shapes = dict(shape_dict)

        self._current_index = 0


    def add_sample(self, sample_dict):
        """
        Add a sample (a dictionary of key / value pairs) to the dataset.
        """

        self._data.append(sample_dict)


    def shuffle(self):
        """
        Shuffle the data.
        """

        random.shuffle(self._data)


    def reset(self):
        """
        Wrap back around to the start of the dataset.
        """

        self._current_index = 0


    def split(self, distribution):
        """
        Split the dataset in this batch generator into multiple generators
        (see the usage sketch after the class definition).

        distribution - Fraction of the dataset for each new batch generator.
                       This is assumed to sum to 1.0.
        """

        # Create new batch generators
        batch_generators = [BatchGenerator(self._shape_dict) for _ in distribution]

        # Assign each sample to a random generator, weighted by the distribution
        for sample in self._data:
            rnd = random.random()
            idx = 0

            # Walk the distribution until the random draw falls into a bin;
            # the loop bound clamps to the last generator, guarding against
            # rounding error in the distribution.
            while idx < len(distribution) - 1 and rnd > distribution[idx]:
                rnd -= distribution[idx]
                idx += 1

            batch_generators[idx].add_sample(sample)

        return batch_generators


    def get_current_index(self):
        """
        Get the current position in the dataset.
        """

        return self._current_index


    def set_index(self, index):
        """
        Set the current position in the dataset.
        """

        self._current_index = index


    def get_batch(self, batch_size):
        """
        Return a batch of input / output pairs, containing at most
        batch_size samples.
        """

        size = min(len(self._data) - self._current_index, batch_size)

        data = {}
        for k in self._data_keys:
            data[k] = np.zeros((size,) + self._shapes[k])

            # Fill the batch for this key; the fill loop is nested inside the
            # key loop so that every key is populated, not just the last one.
            for i in range(size):
                data[k][i, :] = self._data[self._current_index + i][k][:]

        self._current_index += size

        data['batch_size'] = size

        return data


    def num_samples(self):
        """
        The total number of samples in the dataset.
        """

        return len(self._data)
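

# A minimal usage sketch on synthetic data. This block is illustrative only;
# the key names, shapes, batch size, and split fractions are arbitrary
# examples, not values required by the class.
if __name__ == '__main__':
    gen = BatchGenerator({'input': (4,), 'output': (2,)})

    for _ in range(100):
        gen.add_sample({'input': np.random.rand(4), 'output': np.random.rand(2)})

    # Roughly 80% of the samples go to training, 20% to testing
    train_gen, test_gen = gen.split([0.8, 0.2])
    train_gen.shuffle()

    # Consume the training set in batches of at most 32 samples
    while train_gen.get_current_index() < train_gen.num_samples():
        batch = train_gen.get_batch(32)
        print('Batch of', batch['batch_size'], 'samples, input shape', batch['input'].shape)

    train_gen.reset()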

--------------------------------------------------------------------------------
/denoising_autoencoder.py:
--------------------------------------------------------------------------------
# denoising_autoencoder.py   26-Jul-2017
#
# A single-layer denoising autoencoder, written against the TensorFlow 1.x
# API (tf.placeholder, tf.train.AdamOptimizer, etc.).

import tensorflow as tf
import numpy as np


def weight_variable(shape, name=None):
    """
    Create a weight matrix
    """

    return tf.Variable(tf.truncated_normal(shape, stddev=0.01), name=name)


def bias_variable(shape, name=None):
    """
    Create a bias variable
    """

    return tf.Variable(tf.constant(0.01, shape=shape), name=name)


def linear(x):
    """
    Identity activation (no nonlinearity).
    """

    return x


class DenoisingAutoencoder:
    """
    A denoising autoencoder: the input is corrupted with dropout, encoded to
    a low-dimensional code, then decoded back to the input space.

    Note: variables must be initialized (e.g., by running
    tf.global_variables_initializer()) before training.
    """

    def __init__(self, input_size, code_size, **kwargs):
        """
        Build the autoencoder graph.

        input_size - Dimensionality of the input
        code_size  - Dimensionality of the code (hidden) layer
        """

        # Create all the needed tensorflow stuff.  A default name is used when
        # none is given, so that variable scoping and tensor naming still work.
        self.sess = kwargs.get('session', tf.InteractiveSession())
        hidden_activation = kwargs.get('hidden_activation', tf.nn.relu)
        output_activation = kwargs.get('output_activation', tf.nn.relu)
        self.name = kwargs.get('name', 'dae')

        with tf.variable_scope(self.name):
            # Input to the network
            self.input = tf.placeholder(tf.float32, (None, input_size), name='input_' + self.name)
            self.dropout_prob = tf.placeholder(tf.float32, None, name='dropout_probability_' + self.name)
            self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate_' + self.name)

            # Weights and biases of the network
            self.W1 = weight_variable((input_size, code_size), 'W_encode_' + self.name)
            self.b1 = bias_variable((code_size,), 'b_encode_' + self.name)
            self.W2 = weight_variable((code_size, input_size), 'W_decode_' + self.name)
            self.b2 = bias_variable((input_size,), 'b_decode_' + self.name)

            # Corrupt the input.  tf.nn.dropout takes a *keep* probability, so
            # convert from the drop probability fed to the placeholder (feeding
            # the drop probability directly would invert the corruption level,
            # and a drop probability of 0.0 would divide by zero).
            x = tf.nn.dropout(self.input, keep_prob=1.0 - self.dropout_prob)

            # Code layer and output layer
            self.code = hidden_activation(tf.matmul(x, self.W1) + self.b1)

            h = tf.nn.dropout(self.code, keep_prob=1.0 - self.dropout_prob)

            output = output_activation(tf.matmul(h, self.W2) + self.b2)

            # Build an optimizer
            self.loss = tf.reduce_sum(tf.square(output - self.input), name='objective_' + self.name)
            self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
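
    # For reference, the objective built above is the summed squared
    # reconstruction error
    #
    #     L(x) = sum_i (x_hat_i - x_i)^2,   x_hat = decode(dropout(encode(dropout(x))))
    #
    # Reconstruction is compared against the clean, uncorrupted input, which
    # is what makes this a denoising autoencoder.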

    def train(self, dataset, dropout_prob=0.2, learning_rate=0.1):
        """
        Run one training step on the given dataset (a batch of inputs).
        """

        fd = {self.input: dataset, self.dropout_prob: dropout_prob, self.learning_rate: learning_rate}

        self.sess.run(self.train_step, feed_dict=fd)


    def get_loss(self, dataset, dropout_prob=0.0):
        """
        Return the reconstruction loss on the given dataset.
        """

        fd = {self.input: dataset, self.dropout_prob: dropout_prob}

        return self.sess.run(self.loss, feed_dict=fd)


    def get_code(self, dataset, dropout_prob=0.0):
        """
        Return the code-layer activations for the given dataset.
        """

        fd = {self.input: dataset, self.dropout_prob: dropout_prob}

        return self.sess.run(self.code, feed_dict=fd)
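

# A minimal usage sketch on synthetic data. This block is illustrative only;
# the sizes, step count, and hyperparameters are arbitrary examples.
if __name__ == '__main__':
    data = np.random.rand(256, 32).astype(np.float32)

    # Build a 32 -> 8 -> 32 autoencoder and initialize its variables
    dae = DenoisingAutoencoder(32, 8, name='dae_demo')
    dae.sess.run(tf.global_variables_initializer())

    for step in range(100):
        dae.train(data, dropout_prob=0.2, learning_rate=0.001)
        if step % 20 == 0:
            print('step %3d, loss %.4f' % (step, dae.get_loss(data)))

    codes = dae.get_code(data)  # shape (256, 8)
--------------------------------------------------------------------------------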