├── .gitignore
├── LICENSE
├── README.md
├── batch_generator.py
└── denoising_autoencoder.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Dana Hughes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DeepEmbeddedClustering
TensorFlow implementation of Deep Embedded Clustering (DEC) for unsupervised learning
--------------------------------------------------------------------------------
/batch_generator.py:
--------------------------------------------------------------------------------
## batch_generator.py
##
## A simple generator that collects samples as key / value pairs and produces
## fixed-size numpy batches from them.
##
## History:
##   1.0   29-Jun-2016   Initial version
##   1.1   12-Aug-2016   Changed input / output to key / value pairs
##                       Changed class from Dataset to Batch
##   1.2   30-Sep-2016   Changed class from Batch to BatchGenerator
##                       Added option to split batch generator into multiple
##                       batch generators

import random
import numpy as np


class BatchGenerator:
    """
    Object which produces batches from a provided dataset.
    """

    def __init__(self, shape_dict):
        """
        Set up a new generator for producing batches.

        shape_dict - Maps each data key to the shape of a single sample,
                     e.g. {'input': (784,), 'output': (10,)}
        """

        self._shape_dict = shape_dict

        self._data = []
        self._data_keys = list(shape_dict.keys())
        self._shapes = dict(shape_dict)

        self._current_index = 0


    def add_sample(self, sample_dict):
        """
        Add a sample (a dictionary of key / value pairs) to the dataset.
        """

        self._data.append(sample_dict)


    def shuffle(self):
        """
        Shuffle the data.
        """

        random.shuffle(self._data)


    def reset(self):
        """
        Wrap back around to the start of the dataset.
        """

        self._current_index = 0


    def split(self, distribution):
        """
        Split the dataset in this batch generator into multiple generators
        (see the usage sketch after the class definition).

        distribution - Fraction of the dataset for each new batch generator.
                       This is assumed to sum to 1.0.
        """

        # Create new batch generators
        batch_generators = [BatchGenerator(self._shape_dict) for _ in distribution]

        # Assign each sample to a random generator, weighted by the distribution
        for sample in self._data:
            rnd = random.random()
            idx = 0

            # Walk the distribution until the random draw falls into a bin;
            # the loop bound clamps to the last generator, guarding against
            # rounding error in the distribution.
            while idx < len(distribution) - 1 and rnd > distribution[idx]:
                rnd -= distribution[idx]
                idx += 1

            batch_generators[idx].add_sample(sample)

        return batch_generators


    def get_current_index(self):
        """
        Get the current position in the dataset.
        """

        return self._current_index


    def set_index(self, index):
        """
        Set the current position in the dataset.
        """

        self._current_index = index


    def get_batch(self, batch_size):
        """
        Return a batch of input / output pairs, containing at most
        batch_size samples.
        """

        size = min(len(self._data) - self._current_index, batch_size)

        data = {}
        for k in self._data_keys:
            data[k] = np.zeros((size,) + self._shapes[k])

            # Fill the batch for this key; the fill loop is nested inside the
            # key loop so that every key is populated, not just the last one.
            for i in range(size):
                data[k][i, :] = self._data[self._current_index + i][k][:]

        self._current_index += size

        data['batch_size'] = size

        return data


    def num_samples(self):
        """
        The total number of samples in the dataset.
        """

        return len(self._data)
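

# A minimal usage sketch on synthetic data. This block is illustrative only;
# the key names, shapes, batch size, and split fractions are arbitrary
# examples, not values required by the class.
if __name__ == '__main__':
    gen = BatchGenerator({'input': (4,), 'output': (2,)})

    for _ in range(100):
        gen.add_sample({'input': np.random.rand(4), 'output': np.random.rand(2)})

    # Roughly 80% of the samples go to training, 20% to testing
    train_gen, test_gen = gen.split([0.8, 0.2])
    train_gen.shuffle()

    # Consume the training set in batches of at most 32 samples
    while train_gen.get_current_index() < train_gen.num_samples():
        batch = train_gen.get_batch(32)
        print('Batch of', batch['batch_size'], 'samples, input shape', batch['input'].shape)

    train_gen.reset()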

--------------------------------------------------------------------------------
/denoising_autoencoder.py:
--------------------------------------------------------------------------------
# denoising_autoencoder.py   26-Jul-2017
#
# A single-layer denoising autoencoder, written against the TensorFlow 1.x
# API (tf.placeholder, tf.train.AdamOptimizer, etc.).

import tensorflow as tf
import numpy as np


def weight_variable(shape, name=None):
    """
    Create a weight matrix
    """

    return tf.Variable(tf.truncated_normal(shape, stddev=0.01), name=name)


def bias_variable(shape, name=None):
    """
    Create a bias variable
    """

    return tf.Variable(tf.constant(0.01, shape=shape), name=name)


def linear(x):
    """
    Identity activation (no nonlinearity).
    """

    return x


class DenoisingAutoencoder:
    """
    A denoising autoencoder: the input is corrupted with dropout, encoded to
    a low-dimensional code, then decoded back to the input space.

    Note: variables must be initialized (e.g., by running
    tf.global_variables_initializer()) before training.
    """

    def __init__(self, input_size, code_size, **kwargs):
        """
        Build the autoencoder graph.

        input_size - Dimensionality of the input
        code_size  - Dimensionality of the code (hidden) layer
        """

        # Create all the needed tensorflow stuff.  A default name is used when
        # none is given, so that variable scoping and tensor naming still work.
        self.sess = kwargs.get('session', tf.InteractiveSession())
        hidden_activation = kwargs.get('hidden_activation', tf.nn.relu)
        output_activation = kwargs.get('output_activation', tf.nn.relu)
        self.name = kwargs.get('name', 'dae')

        with tf.variable_scope(self.name):
            # Input to the network
            self.input = tf.placeholder(tf.float32, (None, input_size), name='input_' + self.name)
            self.dropout_prob = tf.placeholder(tf.float32, None, name='dropout_probability_' + self.name)
            self.learning_rate = tf.placeholder(tf.float32, None, name='learning_rate_' + self.name)

            # Weights and biases of the network
            self.W1 = weight_variable((input_size, code_size), 'W_encode_' + self.name)
            self.b1 = bias_variable((code_size,), 'b_encode_' + self.name)
            self.W2 = weight_variable((code_size, input_size), 'W_decode_' + self.name)
            self.b2 = bias_variable((input_size,), 'b_decode_' + self.name)

            # Corrupt the input.  tf.nn.dropout takes a *keep* probability, so
            # convert from the drop probability fed to the placeholder (feeding
            # the drop probability directly would invert the corruption level,
            # and a drop probability of 0.0 would divide by zero).
            x = tf.nn.dropout(self.input, keep_prob=1.0 - self.dropout_prob)

            # Code layer and output layer
            self.code = hidden_activation(tf.matmul(x, self.W1) + self.b1)

            h = tf.nn.dropout(self.code, keep_prob=1.0 - self.dropout_prob)

            output = output_activation(tf.matmul(h, self.W2) + self.b2)

            # Build an optimizer
            self.loss = tf.reduce_sum(tf.square(output - self.input), name='objective_' + self.name)
            self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
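
    # For reference, the objective built above is the summed squared
    # reconstruction error
    #
    #     L(x) = sum_i (x_hat_i - x_i)^2,   x_hat = decode(dropout(encode(dropout(x))))
    #
    # Reconstruction is compared against the clean, uncorrupted input, which
    # is what makes this a denoising autoencoder.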

    def train(self, dataset, dropout_prob=0.2, learning_rate=0.1):
        """
        Run one training step on the given dataset (a batch of inputs).
        """

        fd = {self.input: dataset, self.dropout_prob: dropout_prob, self.learning_rate: learning_rate}

        self.sess.run(self.train_step, feed_dict=fd)


    def get_loss(self, dataset, dropout_prob=0.0):
        """
        Return the reconstruction loss on the given dataset.
        """

        fd = {self.input: dataset, self.dropout_prob: dropout_prob}

        return self.sess.run(self.loss, feed_dict=fd)


    def get_code(self, dataset, dropout_prob=0.0):
        """
        Return the code-layer activations for the given dataset.
        """

        fd = {self.input: dataset, self.dropout_prob: dropout_prob}

        return self.sess.run(self.code, feed_dict=fd)
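

# A minimal usage sketch on synthetic data. This block is illustrative only;
# the sizes, step count, and hyperparameters are arbitrary examples.
if __name__ == '__main__':
    data = np.random.rand(256, 32).astype(np.float32)

    # Build a 32 -> 8 -> 32 autoencoder and initialize its variables
    dae = DenoisingAutoencoder(32, 8, name='dae_demo')
    dae.sess.run(tf.global_variables_initializer())

    for step in range(100):
        dae.train(data, dropout_prob=0.2, learning_rate=0.001)
        if step % 20 == 0:
            print('step %3d, loss %.4f' % (step, dae.get_loss(data)))

    codes = dae.get_code(data)  # shape (256, 8)
--------------------------------------------------------------------------------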