├── .gitignore ├── LICENSE.md ├── README.md ├── __init__.py ├── brute.py ├── convert_to_records.py ├── images ├── Parallelism.png └── mnist_graph.png ├── main.py ├── mnist_multi_gpu_batching_train.py ├── mnist_multi_gpu_keras.py ├── mnist_multi_gpu_sonnet.py ├── multi_gpu.py ├── network.py ├── older ├── mnist_multi_gpu_eval.py ├── mnist_multi_gpu_train.py └── model.py ├── optimizer.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | /mnist_with_summaries.zip 2 | /mnist_data.zip 3 | /mnist_data/train-labels-idx1-ubyte.gz 4 | /mnist_data/train-images-idx3-ubyte.gz 5 | /mnist_data/t10k-labels-idx1-ubyte.gz 6 | /mnist_data/t10k-images-idx3-ubyte.gz 7 | /.idea 8 | *.pyc 9 | .DS_Store 10 | /logs/test 11 | /logs/train 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 
23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "{}" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2017 Norman Heckscher 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNIST Multi GPU with TensorFlow 2 | A ConvNet for MNIST digit classification. 3 | 4 | Multi GPU example with TensorFlow utilising local tower architecture for each GPU. 5 | 6 | Several different examples. Utilise batching and direct feed. 7 | 8 | Keras performs the best. It utilises the MultiGPU code from: https://github.com/kuza55/keras-extras 9 | 10 | ## Training a Model Using Multiple GPU Cards 11 | 12 | Modern workstations may contain multiple GPUs for scientific computation. 13 | TensorFlow can leverage this environment to run the training operation 14 | concurrently across multiple cards. 15 | 16 | Training a model in a parallel, distributed fashion requires 17 | coordinating training processes. For what follows we term *model replica* 18 | to be one copy of a model training on a subset of data. 
19 | 20 | Naively employing asynchronous updates of model parameters 21 | leads to sub-optimal training performance 22 | because an individual model replica might be trained on a stale 23 | copy of the model parameters. Conversely, employing fully synchronous 24 | updates will be as slow as the slowest model replica. 25 | 26 | In a workstation with multiple GPU cards, each GPU will have similar speed 27 | and contain enough memory to run an entire MNIST model. Thus, we opt to 28 | design our training system in the following manner: 29 | 30 | * Place an individual model replica on each GPU. 31 | * Update model parameters synchronously by waiting for all GPUs to finish 32 | processing a batch of data. 33 | 34 | Here is a diagram of this model: 35 | 36 |
37 | ![Diagram of the multi-GPU training setup](images/Parallelism.png) 38 |
39 | 40 | Note that each GPU computes inference as well as the gradients for a unique 41 | batch of data. This setup effectively permits dividing up a larger batch 42 | of data across the GPUs. 43 | 44 | This setup requires that all GPUs share the model parameters. A well-known 45 | fact is that transferring data to and from GPUs is quite slow. For this 46 | reason, we decide to store and update all model parameters on the CPU (see 47 | green box). A fresh set of model parameters is transferred to the GPU 48 | when a new batch of data is processed by all GPUs. 49 | 50 | The GPUs are synchronized in operation. All gradients are accumulated from 51 | the GPUs and averaged (see green box). The model parameters are updated with 52 | the gradients averaged across all model replicas. 53 | 54 | ### Model Prediction 55 | 56 | The prediction part of the model is constructed by the `inference()` function 57 | which adds operations to compute the *logits* of the predictions. That part of 58 | the model is organized as follows: 59 | 60 | Layer Name | Description 61 | --- | --- 62 | `conv1` | @{tf.nn.conv2d$convolution} and @{tf.nn.relu$rectified linear} activation. 63 | `pool1` | @{tf.nn.max_pool$max pooling}. 64 | `norm1` | @{tf.nn.local_response_normalization$local response normalization}. 65 | `conv2` | @{tf.nn.conv2d$convolution} and @{tf.nn.relu$rectified linear} activation. 66 | `norm2` | @{tf.nn.local_response_normalization$local response normalization}. 67 | `pool2` | @{tf.nn.max_pool$max pooling}. 68 | `local3` | @{$python/nn$fully connected layer with rectified linear activation}. 69 | `local4` | @{$python/nn$fully connected layer with rectified linear activation}. 70 | `softmax_linear` | linear transformation to produce logits. 71 | 72 | Here is a graph generated from TensorBoard describing the inference operation: 73 | 74 |
75 | ![TensorBoard graph of the inference operation](images/mnist_graph.png) 76 |
77 | 78 | 79 | # Evolve a neural network with a genetic algorithm 80 | 81 | Taken from https://github.com/harvitronix/neural-network-genetic-algorithm 82 | 83 | `train.py` 84 | `optimizer.py` 85 | `network.py` 86 | `main.py` 87 | `brute.py` 88 | 89 | This is an example of how we can use a genetic algorithm in an attempt to find the optimal network parameters for classification tasks. 90 | 91 | It's currently limited to only MLPs (ie. fully connected networks) and uses the Keras library to build, train and validate. 92 | 93 | On the easy MNIST dataset, we are able to quickly find a network that reaches > 98% accuracy. On the more challenging CIFAR10 dataset, we get to 56% after 10 generations (with population 20). 94 | 95 | For more, see this blog post: 96 | https://medium.com/@harvitronix/lets-evolve-a-neural-network-with-a-genetic-algorithm-code-included-8809bece164 97 | 98 | ## To run 99 | 100 | To run the brute force algorithm: 101 | 102 | ```python3 brute.py``` 103 | 104 | To run the genetic algorithm: 105 | 106 | ```python3 main.py``` 107 | 108 | You can set your network parameter choices by editing each of those files first. You can also choose whether to use the MNIST or CIFAR10 datasets. Simply set `dataset` to either `mnist` or `cifar10`. 109 | 110 | 111 | # Contribution 112 | Your comments (issues) and PRs are always welcome. 113 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Norman Heckscher. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== -------------------------------------------------------------------------------- /brute.py: --------------------------------------------------------------------------------
"""Iterate over every combination of hyperparameters."""
import itertools
import logging

from network import Network
from tqdm import tqdm

# Setup logging.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.DEBUG,
    filename='brute-log.txt'
)


def train_networks(networks, dataset):
    """Train each network, then print the five most accurate ones.

    Args:
        networks (list): Networks to train
        dataset (str): Dataset to use for training/evaluating
    """
    # The context manager closes the progress bar even if training raises.
    with tqdm(total=len(networks)) as pbar:
        for network in networks:
            network.train(dataset)
            network.print_network()
            pbar.update(1)

    # Sort our final population, best accuracy first.
    networks = sorted(networks, key=lambda x: x.accuracy, reverse=True)

    # Print out the top 5 networks.
    print_networks(networks[:5])


def print_networks(networks):
    """Print a list of networks.

    Args:
        networks (list): The population of networks

    """
    logging.info('-'*80)
    for network in networks:
        network.print_network()


def generate_network_list(nn_param_choices):
    """Generate a list of all possible networks.

    Args:
        nn_param_choices (dict): The parameter choices; must contain the
            keys 'nb_neurons', 'nb_layers', 'activation' and 'optimizer',
            each mapping to an iterable of candidate values.

    Returns:
        networks (list): A list of network objects, one per combination

    """
    # Keys listed outermost-first; itertools.product varies the last key
    # fastest, which matches the order of the former nested loops.
    param_keys = ('nb_neurons', 'nb_layers', 'activation', 'optimizer')
    combinations = itertools.product(
        *(nn_param_choices[key] for key in param_keys))

    networks = []
    for combination in combinations:
        # Instantiate a network object with set parameters.
        network_obj = Network()
        network_obj.create_set(dict(zip(param_keys, combination)))
        networks.append(network_obj)

    return networks


def main():
    """Brute force test every network."""
    dataset = 'cifar10'

    nn_param_choices = {
        'nb_neurons': [64, 128, 256, 512, 768, 1024],
        'nb_layers': [1, 2, 3, 4],
        'activation': ['relu', 'elu', 'tanh', 'sigmoid'],
        'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad',
                      'adadelta', 'adamax', 'nadam'],
    }

    logging.info("***Brute forcing networks***")

    networks = generate_network_list(nn_param_choices)

    train_networks(networks, dataset)


if __name__ == '__main__':
    main()

-------------------------------------------------------------------------------- /convert_to_records.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Converts MNIST data to TFRecords file format with Example protos.""" 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import os 23 | import sys 24 | 25 | import tensorflow as tf 26 | 27 | from tensorflow.contrib.learn.python.learn.datasets import mnist 28 | 29 | FLAGS = None 30 | 31 | 32 | def _int64_feature(value): 33 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 34 | 35 | 36 | def _bytes_feature(value): 37 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 38 | 39 | 40 | def convert_to(data_set, name): 41 | """Converts a dataset to tfrecords.""" 42 | images = data_set.images 43 | labels = data_set.labels 44 | num_examples = data_set.num_examples 45 | 46 | if images.shape[0] != num_examples: 47 | raise ValueError('Images size %d does not match label size %d.' 
% 48 | (images.shape[0], num_examples)) 49 | rows = images.shape[1] 50 | cols = images.shape[2] 51 | depth = images.shape[3] 52 | 53 | filename = os.path.join(FLAGS.directory, name + '.tfrecords') 54 | print('Writing', filename) 55 | writer = tf.python_io.TFRecordWriter(filename) 56 | for index in range(num_examples): 57 | image_raw = images[index].tostring() 58 | example = tf.train.Example(features=tf.train.Features(feature={ 59 | 'height': _int64_feature(rows), 60 | 'width': _int64_feature(cols), 61 | 'depth': _int64_feature(depth), 62 | 'label': _int64_feature(int(labels[index])), 63 | 'image_raw': _bytes_feature(image_raw)})) 64 | writer.write(example.SerializeToString()) 65 | writer.close() 66 | 67 | 68 | def main(unused_argv): 69 | # Get the data. 70 | data_sets = mnist.read_data_sets(FLAGS.directory, 71 | dtype=tf.uint8, 72 | reshape=False, 73 | validation_size=FLAGS.validation_size) 74 | 75 | # Convert to Examples and write the result to TFRecords. 76 | convert_to(data_sets.train, 'train') 77 | convert_to(data_sets.validation, 'validation') 78 | convert_to(data_sets.test, 'test') 79 | 80 | 81 | if __name__ == '__main__': 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument( 84 | '--directory', 85 | type=str, 86 | default='/home/norman/MNIST_data', 87 | help='Directory to download data files and write the converted result' 88 | ) 89 | parser.add_argument( 90 | '--validation_size', 91 | type=int, 92 | default=5000, 93 | help="""\ 94 | Number of examples to separate from the training data for the validation 95 | set.\ 96 | """ 97 | ) 98 | FLAGS, unparsed = parser.parse_known_args() 99 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) -------------------------------------------------------------------------------- /images/Parallelism.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/normanheckscher/mnist-multi-gpu/3bbd69d852c9029bd3f86ca83786d33b15a54a8d/images/Parallelism.png -------------------------------------------------------------------------------- /images/mnist_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/normanheckscher/mnist-multi-gpu/3bbd69d852c9029bd3f86ca83786d33b15a54a8d/images/mnist_graph.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """Entry point to evolving the neural network. Start here.""" 2 | import logging 3 | from optimizer import Optimizer 4 | from tqdm import tqdm 5 | 6 | # Setup logging. 7 | logging.basicConfig( 8 | format='%(asctime)s - %(levelname)s - %(message)s', 9 | datefmt='%m/%d/%Y %I:%M:%S %p', 10 | level=logging.DEBUG, 11 | filename='log.txt' 12 | ) 13 | 14 | def train_networks(networks, dataset): 15 | """Train each network. 16 | 17 | Args: 18 | networks (list): Current population of networks 19 | dataset (str): Dataset to use for training/evaluating 20 | """ 21 | pbar = tqdm(total=len(networks)) 22 | for network in networks: 23 | network.train(dataset) 24 | pbar.update(1) 25 | pbar.close() 26 | 27 | def get_average_accuracy(networks): 28 | """Get the average accuracy for a group of networks. 29 | 30 | Args: 31 | networks (list): List of networks 32 | 33 | Returns: 34 | float: The average accuracy of a population of networks. 35 | 36 | """ 37 | total_accuracy = 0 38 | for network in networks: 39 | total_accuracy += network.accuracy 40 | 41 | return total_accuracy / len(networks) 42 | 43 | def generate(generations, population, nn_param_choices, dataset): 44 | """Generate a network with the genetic algorithm. 
45 | 46 | Args: 47 | generations (int): Number of times to evole the population 48 | population (int): Number of networks in each generation 49 | nn_param_choices (dict): Parameter choices for networks 50 | dataset (str): Dataset to use for training/evaluating 51 | 52 | """ 53 | optimizer = Optimizer(nn_param_choices) 54 | networks = optimizer.create_population(population) 55 | 56 | # Evolve the generation. 57 | for i in range(generations): 58 | logging.info("***Doing generation %d of %d***" % 59 | (i + 1, generations)) 60 | 61 | # Train and get accuracy for networks. 62 | train_networks(networks, dataset) 63 | 64 | # Get the average accuracy for this generation. 65 | average_accuracy = get_average_accuracy(networks) 66 | 67 | # Print out the average accuracy each generation. 68 | logging.info("Generation average: %.2f%%" % (average_accuracy * 100)) 69 | logging.info('-'*80) 70 | 71 | # Evolve, except on the last iteration. 72 | if i != generations - 1: 73 | # Do the evolution. 74 | networks = optimizer.evolve(networks) 75 | 76 | # Sort our final population. 77 | networks = sorted(networks, key=lambda x: x.accuracy, reverse=True) 78 | 79 | # Print out the top 5 networks. 80 | print_networks(networks[:5]) 81 | 82 | def print_networks(networks): 83 | """Print a list of networks. 84 | 85 | Args: 86 | networks (list): The population of networks 87 | 88 | """ 89 | logging.info('-'*80) 90 | for network in networks: 91 | network.print_network() 92 | 93 | def main(): 94 | """Evolve a network.""" 95 | generations = 10 # Number of times to evole the population. 96 | population = 20 # Number of networks in each generation. 
97 | dataset = 'mnist' 98 | 99 | nn_param_choices = { 100 | 'nb_neurons': [64, 128, 256, 512, 768, 1024], 101 | 'nb_layers': [1, 2, 3, 4], 102 | 'activation': ['relu', 'elu', 'tanh', 'sigmoid'], 103 | 'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad', 104 | 'adadelta', 'adamax', 'nadam'], 105 | } 106 | 107 | logging.info("***Evolving %d generations with population %d***" % 108 | (generations, population)) 109 | 110 | generate(generations, population, nn_param_choices, dataset) 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /mnist_multi_gpu_batching_train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Norman Heckscher. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A binary to train MNIST using multiple GPU's with synchronous updates. 16 | 17 | Accuracy: 18 | Should achieve ~99.2% accuracy after 20K steps, unfortunately it's not at 19 | the moment. 20 | 21 | Speed: With batch_size 50. 
22 | 23 | System | Step Time (sec/batch) | Accuracy 24 | ------------------------------------------------------------------------- 25 | 1 GTX 1080 | 0.018-0.022 | ~xx.xx% at 20K steps (x hours) 26 | 2 GTX 1080 | 0.012-0.015 | ~xx.xx% at 20K steps (x hours) 27 | 28 | Usage: 29 | Please see the TensorFlow website for how to download the MNIST 30 | data set, compile and train models. 31 | 32 | """ 33 | 34 | from __future__ import absolute_import 35 | from __future__ import division 36 | from __future__ import print_function 37 | 38 | import os.path 39 | import re 40 | import time 41 | import numpy as np 42 | from datetime import datetime 43 | 44 | from tensorflow.examples.tutorials.mnist import input_data 45 | 46 | import tensorflow as tf 47 | 48 | # Constants used for dealing with the files, matches convert_to_records. 49 | TRAIN_FILE = 'train.tfrecords' 50 | VALIDATION_FILE = 'validation.tfrecords' 51 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 52 | # to differentiate the operations. Note that this prefix is removed from the 53 | # names of the summaries when visualizing a model. 54 | TOWER_NAME = 'tower' 55 | IMAGE_PIXELS = 784 56 | 57 | # Constants describing the training process. 58 | MOVING_AVERAGE_DECAY = 0.9999 # The decay to use for the moving average. 59 | NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays. 60 | LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor. 61 | INITIAL_LEARNING_RATE = 0.1 # Initial learning rate. 62 | 63 | # Global constants describing the MNIST data set. 
64 | NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 65 | NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000 66 | 67 | FLAGS = tf.app.flags.FLAGS 68 | 69 | tf.app.flags.DEFINE_integer('batch_size', 64, 70 | """Number of images to process in a batch.""") 71 | tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data', 72 | """Path to the MNIST data directory.""") 73 | tf.app.flags.DEFINE_string('train_dir', '/home/norman/MNIST_train', 74 | """Directory where to write event logs """ 75 | """and checkpoint.""") 76 | tf.app.flags.DEFINE_integer('num_gpus', 2, 77 | """How many GPUs to use.""") 78 | tf.app.flags.DEFINE_boolean('log_device_placement', False, 79 | """Whether to log device placement.""") 80 | tf.app.flags.DEFINE_boolean('tb_logging', False, 81 | """Whether to log to Tensorboard.""") 82 | tf.app.flags.DEFINE_integer('num_epochs', 10, 83 | """Number of epochs to run trainer.""") 84 | # 17/4/17 85 | # 1 gpu 86 | # Done training for 20 epochs, 22000 steps. 87 | # Total Duration (474.817 sec) 88 | # 2017-04-17 15:24:51.190879: precision = 9743.000v 89 | # Done training for 20 epochs, 22000 steps. 90 | # Total Duration (497.690 sec) 91 | # 2017-04-17 15:35:10.070366: precision = 9305.000 92 | # 2 gpu 93 | # Done training for 20 epochs, 22000 steps. 94 | # Total Duration (687.583 sec) 95 | # 2017-04-17 15:14:28.793936: precision = 9472.000 96 | # Done training for 20 epochs, 22000 steps. 97 | # Total Duration (672.720 sec) 98 | # 2017-04-17 15:52:16.096935: precision = 9672.000 99 | # 17/4/17 100 | 101 | # 18/4/17 102 | # 2 GPU 103 | # Done training for 10 epochs, 8593 steps. 104 | # Total Duration (339.430 sec) 105 | # 2017-04-18 10:50:53.269983: precision = 9677.000 106 | # Done training for 10 epochs, 8593 steps. 107 | # Total Duration (335.611 sec) 108 | # 2017-04-18 11:14:26.685982: precision = 9674.000 109 | # Done training for 10 epochs, 8593 steps. 
def read_and_decode(filename_queue):
    """Read one serialized example from the queue and decode it.

    Args:
      filename_queue: queue of TFRecord file names, as produced by
        tf.train.string_input_producer().

    Returns:
      A tuple (image, label), where:
      * image is a float32 tensor of shape [IMAGE_PIXELS] scaled to
        [-0.5, 0.5].
      * label is an int32 scalar tensor.
    """
    # Both keys are required, so no default values are given.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    _, record = tf.TFRecordReader().read(filename_queue)
    parsed = tf.parse_single_example(record, features=feature_spec)

    # The raw string holds IMAGE_PIXELS bytes; decode it into a flat uint8
    # vector and pin the static shape so downstream batching knows it.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([IMAGE_PIXELS])

    # No distortions are applied, so the image stays flattened.
    # Rescale from [0, 255] to [-0.5, 0.5].
    image = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5

    # The label is parsed as int64; narrow it to int32.
    label = tf.cast(parsed['label'], tf.int32)

    return image, label
def inference(images):
    """Build the MNIST model.

    The network is conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 -> local4 -> softmax_linear. All variables are created through
    tf.get_variable() (via the helpers below) so that they can be shared
    across GPU towers.

    Args:
      images: Images returned from MNIST or inputs(); reshaped here to
        [-1, 28, 28, 1].

    Returns:
      Logits (unscaled; no softmax is applied).
    """
    # Reshape to use within a convolutional neural net. The last dimension is
    # the number of channels: one, since the images are grayscale.
    x_image = tf.reshape(images, [-1, 28, 28, 1])

    # conv1: 5x5 kernels, 1 -> 32 channels.
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 1, 32],
                                             stddev=5e-2,
                                             wd=0.0)
        biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1: 2x2 max pooling with stride 2 (28x28 -> 14x14).
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2: 5x5 kernels, 32 -> 64 channels.
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 32, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, strides=[1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2. This op was previously named 'norm1' (copy-paste slip), which
    # made it appear as a duplicate of the first normalization op in the
    # graph. It owns no variables, so renaming does not affect checkpoints.
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')

    # pool2: 2x2 max pooling with stride 2 (14x14 -> 7x7).
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3: fully connected, 7*7*64 -> 1024.
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix
        # multiply.
        reshape = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 1024],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [1024],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4: fully connected, 1024 -> 10, followed by ReLU.
    # NOTE(review): this layer narrows to only 10 units before the final
    # linear layer; it is kept as is so existing checkpoints stay loadable.
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[1024, 10],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)
        _activation_summary(local4)

    # Linear layer (WX + b). Softmax is not applied here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled
    # logits and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [10, 10],
                                              stddev=1 / 192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
def loss(logits, labels):
    """Register the mean cross-entropy loss and return the collected total.

    The batch-mean cross entropy is appended to the 'losses' collection, and
    the sum of everything currently in that collection is returned.

    Args:
      logits: Logits from inference().
      labels: Labels from inputs(). 1-D tensor of shape [batch_size].

    Returns:
      Loss tensor of type float.
    """
    # Average the per-example cross entropy across the batch.
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.cast(labels, tf.int64), logits=logits,
        name='cross_entropy_per_example')
    tf.add_to_collection(
        'losses', tf.reduce_mean(per_example, name='cross_entropy'))

    # Total loss = cross entropy plus every other term registered in the
    # 'losses' collection (e.g. the weight decay terms).
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def average_gradients(tower_grads):
    """Calculate average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all
    towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer
        list is over towers; each inner list holds that tower's
        (gradient, variable) pairs, in the same variable order for every
        tower.

    Returns:
      List of pairs of (gradient, variable) where the gradient has been
      averaged across all towers. A variable that received no gradient on any
      tower is returned as (None, variable).
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # compute_gradients() yields None for a variable the loss does not
        # depend on; such entries cannot be stacked, so leave them out.
        grads = [g for g, _ in grad_and_vars if g is not None]

        # The Variables are redundant because they are shared across towers,
        # so just use the first tower's pointer to the Variable.
        v = grad_and_vars[0][1]

        if not grads:
            average_grads.append((None, v))
            continue

        # Stack along a new leading 'tower' dimension and average over it.
        grad = tf.reduce_mean(tf.stack(grads, 0), 0)
        average_grads.append((grad, v))
    return average_grads
484 | num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / 485 | FLAGS.batch_size) 486 | decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) 487 | 488 | # Decay the learning rate exponentially based on the number of steps. 489 | lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, 490 | global_step, 491 | decay_steps, 492 | LEARNING_RATE_DECAY_FACTOR, 493 | staircase=True) 494 | 495 | opt = tf.train.MomentumOptimizer(lr,0.9,use_nesterov=True,use_locking=True) 496 | 497 | # Calculate the gradients for each model tower. 498 | tower_grads = [] 499 | with tf.variable_scope(tf.get_variable_scope()): 500 | for i in xrange(FLAGS.num_gpus): 501 | with tf.device('/gpu:%d' % i): 502 | with tf.name_scope( 503 | '%s_%d' % (TOWER_NAME, i)) as scope: 504 | # Calculate the loss for one tower of the CIFAR model. 505 | # This function constructs the entire CIFAR model but 506 | # shares the variables across all towers. 507 | loss = tower_loss(scope) 508 | 509 | # Reuse variables for the next tower. 510 | tf.get_variable_scope().reuse_variables() 511 | 512 | # Retain the summaries from the final tower. 513 | summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, 514 | scope) 515 | 516 | # Calculate the gradients for the batch of data on this 517 | # MNIST tower. 518 | grads = opt.compute_gradients(loss, gate_gradients=0) 519 | 520 | # Keep track of the gradients across all towers. 521 | tower_grads.append(grads) 522 | 523 | # We must calculate the mean of each gradient. Note that this is the 524 | # synchronization point across all towers. 525 | grads = average_gradients(tower_grads) 526 | 527 | # Add histograms for gradients. 528 | if FLAGS.tb_logging: 529 | for grad, var in grads: 530 | if grad is not None: 531 | summaries.append( 532 | tf.summary.histogram(var.op.name + '/gradients', grad)) 533 | # Add a summary to track the learning rate. 
534 | summaries.append(tf.summary.scalar('learning_rate', lr)) 535 | 536 | train_op = opt.apply_gradients(grads, global_step=global_step) 537 | 538 | # Add histograms for trainable variables. 539 | if FLAGS.tb_logging: 540 | for var in tf.trainable_variables(): 541 | summaries.append(tf.summary.histogram(var.op.name, var)) 542 | 543 | # Create a saver. 544 | saver = tf.train.Saver(tf.global_variables(),sharded=True) 545 | 546 | # Build the summary operation from the last tower summaries. 547 | summary_op = tf.summary.merge(summaries) 548 | 549 | # Build an initialization operation to run below. 550 | # init = tf.global_variables_initializer() 551 | 552 | # The op for initializing the variables. 553 | init_op = tf.group(tf.global_variables_initializer(), 554 | tf.local_variables_initializer()) 555 | 556 | # Start running operations on the Graph. allow_soft_placement must be 557 | # set to True to build towers on GPU, as some of the ops do not have GPU 558 | # implementations. 559 | sess = tf.Session(config=tf.ConfigProto( 560 | allow_soft_placement=True, 561 | log_device_placement=FLAGS.log_device_placement)) 562 | sess.run(init_op) 563 | 564 | # Start input enqueue threads. 565 | coord = tf.train.Coordinator() 566 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 567 | 568 | summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) 569 | 570 | try: 571 | step = 0 572 | while not coord.should_stop(): 573 | start_time = time.time() 574 | 575 | # Run one step of the model. The return values are 576 | # the activations from the `train_op` (which is 577 | # discarded) and the `loss` op. To inspect the values 578 | # of your ops or variables, you may include them in 579 | # the list passed to sess.run() and the value tensors 580 | # will be returned in the tuple from the call. 
def evaluate():
    """Evaluate the latest checkpoint on the MNIST test set.

    Builds a fresh graph, restores the most recent checkpoint found in
    FLAGS.train_dir and prints the fraction of test images whose top-1
    prediction matches the label. Prints a message and returns early if no
    checkpoint is found.
    """
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)

        # read_data_sets() with its default float32 dtype scales pixels to
        # [0, 1], whereas the training pipeline (read_and_decode) feeds the
        # model values in [-0.5, 0.5]. Shift the test images so evaluation
        # sees the same input range the model was trained on.
        images = mnist.test.images.astype(np.float32) - 0.5
        # in_top_k expects int32/int64 targets.
        labels = mnist.test.labels.astype(np.int32)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = inference(images)

        # Calculate predictions: one boolean per example.
        top_k_op = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('No checkpoint file found')
                return

            # Restores from checkpoint.
            saver.restore(sess, ckpt.model_checkpoint_path)

            num_correct = int(np.sum(sess.run(top_k_op)))

        # Compute precision as a ratio; the raw count of correct predictions
        # used to be printed under this label.
        num_examples = labels.shape[0]
        precision = float(num_correct) / num_examples
        print('%s: precision = %.4f (%d of %d correct)' % (
            datetime.now(), precision, num_correct, num_examples))
5 | ''' 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | np.random.seed(1337) # for reproducibility 10 | 11 | from tensorflow.contrib.keras.api.keras.datasets import mnist 12 | from keras.models import Sequential 13 | from keras.layers import Dense, Dropout, Activation, Flatten 14 | from keras.layers import Convolution2D, MaxPooling2D, BatchNormalization 15 | from keras.utils import np_utils 16 | from keras import backend as K 17 | from keras.callbacks import TensorBoard, ModelCheckpoint 18 | tensorboard = TensorBoard(log_dir='/home/norman/MNIST_train', histogram_freq=1, 19 | write_graph=True, write_images=False, embeddings_freq=1) 20 | import time 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--extras', help='(absolute) path to keras-extras') 24 | parser.add_argument('--gpus', help='number of GPUs') 25 | parser.print_help() 26 | args = parser.parse_args() 27 | 28 | import sys 29 | sys.path.append(args.extras) 30 | 31 | from multi_gpu import make_parallel 32 | 33 | #ngpus = int(args.gpus) 34 | ngpus = int(2) 35 | print("Using %i GPUs" %ngpus) 36 | 37 | batch_size = 128 38 | nb_classes = 10 39 | nb_epoch = 12 40 | 41 | # input image dimensions 42 | img_rows, img_cols = 28, 28 43 | # number of convolutional filters to use 44 | nb_filters = 32 45 | # size of pooling area for max pooling 46 | pool_size = (2, 2) 47 | # convolution kernel size 48 | kernel_size = (3, 3) 49 | 50 | # the data, shuffled and split between train and test sets 51 | (X_train, y_train), (X_test, y_test) = mnist.load_data() 52 | 53 | if K.image_dim_ordering() == 'th': 54 | X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols) 55 | X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols) 56 | input_shape = (1, img_rows, img_cols) 57 | else: 58 | X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1) 59 | X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1) 60 | input_shape = (img_rows, 
img_cols, 1) 61 | 62 | X_train = X_train.astype('float32') 63 | X_test = X_test.astype('float32') 64 | X_train /= 255 65 | X_test /= 255 66 | print('X_train shape:', X_train.shape) 67 | print(X_train.shape[0], 'train samples') 68 | print(X_test.shape[0], 'test samples') 69 | 70 | # convert class vectors to binary class matrices 71 | Y_train = np_utils.to_categorical(y_train, nb_classes) 72 | Y_test = np_utils.to_categorical(y_test, nb_classes) 73 | 74 | model = Sequential() 75 | 76 | model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]), 77 | padding='valid', 78 | input_shape=input_shape)) 79 | model.add(Activation('relu')) 80 | model.add(Convolution2D(256, (kernel_size[0], kernel_size[1]))) 81 | model.add(Activation('relu')) 82 | model.add(BatchNormalization()) 83 | model.add(Convolution2D(128, (kernel_size[0], kernel_size[1]))) 84 | model.add(Activation('relu')) 85 | model.add(MaxPooling2D(pool_size=pool_size)) 86 | model.add(Dropout(0.25)) 87 | model.add(Flatten()) 88 | model.add(Dense(128)) 89 | model.add(Activation('relu')) 90 | model.add(Dropout(0.5)) 91 | model.add(Dense(nb_classes)) 92 | model.add(Activation('softmax')) 93 | 94 | if ngpus > 1: 95 | model = make_parallel(model,ngpus) 96 | 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adadelta', 99 | metrics=['accuracy']) 100 | 101 | start_time = time.time() 102 | model.fit(X_train, Y_train, batch_size=batch_size*ngpus, epochs=nb_epoch, 103 | verbose=1, validation_data=(X_test, Y_test))#, callbacks=[tensorboard]) 104 | score = model.evaluate(X_test, Y_test, verbose=0) 105 | print('Test score:', score[0]) 106 | print('Test accuracy:', score[1]) 107 | duration = time.time() - start_time 108 | print('Total Duration (%.3f sec)' % duration) 109 | -------------------------------------------------------------------------------- /mnist_multi_gpu_sonnet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Norman Heckscher. 
All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A binary to train MNIST using multiple GPU's with synchronous updates. 16 | 17 | Accuracy: 18 | 19 | Done training with 2 GPUs, for 20 epochs, 11000 steps. 20 | Total Duration (327.396 sec) 21 | 2017-04-21 20:46:18.466392: precision = 9848.000 22 | Done training with 1 GPUs, for 20 epochs, 22000 steps. 23 | Total Duration (500.122 sec) 24 | 2017-04-21 20:56:40.639580: precision = 9884.000 25 | 26 | Speed: With batch_size 50. 27 | 28 | System | Step Time (sec/batch) | Accuracy 29 | ------------------------------------------------------------------------- 30 | 1 GTX 1080 | 258.136 sec | ~94.58% at 11K steps 31 | 2 GTX 1080 | 189.572 sec | ~94.59% at 11K steps 32 | 33 | Usage: 34 | Please see the TensorFlow website for how to download the MNIST 35 | data set, compile and train models. 36 | 37 | """ 38 | 39 | from __future__ import absolute_import 40 | from __future__ import division 41 | from __future__ import print_function 42 | 43 | import os.path 44 | import re 45 | import time 46 | import numpy as np 47 | from datetime import datetime 48 | 49 | from tensorflow.examples.tutorials.mnist import input_data 50 | 51 | import tensorflow as tf 52 | import sonnet as snt 53 | 54 | # Constants used for dealing with the files, matches convert_to_records. 
def read_and_decode(filename_queue):
    """Decode a single (image, label) pair from a TFRecord filename queue.

    Args:
      filename_queue: queue of TFRecord file names to read from.

    Returns:
      A tuple (image, label), where:
      * image is a float32 tensor of shape [IMAGE_PIXELS] in the range
        [-0.5, 0.5].
      * label is an int32 scalar tensor.
    """
    record_reader = tf.TFRecordReader()
    _, raw_record = record_reader.read(filename_queue)

    # Both keys are required, so no defaults are specified.
    schema = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    example = tf.parse_single_example(raw_record, features=schema)

    # Turn the scalar byte string into a uint8 vector of known length.
    raw_pixels = tf.decode_raw(example['image_raw'], tf.uint8)
    raw_pixels.set_shape([IMAGE_PIXELS])

    # The image is left flattened (no distortions are applied) and mapped
    # from [0, 255] to [-0.5, 0.5].
    image = tf.cast(raw_pixels, tf.float32) * (1. / 255) - 0.5

    # The label is stored as int64; convert it to an int32 scalar.
    label = tf.cast(example['label'], tf.int32)

    return image, label
def custom_build(inputs, is_training, keep_prob):
    """A custom build method to wrap into a sonnet Module.

    Stacks three Conv2D + BatchNorm + ReLU stages (the first two followed by
    2x2 max pooling), then flattens, applies dropout and a final 10-way
    linear layer.

    Args:
      inputs: input images; reshaped here to [-1, 28, 28, 1].
      is_training: passed to every snt.BatchNorm call.
      keep_prob: keep probability passed to tf.nn.dropout.

    Returns:
      Output of the final snt.Linear(output_size=10) layer.
    """
    # The docstring above used to sit after this statement, where Python
    # treats it as a discarded expression instead of the function's __doc__.
    x_inputs = tf.reshape(inputs, [-1, 28, 28, 1])

    outputs = snt.Conv2D(output_channels=32, kernel_shape=4, stride=2)(x_inputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = tf.nn.max_pool(outputs, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')

    outputs = snt.Conv2D(output_channels=64, kernel_shape=4, stride=2)(outputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = tf.nn.max_pool(outputs, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')

    # 1x1 convolution widening to 1024 channels.
    outputs = snt.Conv2D(output_channels=1024, kernel_shape=1, stride=1)(outputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)

    outputs = snt.BatchFlatten()(outputs)
    outputs = tf.nn.dropout(outputs, keep_prob=keep_prob)
    outputs = snt.Linear(output_size=10)(outputs)
    return outputs
def loss(logits, labels):
    """Add the mean cross entropy to 'losses' and return the collection sum.

    Args:
        logits: Logits from the model.
        labels: Sparse integer labels, 1-D tensor of shape [batch_size].

    Returns:
        Loss tensor: the sum of everything currently in the 'losses'
        collection (the cross entropy added here plus any terms other code
        registered there, e.g. weight decay).
    """
    # Calculate the average cross entropy loss across the batch.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)

    return tf.add_n(tf.get_collection('losses'), name='total_loss')


def average_gradients(tower_grads):
    """Calculate average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over towers, the inner list is over the variables of one
            tower (as returned by Optimizer.compute_gradients).

    Returns:
        List of (gradient, variable) pairs where the gradient has been
        averaged across the towers that produced one. A variable for which no
        tower produced a gradient is returned as (None, variable).
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Each grad_and_vars looks like:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # Identity test on purpose: comparing a tensor with `!= None` is not
        # a reliable "is there a gradient" check.
        grads = [tf.expand_dims(g, 0)
                 for g, _ in grad_and_vars if g is not None]

        # The Variables are redundant because they are shared across towers,
        # so the first tower's pointer to the Variable is enough.
        v = grad_and_vars[0][1]

        if not grads:
            # No tower contributed; concatenating an empty list would fail.
            average_grads.append((None, v))
            continue

        # Average over the leading 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)
        average_grads.append((grad, v))
    return average_grads
def train():
    """Train the MNIST model on FLAGS.num_gpus towers with synchronous updates.

    One tower is built per GPU, the tower gradients are averaged and applied
    with a Nesterov momentum optimizer, and the loop runs until the input
    queues raise tf.errors.OutOfRangeError. A checkpoint is written to
    FLAGS.train_dir every 1000 steps and once more when the input is
    exhausted, so a later evaluate() restores the final weights.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 (FLAGS.batch_size * FLAGS.num_gpus))
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(learning_rate=INITIAL_LEARNING_RATE,
                                        global_step=global_step,
                                        decay_steps=decay_steps,
                                        decay_rate=LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        opt = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True,
                                         use_locking=True)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST
                        # model. NOTE(review): no explicit
                        # reuse_variables() call here; the original author
                        # assumed Sonnet handles sharing - confirm.
                        tower_total_loss = tower_loss(scope)

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(
                            tf.GraphKeys.SUMMARIES, scope)

                        # Calculate the gradients for the batch of data on
                        # this MNIST tower (2 == Optimizer.GATE_GRAPH).
                        grads = opt.compute_gradients(tower_total_loss,
                                                      gate_gradients=2)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add histograms for gradients.
        if FLAGS.tb_logging:
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients',
                                             grad))
        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if FLAGS.tb_logging:
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), sharded=True)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # The op for initializing the variables. Local variables are
        # initialized as well as global ones.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=gpu_options))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')

        step = 0
        try:
            while not coord.should_stop():
                start_time = time.time()

                # Run one step of the model; the train_op result is
                # discarded, the loss of the last tower is kept.
                _, loss_value = sess.run([train_op, tower_total_loss])

                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Print an overview fairly often.
                if step % 100 == 0:
                    num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus
                    format_str = (
                        '%s: step %d, epochs %d, loss = %.3f '
                        '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step,
                                        FLAGS.num_epochs,
                                        loss_value,
                                        examples_per_sec, sec_per_batch))
                if FLAGS.tb_logging and step % 10 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0:
                    saver.save(sess, checkpoint_path, global_step=step)

                step += 1
        except tf.errors.OutOfRangeError:
            # The input queues are exhausted, i.e. training is complete.
            # Save the final weights here: the step count at which this
            # happens is not known in advance, and evaluate() restores the
            # latest checkpoint.
            saver.save(sess, checkpoint_path, global_step=step)
            print('Done training with %d GPUs, for %d epochs, %d steps.' % (
                FLAGS.num_gpus, FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop and wait for them, then
            # release the writer and the session even if a step failed.
            coord.request_stop()
            coord.join(threads)
            summary_writer.close()
            sess.close()


def evaluate():
    """Evaluate the latest checkpoint on the MNIST test set.

    Prints the fraction of test images whose top-1 prediction matches the
    label, or a message if FLAGS.train_dir holds no checkpoint.
    """
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
        images = mnist.test.images
        labels = mnist.test.labels

        # Build a Graph that computes the logits predictions from the
        # inference model. custom_build is wrapped into a sonnet Module.
        module_with_build_args = snt.Module(custom_build, name='simple_net')
        test_model_outputs = module_with_build_args(images, is_training=False,
                                                    keep_prob=tf.constant(1.0))

        # Calculate predictions: one boolean per test example.
        top_k_op = tf.nn.in_top_k(predictions=test_model_outputs,
                                  targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('No checkpoint file found')
                return

            # Restores from checkpoint.
            saver.restore(sess, ckpt.model_checkpoint_path)

            # Precision is the share of correct predictions, so average the
            # boolean hits instead of summing them.
            precision = np.mean(sess.run(top_k_op))
            print('%s: precision = %.3f' % (datetime.now(), precision))


def main(argv=None):  # pylint: disable=unused-argument
    """Train the model, report the wall-clock training time, then evaluate."""
    start_time = time.time()
    train()
    duration = time.time() - start_time
    print('Total Duration (%.3f sec)' % duration)
    evaluate()


if __name__ == '__main__':
    tf.app.run()
class Network(object):
    """Represent a network and let us operate on it.

    Currently only works for an MLP.
    """

    def __init__(self, nn_param_choices=None):
        """Initialize our network.

        Args:
            nn_param_choices (dict): Parameters for the network, includes:
                nb_neurons (list): [64, 128, 256]
                nb_layers (list): [1, 2, 3, 4]
                activation (list): ['relu', 'elu']
                optimizer (list): ['rmsprop', 'adam']
        """
        self.accuracy = 0.
        self.nn_param_choices = nn_param_choices
        self.network = {}  # (dic): represents MLP network parameters

    def create_random(self):
        """Create a random network.

        Picks one value per parameter from ``nn_param_choices``. When no
        choices were supplied (the constructor default) there is nothing to
        pick from and the network is left unchanged.
        """
        choices = self.nn_param_choices or {}
        for key in choices:
            self.network[key] = random.choice(choices[key])

    def create_set(self, network):
        """Set network properties.

        Args:
            network (dict): The network parameters

        """
        self.network = network

    def train(self, dataset):
        """Train the network and record the accuracy.

        Training is skipped when an accuracy other than 0 is already stored.

        Args:
            dataset (str): Name of dataset to use.

        """
        if self.accuracy == 0.:
            self.accuracy = train_and_score(self.network, dataset)

    def print_network(self):
        """Log the network parameters and its accuracy as a percentage."""
        logging.info(self.network)
        # Pass the value as a logging argument so the message is only
        # formatted when the record is actually emitted.
        logging.info("Network accuracy: %.2f%%", self.accuracy * 100)
def eval_once(saver, top_k_op):
    """Run Eval once.

    Restores the latest checkpoint from FLAGS.checkpoint_dir and prints the
    fraction of examples for which top_k_op is true. Prints a message and
    returns if no checkpoint is found.

    Args:
        saver: Saver.
        top_k_op: Top K op.
    """
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not (ckpt and ckpt.model_checkpoint_path):
            print('No checkpoint file found')
            return

        # Restores from checkpoint.
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Precision is the share of correct predictions, so average the
        # boolean hits instead of summing them.
        precision = np.mean(sess.run(top_k_op))
        print('%s: precision = %.3f' % (datetime.now(), precision))


def evaluate():
    """Eval MNIST on the test set using the latest checkpoint."""
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
        images = mnist.test.images
        labels = mnist.test.labels

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = model.inference(images, keep_prob=1.0)

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        eval_once(saver, top_k_op)


def main(argv=None):  # pylint: disable=unused-argument
    """Entry point for tf.app.run(): run a single evaluation."""
    evaluate()


if __name__ == '__main__':
    tf.app.run()
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A binary to train MNIST using multiple GPU's with synchronous updates. 16 | 17 | Accuracy: 18 | mnist_multi_gpu_train.py achieves ~xx% accuracy after 20K steps (xxx 19 | epochs of data) as judged by mnist_multi_gpu_batching_eval.py. 20 | 21 | Speed: With batch_size 50. 22 | 23 | System | Step Time (sec/batch) | Accuracy 24 | -------------------------------------------------------------------- 25 | 1 GTX 1080 | 0.08-0.10 | ~xx% at 20K steps (x hours) 26 | 2 GTX 1080 | 0.08-0.10 | ~xx% at 20K steps (x hours) 27 | 28 | Usage: 29 | Please see the tutorial and website for how to download the MNIST 30 | data set, compile the program and train the model. 
def tower_loss(scope):
    """Calculate the total loss on a single tower running the MNIST model.

    Args:
        scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for MNIST.
    images, labels = model.inputs(FLAGS.batch_size)

    # Build inference Graph.
    logits = model.inference(images, keep_prob=0.5)

    # Build the portion of the Graph calculating the losses. The return value
    # is not needed: the total is assembled from the collection below.
    model.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss.
    if FLAGS.tb_logging:
        for loss_tensor in losses + [total_loss]:
            # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
            # training session. This helps the clarity of presentation on
            # tensorboard.
            loss_name = re.sub('%s_[0-9]*/' % model.TOWER_NAME, '',
                               loss_tensor.op.name)
            tf.summary.scalar(loss_name, loss_tensor)

    return total_loss


def average_gradients(tower_grads):
    """Calculate average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over towers, the inner list is over the variables of one
            tower (as returned by Optimizer.compute_gradients).

    Returns:
        List of (gradient, variable) pairs where the gradient has been
        averaged across the towers that produced one. A variable for which no
        tower produced a gradient is returned as (None, variable).
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Each grad_and_vars looks like:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # A gradient can be None; skip those instead of handing None to
        # tf.expand_dims.
        grads = [tf.expand_dims(g, 0)
                 for g, _ in grad_and_vars if g is not None]

        # The Variables are redundant because they are shared across towers,
        # so the first tower's pointer to the Variable is enough.
        v = grad_and_vars[0][1]

        if not grads:
            # No tower contributed; concatenating an empty list would fail.
            average_grads.append((None, v))
            continue

        # Average over the leading 'tower' dimension.
        grad = tf.concat(grads, 0)
        grad = tf.reduce_mean(grad, 0)
        average_grads.append((grad, v))
    return average_grads
187 | grads = average_gradients(tower_grads) 188 | 189 | # Add histograms for gradients. 190 | if (FLAGS.tb_logging): 191 | for grad, var in grads: 192 | if grad is not None: 193 | summaries.append( 194 | tf.summary.histogram(var.op.name + '/gradients', grad)) 195 | 196 | # Apply the gradients to adjust the shared variables. 197 | train_op = opt.apply_gradients(grads, global_step=global_step) 198 | 199 | # Add histograms for trainable variables. 200 | if (FLAGS.tb_logging): 201 | for var in tf.trainable_variables(): 202 | summaries.append(tf.summary.histogram(var.op.name, var)) 203 | 204 | # Create a saver. 205 | saver = tf.train.Saver(tf.global_variables()) 206 | 207 | # Build the summary operation from the last tower summaries. 208 | summary_op = tf.summary.merge(summaries) 209 | 210 | # Build an initialization operation to run below. 211 | init = tf.global_variables_initializer() 212 | 213 | # Start running operations on the Graph. allow_soft_placement must be 214 | # set to True to build towers on GPU, as some of the ops do not have GPU 215 | # implementations. 216 | sess = tf.Session(config=tf.ConfigProto( 217 | allow_soft_placement=True, 218 | log_device_placement=FLAGS.log_device_placement)) 219 | sess.run(init) 220 | 221 | # Start the queue runners. 
222 | tf.train.start_queue_runners(sess=sess) 223 | 224 | summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) 225 | 226 | for step in xrange(FLAGS.max_steps): 227 | start_time = time.time() 228 | _, loss_value = sess.run([train_op, loss]) 229 | duration = time.time() - start_time 230 | 231 | assert not np.isnan(loss_value), 'Model diverged with loss = NaN' 232 | 233 | if step % 50 == 0: 234 | num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus 235 | examples_per_sec = num_examples_per_step / duration 236 | sec_per_batch = duration / FLAGS.num_gpus 237 | 238 | format_str = ( 239 | '%s: step %d, loss = %.4f (%.1f examples/sec; %.3f ' 240 | 'sec/batch)') 241 | print(format_str % (datetime.now(), step, loss_value, 242 | examples_per_sec, sec_per_batch)) 243 | if (FLAGS.tb_logging): 244 | if step % 5 == 0: 245 | summary_str = sess.run(summary_op) 246 | summary_writer.add_summary(summary_str, step) 247 | 248 | # Save the model checkpoint periodically. 249 | if step % 1000 == 0 or (step + 1) == FLAGS.max_steps: 250 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 251 | saver.save(sess, checkpoint_path, global_step=step) 252 | 253 | 254 | def main(argv=None): # pylint: disable=unused-argument 255 | train() 256 | 257 | 258 | if __name__ == '__main__': 259 | tf.app.run() 260 | -------------------------------------------------------------------------------- /older/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Norman Heckscher. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Builds the MNIST network. 16 | 17 | Summary of available functions: 18 | 19 | # Compute input images and labels for training. If you would like to run 20 | # evaluations, use inputs() instead. 21 | inputs, labels = distorted_inputs() 22 | 23 | # Compute inference on the model inputs to make a prediction. 24 | predictions = inference(inputs) 25 | 26 | # Compute the total loss of the prediction with respect to the labels. 27 | loss = loss(predictions, labels) 28 | 29 | # Create a graph to run one step of training with respect to the loss. 30 | train_op = train(loss, global_step) 31 | """ 32 | 33 | from __future__ import absolute_import 34 | from __future__ import division 35 | from __future__ import print_function 36 | 37 | import re 38 | 39 | import tensorflow as tf 40 | from tensorflow.examples.tutorials.mnist import input_data 41 | 42 | FLAGS = tf.app.flags.FLAGS 43 | # tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data', 44 | # """Path to the MNIST data directory.""") 45 | 46 | # Global constants describing the MNIST data set. 47 | IMAGE_SIZE = 28 48 | NUM_CLASSES = 10 49 | NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 50 | NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000 51 | 52 | # Constants describing the training process. 53 | INITIAL_LEARNING_RATE = 0.0001 # Initial learning rate. 54 | 55 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name 56 | # to differentiate the operations. 
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal variable and optionally register L2 decay.

    The variable is created through _variable_on_cpu(). When `wd` is given,
    the variable's L2 loss scaled by `wd` is added to the 'losses' graph
    collection so that it can later be summed into the total loss.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of the truncated Gaussian initializer
      wd: multiplier for the L2 loss of the variable. If None, no weight
          decay term is registered for this variable.

    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    variable = _variable_on_cpu(name, shape, initializer)

    # Nothing to register when the caller opted out of weight decay.
    if wd is None:
        return variable

    penalty = tf.multiply(tf.nn.l2_loss(variable), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return variable
def loss(logits, labels):
    """Return the total loss: mean cross entropy plus collected loss terms.

    The batch-mean sparse softmax cross entropy is appended to the 'losses'
    graph collection, and the sum of everything currently held in that
    collection (for example weight-decay terms registered while building
    the variables) is returned.

    Args:
      logits: Logits from inference().
      labels: Labels from MNIST or inputs(). 1-D tensor
              of shape [batch_size]

    Returns:
      Loss tensor of type float.
    """
    # The sparse cross-entropy op is fed integer class ids.
    int_labels = tf.cast(labels, tf.int32)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=int_labels, logits=logits, name='cross_entropy_per_example')

    # Register the batch average alongside the other loss terms.
    tf.add_to_collection(
        'losses', tf.reduce_mean(per_example, name='cross_entropy'))

    # Total loss = cross entropy + every other term in the collection.
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
175 | 176 | # Reshape to use within a convolutional neural net. 177 | # Last dimension is for "features" - there is only one here, since images 178 | # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc. 179 | x_image = tf.reshape(images, [-1, 28, 28, 1]) 180 | 181 | # conv1 182 | with tf.variable_scope('conv1') as scope: 183 | kernel = _variable_with_weight_decay('weights', 184 | shape=[5, 5, 1, 32], 185 | stddev=5e-2, 186 | wd=0.0) 187 | biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.0)) 188 | conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1], 189 | padding='SAME') 190 | pre_activation = tf.nn.bias_add(conv, biases) 191 | conv1 = tf.nn.relu(pre_activation, name=scope.name) 192 | _activation_summary(conv1) 193 | 194 | # pool1 195 | pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], 196 | padding='SAME', name='pool1') 197 | 198 | # norm1 199 | norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, 200 | name='norm1') 201 | 202 | # conv2 203 | with tf.variable_scope('conv2') as scope: 204 | kernel = _variable_with_weight_decay('weights', 205 | shape=[5, 5, 32, 64], 206 | stddev=5e-2, 207 | wd=0.0) 208 | conv = tf.nn.conv2d(norm1, kernel, strides=[1, 1, 1, 1], padding='SAME') 209 | biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1)) 210 | pre_activation = tf.nn.bias_add(conv, biases) 211 | conv2 = tf.nn.relu(pre_activation, name=scope.name) 212 | _activation_summary(conv2) 213 | 214 | # norm2 215 | norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, 216 | name='norm1') 217 | 218 | # pool2 219 | pool2 = tf.nn.max_pool(norm2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], 220 | padding='SAME', name='pool2') 221 | 222 | # local3 223 | with tf.variable_scope('local3') as scope: 224 | # Move everything into depth so we can perform a single matrix multiply. 
class Optimizer():
    """Class that implements genetic algorithm for MLP optimization."""

    def __init__(self, nn_param_choices, retain=0.4,
                 random_select=0.1, mutate_chance=0.2):
        """Create an optimizer.

        Args:
            nn_param_choices (dict): Possible network parameters, mapping
                each parameter name to the list of values it may take
            retain (float): Fraction of the population to retain after
                each generation
            random_select (float): Probability of a rejected network
                remaining in the population
            mutate_chance (float): Probability a network will be
                randomly mutated

        """
        self.mutate_chance = mutate_chance
        self.random_select = random_select
        self.retain = retain
        self.nn_param_choices = nn_param_choices

    def create_population(self, count):
        """Create a population of random networks.

        Args:
            count (int): Number of networks to generate, aka the
                size of the population

        Returns:
            (list): Population of network objects

        """
        pop = []
        for _ in range(count):
            # Create a random network and add it to the population.
            network = Network(self.nn_param_choices)
            network.create_random()
            pop.append(network)

        return pop

    @staticmethod
    def fitness(network):
        """Return the accuracy, which is our fitness function."""
        return network.accuracy

    def grade(self, pop):
        """Find average fitness for a population.

        Args:
            pop (list): The population of networks

        Returns:
            (float): The average accuracy of the population

        Raises:
            ValueError: If the population is empty, since an average is
                undefined in that case.

        """
        if not pop:
            raise ValueError('cannot grade an empty population')

        summed = sum(self.fitness(network) for network in pop)
        return summed / float(len(pop))

    def breed(self, mother, father):
        """Make two children as parts of their parents.

        Each child parameter is picked at random from one of the parents.

        Args:
            mother (Network): First parent; its ``network`` dict is read
            father (Network): Second parent; its ``network`` dict is read

        Returns:
            (list): Two network objects

        """
        children = []
        for _ in range(2):
            # Pick every parameter for the kid from either parent.
            child = {
                param: random.choice(
                    [mother.network[param], father.network[param]])
                for param in self.nn_param_choices
            }

            # Now create a network object.
            network = Network(self.nn_param_choices)
            network.create_set(child)
            children.append(network)

        return children

    def mutate(self, network):
        """Randomly mutate one part of the network.

        The network is modified in place.

        Args:
            network (Network): The network whose parameters to mutate

        Returns:
            (Network): The same network object, after mutation

        """
        # Choose a random key.
        mutation = random.choice(list(self.nn_param_choices.keys()))

        # Mutate one of the params.
        network.network[mutation] = random.choice(
            self.nn_param_choices[mutation])

        return network

    def evolve(self, pop):
        """Evolve a population of networks.

        The fittest networks are retained, some of the rest survive by
        chance, survivors may be mutated in place, and the population is
        refilled with children bred from two distinct survivors.

        Args:
            pop (list): The current population of networks

        Returns:
            (list): The evolved population of networks, the same size
                as ``pop``

        """
        # Rank the networks from fittest to least fit.
        graded = sorted(pop, key=self.fitness, reverse=True)

        # Get the number we want to keep for the next gen.
        retain_length = int(len(graded) * self.retain)

        # Keep the top networks; each rejected one may survive by chance.
        # The `or` short-circuits, so a random number is drawn only for
        # networks below the retain cut-off.
        keep = [rank < retain_length or self.random_select > random.random()
                for rank in range(len(graded))]

        # Breeding needs two distinct parents. If selection left fewer
        # than that, promote the best rejected networks; otherwise the
        # refill loop below could never pick a valid pair.
        shortfall = min(2, len(graded)) - sum(keep)
        for rank, kept in enumerate(keep):
            if shortfall <= 0:
                break
            if not kept:
                keep[rank] = True
                shortfall -= 1

        parents = [network for network, kept in zip(graded, keep) if kept]

        # Randomly mutate some of the networks we're keeping (in place).
        for individual in parents:
            if self.mutate_chance > random.random():
                self.mutate(individual)

        # Now find out how many spots we have left to fill.
        desired_length = len(pop) - len(parents)
        children = []

        # Add children, which are bred from two distinct parents.
        while len(children) < desired_length:
            male, female = random.sample(parents, 2)
            babies = self.breed(male, female)

            # Don't grow larger than desired length.
            children.extend(babies[:desired_length - len(children)])

        parents.extend(children)

        return parents
def get_mnist():
    """Retrieve the MNIST dataset and process the data.

    Images are flattened to 784 values each, converted to float32 and
    divided by 255; labels are converted to binary class matrices.

    Returns:
        A tuple (nb_classes, batch_size, input_shape, x_train, x_test,
        y_train, y_test).

    """
    # Defaults for this dataset.
    nb_classes, batch_size, input_shape = 10, 128, (784,)

    # Get the data.
    (train_images, train_labels), (test_images, test_labels) = \
        mnist.load_data()

    # Flatten every image into a single float32 row, then rescale.
    x_train = train_images.reshape(60000, 784).astype('float32')
    x_test = test_images.reshape(10000, 784).astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(train_labels, nb_classes)
    y_test = to_categorical(test_labels, nb_classes)

    return (nb_classes, batch_size, input_shape,
            x_train, x_test, y_train, y_test)
def train_and_score(network, dataset):
    """Train the model on the named dataset and return its test accuracy.

    Args:
        network (dict): the parameters of the network
        dataset (str): Dataset to use for training/evaluating; either
            'cifar10' or 'mnist'

    Returns:
        The accuracy entry (index 1) of the score returned by
        ``model.evaluate`` on the test split; index 0 is the loss.

    Raises:
        ValueError: If ``dataset`` is not a supported dataset name.

    """
    if dataset == 'cifar10':
        nb_classes, batch_size, input_shape, x_train, \
            x_test, y_train, y_test = get_cifar10()
    elif dataset == 'mnist':
        nb_classes, batch_size, input_shape, x_train, \
            x_test, y_train, y_test = get_mnist()
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # further down when the data variables are first used.
        raise ValueError(
            "Unsupported dataset: %r (expected 'cifar10' or 'mnist')"
            % (dataset,))

    model = compile_model(network, nb_classes, input_shape)

    # NOTE(review): the test split doubles as validation data for early
    # stopping, so the returned accuracy is not measured on data that was
    # held out from the stopping decision.
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=10000,  # using early stopping, so no real limit
              verbose=0,
              validation_data=(x_test, y_test),
              callbacks=[early_stopper])

    score = model.evaluate(x_test, y_test, verbose=0)

    return score[1]  # 1 is accuracy. 0 is loss.