├── .gitignore
├── LICENSE.md
├── README.md
├── __init__.py
├── brute.py
├── convert_to_records.py
├── images
    ├── Parallelism.png
    └── mnist_graph.png
├── main.py
├── mnist_multi_gpu_batching_train.py
├── mnist_multi_gpu_keras.py
├── mnist_multi_gpu_sonnet.py
├── multi_gpu.py
├── network.py
├── older
    ├── mnist_multi_gpu_eval.py
    ├── mnist_multi_gpu_train.py
    └── model.py
├── optimizer.py
└── train.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | /mnist_with_summaries.zip
 2 | /mnist_data.zip
 3 | /mnist_data/train-labels-idx1-ubyte.gz
 4 | /mnist_data/train-images-idx3-ubyte.gz
 5 | /mnist_data/t10k-labels-idx1-ubyte.gz
 6 | /mnist_data/t10k-images-idx3-ubyte.gz
 7 | /.idea
 8 | *.pyc
 9 | .DS_Store
10 | /logs/test
11 | /logs/train
12 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "{}"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright 2017 Norman Heckscher
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # MNIST Multi GPU with TensorFlow
  2 | A ConvNet for MNIST digit classification.
  3 | 
  4 | Multi GPU example with TensorFlow utilising local tower architecture for each GPU.
  5 | 
  6 | Several different examples are included, utilising both batching and direct feed.
  7 | 
  8 | Keras performs the best. It utilises the MultiGPU code from: https://github.com/kuza55/keras-extras
  9 | 
 10 | ## Training a Model Using Multiple GPU Cards
 11 | 
 12 | Modern workstations may contain multiple GPUs for scientific computation.
 13 | TensorFlow can leverage this environment to run the training operation
 14 | concurrently across multiple cards.
 15 | 
 16 | Training a model in a parallel, distributed fashion requires
 17 | coordinating training processes. For what follows we term *model replica*
 18 | to be one copy of a model training on a subset of data.
 19 | 
 20 | Naively employing asynchronous updates of model parameters
 21 | leads to sub-optimal training performance
 22 | because an individual model replica might be trained on a stale
 23 | copy of the model parameters. Conversely, employing fully synchronous
 24 | updates will be as slow as the slowest model replica.
 25 | 
 26 | In a workstation with multiple GPU cards, each GPU will have similar speed
 27 | and contain enough memory to run an entire MNIST model. Thus, we opt to
 28 | design our training system in the following manner:
 29 | 
 30 | * Place an individual model replica on each GPU.
 31 | * Update model parameters synchronously by waiting for all GPUs to finish
 32 | processing a batch of data.
 33 | 
 34 | Here is a diagram of this model:
 35 | 
 36 | <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
 37 |   <img style="width:100%" src="./images/Parallelism.png">
 38 | </div>
 39 | 
 40 | Note that each GPU computes inference as well as the gradients for a unique
 41 | batch of data. This setup effectively permits dividing up a larger batch
 42 | of data across the GPUs.
 43 | 
 44 | This setup requires that all GPUs share the model parameters. A well-known
 45 | fact is that transferring data to and from GPUs is quite slow. For this
 46 | reason, we decide to store and update all model parameters on the CPU (see
 47 | green box). A fresh set of model parameters is transferred to the GPU
 48 | when a new batch of data is processed by all GPUs.
 49 | 
 50 | The GPUs are synchronized in operation. All gradients are accumulated from
 51 | the GPUs and averaged (see green box). The model parameters are updated with
 52 | the gradients averaged across all model replicas.
 53 | 
 54 | ### Model Prediction
 55 | 
 56 | The prediction part of the model is constructed by the `inference()` function
 57 | which adds operations to compute the *logits* of the predictions. That part of
 58 | the model is organized as follows:
 59 | 
 60 | Layer Name | Description
 61 | --- | ---
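 62 | `conv1` | [convolution](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d) and [rectified linear](https://www.tensorflow.org/api_docs/python/tf/nn/relu) activation.
 63 | `pool1` | [max pooling](https://www.tensorflow.org/api_docs/python/tf/nn/max_pool).
 64 | `norm1` | [local response normalization](https://www.tensorflow.org/api_docs/python/tf/nn/local_response_normalization).
 65 | `conv2` | [convolution](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d) and [rectified linear](https://www.tensorflow.org/api_docs/python/tf/nn/relu) activation.
 66 | `norm2` | [local response normalization](https://www.tensorflow.org/api_docs/python/tf/nn/local_response_normalization).
 67 | `pool2` | [max pooling](https://www.tensorflow.org/api_docs/python/tf/nn/max_pool).
 68 | `local3` | [fully connected layer with rectified linear activation](https://www.tensorflow.org/api_docs/python/tf/nn).
 69 | `local4` | [fully connected layer with rectified linear activation](https://www.tensorflow.org/api_docs/python/tf/nn).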
 70 | `softmax_linear` | linear transformation to produce logits.
 71 | 
 72 | Here is a graph generated from TensorBoard describing the inference operation:
 73 | 
 74 | <center><div style="width:15%; margin:auto; margin-bottom:10px; margin-top:20px;">
 75 |   <img style="width:100%" src="./images/mnist_graph.png">
 76 | </div></center>
 77 | 
 78 | 
 79 | # Evolve a neural network with a genetic algorithm
 80 | 
 81 | Taken from https://github.com/harvitronix/neural-network-genetic-algorithm
 82 | 
 83 | `train.py`
 84 | `optimizer.py`
 85 | `network.py`
 86 | `main.py`
 87 | `brute.py`
 88 | 
 89 | This is an example of how we can use a genetic algorithm in an attempt to find the optimal network parameters for classification tasks.
 90 | 
 91 | It's currently limited to only MLPs (ie. fully connected networks) and uses the Keras library to build, train and validate.
 92 | 
 93 | On the easy MNIST dataset, we are able to quickly find a network that reaches > 98% accuracy. On the more challenging CIFAR10 dataset, we get to 56% after 10 generations (with population 20).
 94 | 
 95 | For more, see this blog post:
 96 | https://medium.com/@harvitronix/lets-evolve-a-neural-network-with-a-genetic-algorithm-code-included-8809bece164
 97 | 
 98 | ## To run
 99 | 
100 | To run the brute force algorithm:
101 | 
102 | ```python3 brute.py```
103 | 
104 | To run the genetic algorithm:
105 | 
106 | ```python3 main.py```
107 | 
108 | You can set your network parameter choices by editing each of those files first. You can also choose whether to use the MNIST or CIFAR10 datasets. Simply set `dataset` to either `mnist` or `cifar10`.
109 | 
110 | 
111 | # Contribution
112 | Your comments (issues) and PRs are always welcome.
113 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the 'License');
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================


--------------------------------------------------------------------------------
/brute.py:
--------------------------------------------------------------------------------
 1 | """Iterate over every combination of hyperparameters."""
 2 | import logging
 3 | from network import Network
 4 | from tqdm import tqdm
 5 | 
 6 | # Setup logging.
 7 | logging.basicConfig(
 8 |     format='%(asctime)s - %(levelname)s - %(message)s',
 9 |     datefmt='%m/%d/%Y %I:%M:%S %p',
10 |     level=logging.DEBUG,
11 |     filename='brute-log.txt'
12 | )
13 | 
def train_networks(networks, dataset):
    """Train each network, then report the five most accurate.

    Every network is trained on `dataset` and asked to print itself as soon
    as its training finishes. Afterwards the networks are ranked by their
    `accuracy` attribute (highest first) and the top five are passed to
    `print_networks`.

    Args:
        networks (list): Current population of networks
        dataset (str): Dataset to use for training/evaluating
    """
    # The context manager closes the progress bar even when a network's
    # training raises, so a failed run does not leave a dangling bar.
    with tqdm(total=len(networks)) as pbar:
        for network in networks:
            network.train(dataset)
            network.print_network()
            pbar.update(1)

    # Sort our final population.
    networks = sorted(networks, key=lambda x: x.accuracy, reverse=True)

    # Print out the top 5 networks.
    print_networks(networks[:5])
33 | 
def print_networks(networks):
    """Log a separator line, then let every network print itself.

    Args:
        networks (list): The networks to report, in the order given

    """
    separator = '-' * 80
    logging.info(separator)
    for candidate in networks:
        candidate.print_network()
44 | 
def generate_network_list(nn_param_choices):
    """Generate a list of all possible networks.

    Args:
        nn_param_choices (dict): The parameter choices. Must map each of
            'nb_neurons', 'nb_layers', 'activation' and 'optimizer' to the
            iterable of values to try.

    Returns:
        networks (list): One network object per combination, ordered with
            'nb_neurons' varying slowest and 'optimizer' varying fastest.

    Raises:
        KeyError: If one of the four parameter names is missing.

    """
    # Local import: nothing else in this module needs itertools.
    import itertools

    keys = ('nb_neurons', 'nb_layers', 'activation', 'optimizer')

    networks = []
    # product() walks the cartesian product with the last key changing
    # fastest, which matches a four-deep nested loop in `keys` order.
    for combination in itertools.product(*(nn_param_choices[key]
                                           for key in keys)):
        # Instantiate a network object with set parameters.
        network_obj = Network()
        network_obj.create_set(dict(zip(keys, combination)))
        networks.append(network_obj)

    return networks
78 | 
def main():
    """Train every hyperparameter combination on the 'cifar10' dataset."""
    search_space = {
        'nb_neurons': [64, 128, 256, 512, 768, 1024],
        'nb_layers': [1, 2, 3, 4],
        'activation': ['relu', 'elu', 'tanh', 'sigmoid'],
        'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad',
                      'adadelta', 'adamax', 'nadam'],
    }

    logging.info("***Brute forcing networks***")

    # Build one network per combination, then train and rank them all.
    candidates = generate_network_list(search_space)
    train_networks(candidates, 'cifar10')


if __name__ == '__main__':
    main()
99 | 


--------------------------------------------------------------------------------
/convert_to_records.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """Converts MNIST data to TFRecords file format with Example protos."""
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | import argparse
22 | import os
23 | import sys
24 | 
25 | import tensorflow as tf
26 | 
27 | from tensorflow.contrib.learn.python.learn.datasets import mnist
28 | 
29 | FLAGS = None
30 | 
31 | 
def _int64_feature(value):
  """Wraps a single value in a tf.train.Feature backed by an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)
34 | 
35 | 
def _bytes_feature(value):
  """Wraps a single value in a tf.train.Feature backed by a BytesList."""
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)
38 | 
39 | 
def convert_to(data_set, name):
  """Converts a dataset to tfrecords.

  Writes one tf.train.Example per image to
  `<FLAGS.directory>/<name>.tfrecords`. Each Example stores the image
  dimensions ('height', 'width', 'depth'), the integer 'label' and the raw
  image bytes ('image_raw').

  Args:
    data_set: Object exposing `images`, `labels` and `num_examples`.
      `images` is indexed as (example, rows, cols, depth) and is assumed to
      be a NumPy array -- TODO confirm against mnist.read_data_sets.
    name: Base name of the output file, without the '.tfrecords' suffix.

  Raises:
    ValueError: If the number of images differs from `num_examples`.
  """
  images = data_set.images
  labels = data_set.labels
  num_examples = data_set.num_examples

  if images.shape[0] != num_examples:
    raise ValueError('Images size %d does not match label size %d.' %
                     (images.shape[0], num_examples))
  rows = images.shape[1]
  cols = images.shape[2]
  depth = images.shape[3]

  filename = os.path.join(FLAGS.directory, name + '.tfrecords')
  print('Writing', filename)
  writer = tf.python_io.TFRecordWriter(filename)
  # try/finally so the output file is flushed and closed even when building
  # or writing an Example fails part-way through.
  try:
    for index in range(num_examples):
      # tobytes() is the supported spelling of the deprecated tostring().
      image_raw = images[index].tobytes()
      example = tf.train.Example(features=tf.train.Features(feature={
          'height': _int64_feature(rows),
          'width': _int64_feature(cols),
          'depth': _int64_feature(depth),
          'label': _int64_feature(int(labels[index])),
          'image_raw': _bytes_feature(image_raw)}))
      writer.write(example.SerializeToString())
  finally:
    writer.close()
66 | 
67 | 
def main(unused_argv):
  """Reads the MNIST data sets and writes each split as a TFRecords file.

  Args:
    unused_argv: Ignored.
  """
  data_sets = mnist.read_data_sets(
      FLAGS.directory,
      validation_size=FLAGS.validation_size,
      reshape=False,
      dtype=tf.uint8)

  # One output file per split, named after the split.
  for split in ('train', 'validation', 'test'):
    convert_to(getattr(data_sets, split), split)
79 | 
80 | 
if __name__ == '__main__':
  # Options recognised here are parsed into FLAGS; everything else is handed
  # on to tf.app.run together with the program name.
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      '--directory',
      default='/home/norman/MNIST_data',
      type=str,
      help='Directory to download data files and write the converted result')
  arg_parser.add_argument(
      '--validation_size',
      default=5000,
      type=int,
      help="""\
      Number of examples to separate from the training data for the validation
      set.\
      """)
  FLAGS, passthrough_args = arg_parser.parse_known_args()
  tf.app.run(main=main, argv=[sys.argv[0]] + passthrough_args)


--------------------------------------------------------------------------------
/images/Parallelism.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/normanheckscher/mnist-multi-gpu/3bbd69d852c9029bd3f86ca83786d33b15a54a8d/images/Parallelism.png


--------------------------------------------------------------------------------
/images/mnist_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/normanheckscher/mnist-multi-gpu/3bbd69d852c9029bd3f86ca83786d33b15a54a8d/images/mnist_graph.png


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | """Entry point to evolving the neural network. Start here."""
  2 | import logging
  3 | from optimizer import Optimizer
  4 | from tqdm import tqdm
  5 | 
  6 | # Setup logging.
  7 | logging.basicConfig(
  8 |     format='%(asctime)s - %(levelname)s - %(message)s',
  9 |     datefmt='%m/%d/%Y %I:%M:%S %p',
 10 |     level=logging.DEBUG,
 11 |     filename='log.txt'
 12 | )
 13 | 
 14 | def train_networks(networks, dataset):
 15 |     """Train each network.
 16 | 
 17 |     Args:
 18 |         networks (list): Current population of networks
 19 |         dataset (str): Dataset to use for training/evaluating
 20 |     """
 21 |     pbar = tqdm(total=len(networks))
 22 |     for network in networks:
 23 |         network.train(dataset)
 24 |         pbar.update(1)
 25 |     pbar.close()
 26 | 
 27 | def get_average_accuracy(networks):
 28 |     """Get the average accuracy for a group of networks.
 29 | 
 30 |     Args:
 31 |         networks (list): List of networks
 32 | 
 33 |     Returns:
 34 |         float: The average accuracy of a population of networks.
 35 | 
 36 |     """
 37 |     total_accuracy = 0
 38 |     for network in networks:
 39 |         total_accuracy += network.accuracy
 40 | 
 41 |     return total_accuracy / len(networks)
 42 | 
 43 | def generate(generations, population, nn_param_choices, dataset):
 44 |     """Generate a network with the genetic algorithm.
 45 | 
 46 |     Args:
 47 |         generations (int): Number of times to evole the population
 48 |         population (int): Number of networks in each generation
 49 |         nn_param_choices (dict): Parameter choices for networks
 50 |         dataset (str): Dataset to use for training/evaluating
 51 | 
 52 |     """
 53 |     optimizer = Optimizer(nn_param_choices)
 54 |     networks = optimizer.create_population(population)
 55 | 
 56 |     # Evolve the generation.
 57 |     for i in range(generations):
 58 |         logging.info("***Doing generation %d of %d***" %
 59 |                      (i + 1, generations))
 60 | 
 61 |         # Train and get accuracy for networks.
 62 |         train_networks(networks, dataset)
 63 | 
 64 |         # Get the average accuracy for this generation.
 65 |         average_accuracy = get_average_accuracy(networks)
 66 | 
 67 |         # Print out the average accuracy each generation.
 68 |         logging.info("Generation average: %.2f%%" % (average_accuracy * 100))
 69 |         logging.info('-'*80)
 70 | 
 71 |         # Evolve, except on the last iteration.
 72 |         if i != generations - 1:
 73 |             # Do the evolution.
 74 |             networks = optimizer.evolve(networks)
 75 | 
 76 |     # Sort our final population.
 77 |     networks = sorted(networks, key=lambda x: x.accuracy, reverse=True)
 78 | 
 79 |     # Print out the top 5 networks.
 80 |     print_networks(networks[:5])
 81 | 
 82 | def print_networks(networks):
 83 |     """Print a list of networks.
 84 | 
 85 |     Args:
 86 |         networks (list): The population of networks
 87 | 
 88 |     """
 89 |     logging.info('-'*80)
 90 |     for network in networks:
 91 |         network.print_network()
 92 | 
 93 | def main():
 94 |     """Evolve a network."""
 95 |     generations = 10  # Number of times to evole the population.
 96 |     population = 20  # Number of networks in each generation.
 97 |     dataset = 'mnist'
 98 | 
 99 |     nn_param_choices = {
100 |         'nb_neurons': [64, 128, 256, 512, 768, 1024],
101 |         'nb_layers': [1, 2, 3, 4],
102 |         'activation': ['relu', 'elu', 'tanh', 'sigmoid'],
103 |         'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad',
104 |                       'adadelta', 'adamax', 'nadam'],
105 |     }
106 | 
107 |     logging.info("***Evolving %d generations with population %d***" %
108 |                  (generations, population))
109 | 
110 |     generate(generations, population, nn_param_choices, dataset)
111 | 
112 | if __name__ == '__main__':
113 |     main()
114 | 


--------------------------------------------------------------------------------
/mnist_multi_gpu_batching_train.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the 'License');
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an 'AS IS' BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """A binary to train MNIST using multiple GPU's with synchronous updates.
 16 | 
 17 | Accuracy:
 18 | Should achieve ~99.2% accuracy after 20K steps, unfortunately it's not at
 19 | the moment.
 20 | 
 21 | Speed: With batch_size 50.
 22 | 
 23 | System        | Step Time (sec/batch)  | Accuracy
 24 | -------------------------------------------------------------------------
 25 | 1 GTX 1080    | 0.018-0.022            | ~xx.xx% at 20K steps  (x hours)
 26 | 2 GTX 1080    | 0.012-0.015            | ~xx.xx% at 20K steps  (x hours)
 27 | 
 28 | Usage:
 29 | Please see the TensorFlow website for how to download the MNIST
 30 | data set, compile and train models.
 31 | 
 32 | """
 33 | 
 34 | from __future__ import absolute_import
 35 | from __future__ import division
 36 | from __future__ import print_function
 37 | 
 38 | import os.path
 39 | import re
 40 | import time
 41 | import numpy as np
 42 | from datetime import datetime
 43 | 
 44 | from tensorflow.examples.tutorials.mnist import input_data
 45 | 
 46 | import tensorflow as tf
 47 | 
# Constants used for dealing with the files, matches convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
# If a model is trained with multiple GPUs, prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'
# Length of one flattened image vector; inference() reshapes it to 28x28x1.
IMAGE_PIXELS = 784

# Constants describing the training process.
# NOTE: MOVING_AVERAGE_DECAY is not referenced anywhere else in this file.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.

# Global constants describing the MNIST data set.
# NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN only feeds the learning-rate decay schedule
# in train().
# NOTE(review): the run logs below report 8593 steps for 10 epochs at batch
# size 64 (~55,000 examples per epoch) -- confirm 50000 matches
# train.tfrecords.
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
# NOTE: NUM_EXAMPLES_PER_EPOCH_FOR_EVAL is not referenced anywhere else in
# this file.
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000

# Command-line flags. The data_dir / train_dir defaults are absolute paths on
# the author's machine and normally need to be overridden.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('batch_size', 64,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
                           """Path to the MNIST data directory.""")
tf.app.flags.DEFINE_string('train_dir', '/home/norman/MNIST_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 2,
                            """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_boolean('tb_logging', False,
                            """Whether to log to Tensorboard.""")
tf.app.flags.DEFINE_integer('num_epochs', 10,
                            """Number of epochs to run trainer.""")
 84 | # 17/4/17
 85 | # 1 gpu
 86 | # Done training for 20 epochs, 22000 steps.
 87 | # Total Duration (474.817 sec)
 88 | # 2017-04-17 15:24:51.190879: precision = 9743.000
 89 | # Done training for 20 epochs, 22000 steps.
 90 | # Total Duration (497.690 sec)
 91 | # 2017-04-17 15:35:10.070366: precision = 9305.000
 92 | # 2 gpu
 93 | # Done training for 20 epochs, 22000 steps.
 94 | # Total Duration (687.583 sec)
 95 | # 2017-04-17 15:14:28.793936: precision = 9472.000
 96 | # Done training for 20 epochs, 22000 steps.
 97 | # Total Duration (672.720 sec)
 98 | # 2017-04-17 15:52:16.096935: precision = 9672.000
 99 | # 17/4/17
100 | 
101 | # 18/4/17
102 | # 2 GPU
103 | # Done training for 10 epochs, 8593 steps.
104 | # Total Duration (339.430 sec)
105 | # 2017-04-18 10:50:53.269983: precision = 9677.000
106 | # Done training for 10 epochs, 8593 steps.
107 | # Total Duration (335.611 sec)
108 | # 2017-04-18 11:14:26.685982: precision = 9674.000
109 | # Done training for 10 epochs, 8593 steps.
110 | # Total Duration (349.731 sec)
111 | # 2017-04-18 12:48:15.148828: precision = 9267.000
112 | # Done training for 10 epochs, 8593 steps.
113 | # Total Duration (350.593 sec)
114 | # 2017-04-18 13:14:51.974247: precision = 9270.000
115 | # Done training for 10 epochs, 8593 steps.
116 | # Total Duration (361.926 sec)
117 | # 2017-04-18 13:58:02.775474: precision = 9507.000
118 | # Done training for 10 epochs, 8593 steps.
119 | # Total Duration (346.119 sec)
120 | # 2017-04-18 14:46:51.579685: precision = 9471.000
121 | # Done training for 10 epochs, 8593 steps.
122 | # Total Duration (334.561 sec)
123 | # 2017-04-18 14:58:06.942195: precision = 9781.000
124 | 
125 | # 1 GPU
126 | # Done training for 10 epochs, 8593 steps.
127 | # Total Duration (238.033 sec)
128 | # 2017-04-18 11:02:06.403359: precision = 9679.000
129 | # Done training for 10 epochs, 8593 steps.
130 | # Total Duration (256.169 sec)
131 | # 2017-04-18 11:20:54.328206: precision = 9362.000
132 | # Done training for 10 epochs, 8593 steps.
133 | # Total Duration (257.144 sec)
134 | # 2017-04-18 12:30:53.954074: precision = 8989.000
135 | # Done training for 10 epochs, 8593 steps.
136 | # Total Duration (250.306 sec)
137 | # 2017-04-18 12:40:26.649277: precision = 9512.000
138 | # Done training for 10 epochs, 8593 steps.
139 | # Total Duration (257.795 sec)
140 | # 2017-04-18 13:22:48.300705: precision = 9692.000
141 | # Done training for 10 epochs, 8593 steps.
142 | # Total Duration (254.077 sec)
143 | # 2017-04-18 13:35:26.700627: precision = 9391.000
144 | # Done training for 10 epochs, 8593 steps.
145 | # Total Duration (253.215 sec)
146 | # 2017-04-18 13:41:46.708623: precision = 9734.000
147 | 
148 | 
149 | 
def read_and_decode(filename_queue):
    """Read one serialized example from the queue and decode it.

    Args:
      filename_queue: Queue of TFRecord file names to read from.

    Returns:
      A tuple (image, label), where:
      * image is a float32 tensor of shape [IMAGE_PIXELS] with values in
        [-0.5, 0.5].
      * label is an int32 scalar tensor.
    """
    _, serialized = tf.TFRecordReader().read(filename_queue)

    # Both keys are required, so no default values are given.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized, features=feature_spec)

    # The raw byte string becomes a uint8 vector whose static shape is pinned
    # to IMAGE_PIXELS. No reshape to 28x28 happens here: there are no
    # distortions to apply and the consumer expects a flat vector.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([IMAGE_PIXELS])

    # Rescale [0, 255] -> [-0.5, 0.5] floats.
    image = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5

    # The label is parsed as int64; hand it on as an int32 scalar.
    label = tf.cast(parsed['label'], tf.int32)

    return image, label
179 | 
def inputs(train, batch_size, num_epochs):
    """Build a shuffled, batched input pipeline over a TFRecord file.

    Args:
      train: True to read the training file, False for the validation file.
      batch_size: Number of examples per returned batch.
      num_epochs: Number of passes over the input data; 0 or None means
        read forever.

    Returns:
      A tuple (images, labels), where:
      * images is a float tensor with shape [batch_size, IMAGE_PIXELS]
        in the range [-0.5, 0.5].
      * labels is an int32 tensor with shape [batch_size] holding the true
        labels.
      Queue runners are added to the graph as a side effect; they must be
      started, e.g. with tf.train.start_queue_runners().
    """
    # A falsy epoch count means "no limit" for the filename queue.
    epochs = num_epochs or None

    if train:
        record_file = TRAIN_FILE
    else:
        record_file = VALIDATION_FILE
    record_path = os.path.join(FLAGS.data_dir, record_file)

    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            [record_path], num_epochs=epochs)

        # A single filename queue is shared by all reader threads.
        example, target = read_and_decode(filename_queue)

        # Shuffle the examples and group them into batches (internally a
        # RandomShuffleQueue). Two threads keep the queue from becoming a
        # bottleneck, and min_after_dequeue guarantees a minimum amount of
        # mixing between examples.
        min_queued = 1000
        images, sparse_labels = tf.train.shuffle_batch(
            [example, target],
            batch_size=batch_size,
            num_threads=2,
            capacity=min_queued + 3 * batch_size,
            min_after_dequeue=min_queued)

        return images, sparse_labels
218 | 
def inference(images):
    """Build the MNIST model.

    Layer order: conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 (1024 units) -> local4 (10 units) -> softmax_linear (10 logits).

    All variables are created with tf.get_variable() (via the helpers
    below) rather than tf.Variable() so they can be shared across the
    towers of a multi-GPU run.

    Args:
      images: Images returned from MNIST or inputs(); must be reshapeable
        to [-1, 28, 28, 1].

    Returns:
      Logits: the unscaled output of the softmax_linear layer.
    """
    # Reshape to use within a convolutional neural net.
    # Last dimension is for "features" - there is only one here, since images
    # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
    x_image = tf.reshape(images, [-1, 28, 28, 1])

    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 1, 32],
                                             stddev=5e-2,
                                             wd=0.0)
        biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1: 2x2 max pooling with stride 2.
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 32, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, strides=[1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2 (previously mis-named 'norm1', which made TensorFlow silently
    # rename the op to avoid a clash with the first normalization layer).
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')

    # pool2: 2x2 max pooling with stride 2.
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        # Two stride-2 poolings shrink 28x28 to 7x7, with 64 channels.
        reshape = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 1024],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [1024],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4
    # NOTE(review): this layer narrows to 10 ReLU units before the 10x10
    # softmax layer below, and that layer's stddev of 1/192 looks inherited
    # from a 192-unit design -- confirm the bottleneck is intended.
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[1024, 10],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)
        _activation_summary(local4)

    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
    # and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [10, 10],
                                              stddev=1 / 192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
316 | 
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal initialized variable, optionally regularized.

    When `wd` is not None, an L2 penalty (l2_loss of the variable multiplied
    by `wd`) is added to the 'losses' collection. Note that wd=0.0 still
    registers a penalty term, whose value is zero.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of the truncated Gaussian initializer
      wd: L2 weight-decay coefficient, or None to add no penalty.

    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)
    if wd is None:
        return var

    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
342 | 
def _variable_on_cpu(name, shape, initializer):
    """Get or create a float32 variable placed on '/cpu:0'.

    The variable goes through tf.get_variable(), so it takes part in the
    enclosing variable scope's sharing rules.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable

    Returns:
      Variable Tensor
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer,
                               dtype=tf.float32)
358 | 
def _activation_summary(x):
    """Attach TensorBoard summaries to an activation tensor.

    Does nothing unless FLAGS.tb_logging is set. Otherwise adds a histogram
    of the activations and a scalar with the fraction of zeros (sparsity).

    Args:
      x: Tensor
    Returns:
      nothing
    """
    if not FLAGS.tb_logging:
        return

    # Drop the 'tower_N/' prefix so every tower reports under the same
    # name, which keeps the TensorBoard display readable.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    tensor_name = re.sub(tower_prefix, '', x.op.name)

    tf.summary.histogram(tensor_name + '/activations', x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar(tensor_name + '/sparsity', sparsity)
377 | 
def loss(logits, labels):
    """Register the cross-entropy loss and return the summed 'losses'.

    The batch-mean cross entropy is added to the 'losses' collection. The
    returned tensor sums every entry in that collection, i.e. the cross
    entropy plus the weight-decay (L2) terms added when the variables were
    created.

    Args:
      logits: Logits from inference().
      labels: Labels from inputs(). 1-D tensor of shape [batch_size]

    Returns:
      Loss tensor of type float.
    """
    # Per-example cross entropy, averaged across the batch.
    int_labels = tf.cast(labels, tf.int64)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=int_labels, logits=logits, name='cross_entropy_per_example')
    batch_mean = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', batch_mean)

    # Total loss = cross entropy + all weight decay terms.
    all_losses = tf.get_collection('losses')
    return tf.add_n(all_losses, name='total_loss')
400 | 
def tower_loss(scope):
    """Build one tower of the MNIST model and return its total loss.

    Each call creates its own training input pipeline (batch size and epoch
    count are taken from FLAGS), runs inference on it and sums the losses
    registered under `scope`.

    Args:
      scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                            num_epochs=FLAGS.num_epochs)
    logits = inference(images)

    # Called only for its side effect of registering the cross entropy in
    # the 'losses' collection: the sum it returns covers the whole
    # collection, so the per-tower total is assembled separately below.
    loss(logits, labels)

    # Only the losses that belong to the current tower.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    if not FLAGS.tb_logging:
        return total_loss

    # One scalar summary per individual loss plus the total. The
    # 'tower_N/' prefix is stripped so the names read the same for every
    # tower on TensorBoard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    for tensor in tower_losses + [total_loss]:
        summary_name = re.sub(tower_prefix, '', tensor.op.name)
        tf.summary.scalar(summary_name, tensor)

    return total_loss
437 | 
def average_gradients(tower_grads):
    """Average each shared variable's gradient across all towers.

    This is the synchronization point between the towers.

    Args:
      tower_grads: List with one entry per tower; each entry is that
        tower's list of (gradient, variable) tuples, in the same variable
        order for every tower.
    Returns:
       List of (gradient, variable) pairs where the gradient has been
       averaged across all towers.
    """
    averaged = []
    # zip(*...) regroups the data per variable:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_variable in zip(*tower_grads):
        # Give each gradient a leading 'tower' axis, join along it, then
        # take the mean over that axis.
        with_tower_axis = [tf.expand_dims(g, 0) for g, _ in per_variable]
        joined = tf.concat(with_tower_axis, 0)
        mean_grad = tf.reduce_mean(joined, 0)

        # The variable is shared by all towers, so the first tower's
        # reference stands for all of them.
        shared_var = per_variable[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged
474 | 
def train():
    """Train the MNIST model on FLAGS.num_gpus towers with synchronous updates.

    Builds one model tower per GPU (variables are shared between towers),
    averages the tower gradients and applies them with a Nesterov momentum
    optimizer whose learning rate decays exponentially. The loop runs until
    the input queues raise OutOfRangeError, then the final weights are
    checkpointed so a following evaluate() scores the finished model.

    Side effects: writes checkpoints and, when FLAGS.tb_logging is set,
    summaries to FLAGS.train_dir, and prints progress to stdout.

    Raises:
      AssertionError: if the loss becomes NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter; it is passed to apply_gradients() below and drives
        # the learning-rate decay.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        opt = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True,
                                         use_locking=True)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # range() rather than the Python 2-only xrange() so the script
            # also runs on Python 3.
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                                    '%s_%d' % (TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST model.
                        # This function constructs the entire MNIST model but
                        # shares the variables across all towers.
                        loss = tower_loss(scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this
                        # MNIST tower.
                        grads = opt.compute_gradients(loss, gate_gradients=0)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add histograms for gradients.
        if FLAGS.tb_logging:
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients', grad))
            # Add a summary to track the learning rate.
            summaries.append(tf.summary.scalar('learning_rate', lr))

        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if FLAGS.tb_logging:
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), sharded=True)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # The op for initializing the variables. Local variables are
        # included as well: the epoch-limited input queues built by inputs()
        # are fed by tf.train.string_input_producer(num_epochs=...).
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one training step. The result of `train_op` is
                # discarded; `loss` is the loss of the last tower built.
                _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Print an overview fairly often.
                if step % 100 == 0:
                    num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))
                if FLAGS.tb_logging:
                    if step % 10 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically. The final
                # checkpoint is written once the input is exhausted (below),
                # because the last step number is not known in advance.
                if step % 1000 == 0:
                    saver.save(sess, checkpoint_path, global_step=step)

                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' % (
                FLAGS.num_epochs, step))
            # Persist the fully trained weights; otherwise evaluate() would
            # restore the last periodic checkpoint, which can be up to 999
            # steps old.
            saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        # Flush pending events to disk before shutting down.
        summary_writer.close()
        sess.close()
621 | 
def evaluate():
    """Evaluate the latest checkpoint on the MNIST test set.

    Restores the most recent checkpoint found in FLAGS.train_dir, runs the
    whole test set through the model in a single batch and prints the
    fraction of correctly classified images together with the raw counts.
    Prints a message and returns early when no checkpoint is found.
    """
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
        # read_data_sets() scales pixels to [0, 1] floats by default, whereas
        # the training pipeline (read_and_decode) feeds [-0.5, 0.5]. Shift
        # the test images so the model sees the value range it was trained
        # on.
        images = mnist.test.images - 0.5
        labels = mnist.test.labels

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = inference(images)

        # True for each example whose label is the top-1 prediction.
        top_k_op = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('No checkpoint file found')
                return

            true_count = int(np.sum(sess.run([top_k_op])))
            total_count = int(labels.shape[0])

            # Report precision as a fraction of the test set rather than as
            # the raw number of correct predictions.
            precision = float(true_count) / total_count
            print('%s: precision = %.3f (%d/%d correct)' % (
                datetime.now(), precision, true_count, total_count))
653 | 
def main(argv=None):  # pylint: disable=unused-argument
    """Train the model, print the wall-clock training time, then evaluate."""
    started_at = time.time()
    train()
    elapsed = time.time() - started_at
    print('Total Duration (%.3f sec)' % elapsed)
    evaluate()
660 | 
# Script entry point: hand control to tf.app.run() when executed directly.
if __name__ == '__main__':
    tf.app.run()
663 | 


--------------------------------------------------------------------------------
/mnist_multi_gpu_keras.py:
--------------------------------------------------------------------------------
  1 | '''Trains a simple convnet on the MNIST dataset.
  2 | Gets to 99.25% test accuracy after 12 epochs
  3 | (there is still a lot of margin for parameter tuning).
  4 | 16 seconds per epoch on a GRID K520 GPU.
  5 | '''
  6 | 
  7 | from __future__ import print_function
  8 | import numpy as np
  9 | np.random.seed(1337)  # for reproducibility
 10 | 
 11 | from tensorflow.contrib.keras.api.keras.datasets import mnist
 12 | from keras.models import Sequential
 13 | from keras.layers import Dense, Dropout, Activation, Flatten
 14 | from keras.layers import Convolution2D, MaxPooling2D, BatchNormalization
 15 | from keras.utils import np_utils
 16 | from keras import backend as K
 17 | from keras.callbacks import TensorBoard, ModelCheckpoint
 18 | tensorboard = TensorBoard(log_dir='/home/norman/MNIST_train', histogram_freq=1,
 19 |                           write_graph=True, write_images=False, embeddings_freq=1)
 20 | import time
 21 | import argparse
 22 | parser = argparse.ArgumentParser()
 23 | parser.add_argument('--extras', help='(absolute) path to keras-extras')
 24 | parser.add_argument('--gpus', help='number of GPUs')
 25 | parser.print_help()
 26 | args = parser.parse_args()
 27 | 
 28 | import sys
 29 | sys.path.append(args.extras)
 30 | 
 31 | from multi_gpu import make_parallel
 32 | 
 33 | #ngpus = int(args.gpus)
 34 | ngpus = int(2)
 35 | print("Using %i GPUs" %ngpus)
 36 | 
 37 | batch_size = 128
 38 | nb_classes = 10
 39 | nb_epoch = 12
 40 | 
 41 | # input image dimensions
 42 | img_rows, img_cols = 28, 28
 43 | # number of convolutional filters to use
 44 | nb_filters = 32
 45 | # size of pooling area for max pooling
 46 | pool_size = (2, 2)
 47 | # convolution kernel size
 48 | kernel_size = (3, 3)
 49 | 
 50 | # the data, shuffled and split between train and test sets
 51 | (X_train, y_train), (X_test, y_test) = mnist.load_data()
 52 | 
 53 | if K.image_dim_ordering() == 'th':
 54 |     X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
 55 |     X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
 56 |     input_shape = (1, img_rows, img_cols)
 57 | else:
 58 |     X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
 59 |     X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
 60 |     input_shape = (img_rows, img_cols, 1)
 61 | 
 62 | X_train = X_train.astype('float32')
 63 | X_test = X_test.astype('float32')
 64 | X_train /= 255
 65 | X_test /= 255
 66 | print('X_train shape:', X_train.shape)
 67 | print(X_train.shape[0], 'train samples')
 68 | print(X_test.shape[0], 'test samples')
 69 | 
 70 | # convert class vectors to binary class matrices
 71 | Y_train = np_utils.to_categorical(y_train, nb_classes)
 72 | Y_test = np_utils.to_categorical(y_test, nb_classes)
 73 | 
 74 | model = Sequential()
 75 | 
 76 | model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]),
 77 |                         padding='valid',
 78 |                         input_shape=input_shape))
 79 | model.add(Activation('relu'))
 80 | model.add(Convolution2D(256, (kernel_size[0], kernel_size[1])))
 81 | model.add(Activation('relu'))
 82 | model.add(BatchNormalization())
 83 | model.add(Convolution2D(128, (kernel_size[0], kernel_size[1])))
 84 | model.add(Activation('relu'))
 85 | model.add(MaxPooling2D(pool_size=pool_size))
 86 | model.add(Dropout(0.25))
 87 | model.add(Flatten())
 88 | model.add(Dense(128))
 89 | model.add(Activation('relu'))
 90 | model.add(Dropout(0.5))
 91 | model.add(Dense(nb_classes))
 92 | model.add(Activation('softmax'))
 93 | 
 94 | if ngpus > 1:
 95 |     model = make_parallel(model,ngpus)
 96 | 
 97 | model.compile(loss='categorical_crossentropy',
 98 |               optimizer='adadelta',
 99 |               metrics=['accuracy'])
100 | 
# Time the whole fit + evaluate run.
start_time = time.time()

# The global batch is scaled by the GPU count (batch_size * ngpus).
# Append callbacks=[tensorboard] to the call below to enable TensorBoard logs.
model.fit(
    X_train, Y_train,
    batch_size=batch_size * ngpus,
    epochs=nb_epoch,
    verbose=1,
    validation_data=(X_test, Y_test))

# With metrics=['accuracy'] the result is [loss, accuracy].
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

duration = time.time() - start_time
print('Total Duration (%.3f sec)' % duration)
109 | 


--------------------------------------------------------------------------------
/mnist_multi_gpu_sonnet.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the 'License');
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an 'AS IS' BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """A binary to train MNIST using multiple GPU's with synchronous updates.
 16 | 
 17 | Accuracy:
 18 | 
 19 | Done training with 2 GPUs, for 20 epochs, 11000 steps.
 20 | Total Duration (327.396 sec)
 21 | 2017-04-21 20:46:18.466392: precision = 9848.000
 22 | Done training with 1 GPUs, for 20 epochs, 22000 steps.
 23 | Total Duration (500.122 sec)
 24 | 2017-04-21 20:56:40.639580: precision = 9884.000
 25 | 
 26 | Speed: With batch_size 50.
 27 | 
 28 | System        | Step Time (sec/batch)  | Accuracy
 29 | -------------------------------------------------------------------------
 30 | 1 GTX 1080    | 258.136 sec            | ~94.58% at 11K steps
 31 | 2 GTX 1080    | 189.572 sec            | ~94.59% at 11K steps
 32 | 
 33 | Usage:
 34 | Please see the TensorFlow website for how to download the MNIST
 35 | data set, compile and train models.
 36 | 
 37 | """
 38 | 
 39 | from __future__ import absolute_import
 40 | from __future__ import division
 41 | from __future__ import print_function
 42 | 
 43 | import os.path
 44 | import re
 45 | import time
 46 | import numpy as np
 47 | from datetime import datetime
 48 | 
 49 | from tensorflow.examples.tutorials.mnist import input_data
 50 | 
 51 | import tensorflow as tf
 52 | import sonnet as snt
 53 | 
# Names of the TFRecord files read by inputs(); they are looked up inside
# FLAGS.data_dir and are meant to match the output of convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
# If a model is trained with multiple GPUs, prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'
# Length of one flattened image; custom_build() reshapes it to 28 x 28 x 1.
IMAGE_PIXELS = 784

# Constants describing the training process.
# NOTE(review): MOVING_AVERAGE_DECAY is not referenced anywhere else in this
# file.
MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 20.0  # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.

# Global constants describing the MNIST data set.
# NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN only feeds the learning-rate schedule in
# train(); NUM_EXAMPLES_PER_EPOCH_FOR_EVAL is not referenced in this file.
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000

FLAGS = tf.app.flags.FLAGS

# Command-line flags; each value is read through FLAGS.<name>.
tf.app.flags.DEFINE_integer('batch_size', 50,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
                           """Path to the MNIST data directory.""")
tf.app.flags.DEFINE_string('train_dir', '/home/norman/MNIST_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 2,
                            """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_boolean('tb_logging', False,
                            """Whether to log to Tensorboard.""")
tf.app.flags.DEFINE_integer('num_epochs', 20,
                            """Number of epochs to run trainer.""")
 90 | 
 91 | 
 92 | def read_and_decode(filename_queue):
 93 |     reader = tf.TFRecordReader()
 94 |     _, serialized_example = reader.read(filename_queue)
 95 |     features = tf.parse_single_example(
 96 |         serialized_example,
 97 |         # Defaults are not specified since both keys are required.
 98 |         features={
 99 |             'image_raw': tf.FixedLenFeature([], tf.string),
100 |             'label': tf.FixedLenFeature([], tf.int64),
101 |         })
102 | 
103 |     # Convert from a scalar string tensor (whose single string has
104 |     # length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
105 |     # [mnist.IMAGE_PIXELS].
106 |     image = tf.decode_raw(features['image_raw'], tf.uint8)
107 |     image.set_shape([IMAGE_PIXELS])
108 | 
109 |     # OPTIONAL: Could reshape into a 28x28 image and apply distortions
110 |     # here.  Since we are not applying any distortions in this
111 |     # example, and the next step expects the image to be flattened
112 |     # into a vector, we don't bother.
113 | 
114 |     # Convert from [0, 255] -> [-0.5, 0.5] floats.
115 |     image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
116 | 
117 |     # Convert label from a scalar uint8 tensor to an int32 scalar.
118 |     label = tf.cast(features['label'], tf.int32)
119 | 
120 |     return image, label
121 | 
122 | 
def inputs(train, batch_size, num_epochs):
    """Build a shuffled batch pipeline over the training or validation file.

    Args:
      train: True to read TRAIN_FILE, False to read VALIDATION_FILE.
      batch_size: Number of examples per returned batch.
      num_epochs: How many times to read the file; 0 or None means no limit.

    Returns:
      A tuple (images, labels):
      * images is a float tensor of shape [batch_size, IMAGE_PIXELS] with
        values in [-0.5, 0.5].
      * labels is an int32 tensor of shape [batch_size].
      Queue runners are added to the graph and must be started, e.g. with
      tf.train.start_queue_runners().
    """
    # Any falsy epoch count (0, None) disables the epoch limit.
    epoch_limit = num_epochs if num_epochs else None
    record_file = TRAIN_FILE if train else VALIDATION_FILE
    filename = os.path.join(FLAGS.data_dir, record_file)

    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            [filename], num_epochs=epoch_limit)

        # A single reader feeds the shuffling queue below.
        example_image, example_label = read_and_decode(filename_queue)

        # Two threads fill the shuffle queue so it does not become a
        # bottleneck; min_after_dequeue guarantees a minimum amount of mixing.
        shuffle_floor = 1000
        images, sparse_labels = tf.train.shuffle_batch(
            [example_image, example_label],
            batch_size=batch_size,
            num_threads=2,
            capacity=shuffle_floor + 3 * batch_size,
            min_after_dequeue=shuffle_floor)

        return images, sparse_labels
161 | 
162 | 
def custom_build(inputs, is_training, keep_prob):
    """A custom build method to wrap into a Sonnet Module.

    The input is reshaped to [-1, 28, 28, 1] and passed through two
    stride-2 convolutions (32 and 64 channels, 4x4 kernels), each followed by
    batch norm, ReLU and 2x2 max pooling, then a 1x1 convolution with 1024
    channels (batch norm + ReLU), dropout and a final 10-unit linear layer.

    Args:
      inputs: Batch of images; must be reshapeable to [-1, 28, 28, 1].
      is_training: Forwarded to every snt.BatchNorm call.
      keep_prob: Keep probability for the dropout before the linear layer.

    Returns:
      The output of the final 10-unit linear layer (the logits).
    """
    x_inputs = tf.reshape(inputs, [-1, 28, 28, 1])

    outputs = snt.Conv2D(output_channels=32, kernel_shape=4, stride=2)(x_inputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = tf.nn.max_pool(outputs, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')

    outputs = snt.Conv2D(output_channels=64, kernel_shape=4, stride=2)(outputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = tf.nn.max_pool(outputs, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')

    outputs = snt.Conv2D(output_channels=1024, kernel_shape=1, stride=1)(outputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)

    outputs = snt.BatchFlatten()(outputs)
    outputs = tf.nn.dropout(outputs, keep_prob=keep_prob)
    outputs = snt.Linear(output_size=10)(outputs)
    return outputs
184 | 
185 | 
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal initialized variable with optional L2 decay.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of the truncated Gaussian initializer
      wd: multiplier for the variable's L2 loss, which is then added to the
          'losses' collection. If None, no weight decay term is added.

    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)
    if wd is None:
        return var

    # Register the scaled L2 penalty so the total loss picks it up.
    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
211 | 
212 | 
def _variable_on_cpu(name, shape, initializer):
    """Create (or fetch) a float32 variable under the '/cpu:0' device scope.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable

    Returns:
      Variable Tensor
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape,
                               initializer=initializer, dtype=tf.float32)
228 | 
229 | 
def _activation_summary(x):
    """Add histogram and sparsity summaries for an activation tensor.

    Does nothing unless FLAGS.tb_logging is set.

    Args:
      x: Tensor
    Returns:
      nothing
    """
    if not FLAGS.tb_logging:
        return

    # Strip the 'tower_[0-9]/' prefix so that summaries from a multi-GPU run
    # show up under one name on tensorboard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    tensor_name = re.sub(tower_prefix, '', x.op.name)

    tf.summary.histogram(tensor_name + '/activations', x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar(tensor_name + '/sparsity', sparsity)
248 | 
249 | 
def loss(logits, labels):
    """Register the batch cross-entropy and sum the 'losses' collection.

    Args:
      logits: Unscaled class scores produced by the model.
      labels: 1-D tensor of shape [batch_size] holding the class indices.

    Returns:
      Loss tensor of type float: the sum of everything currently in the
      'losses' collection (the mean cross entropy plus any weight decay
      terms that were registered there).
    """
    # Mean cross entropy over the batch.
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='cross_entropy_per_example')
    batch_mean = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', batch_mean)

    # Total loss = cross entropy + every other registered loss term.
    registered = tf.get_collection('losses')
    return tf.add_n(registered, name='total_loss')
272 | 
273 | 
def average_gradients(tower_grads):
    """Calculate average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List with one entry per tower. Each entry is that tower's
        list of (gradient, variable) tuples, in the same variable order for
        every tower. A gradient may be None.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been
       averaged across the towers that produced one. Variables for which no
       tower produced a gradient are left out.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # Give every available gradient a leading 'tower' dimension. `is not
        # None` is an identity test, so it never goes through any comparison
        # operator defined on the gradient tensor.
        grads = [tf.expand_dims(g, 0)
                 for g, _ in grad_and_vars if g is not None]

        if not grads:
            # Nothing to average for this variable; concatenating an empty
            # list would fail, so skip it.
            continue

        # Average over the 'tower' dimension.
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)

        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads
319 | 
320 | 
def tower_loss(scope, net=None):
    """Calculate the total loss on a single tower running the MNIST model.

    Args:
      scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'
      net: optional Sonnet module wrapping custom_build. Pass the SAME
        instance for every tower so that all towers share one set of
        variables. When None, a new module is created for this call, which
        gives this tower its own separate variables.

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    # Every tower builds its own input pipeline and reads an equal share of
    # the requested epochs.
    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                            num_epochs=(FLAGS.num_epochs / FLAGS.num_gpus))

    # Build inference Graph. snt.Module wraps custom_build into a Sonnet
    # module; connecting one module instance several times reuses its
    # variables, whereas separate instances do not share anything.
    if net is None:
        net = snt.Module(custom_build, name='simple_net')

    train_model_outputs = net(images, is_training=True,
                              keep_prob=tf.constant(0.5))

    # Build the portion of the Graph calculating the losses. The return value
    # is ignored because the per-tower total is assembled below.
    _ = loss(train_model_outputs, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss.
    if FLAGS.tb_logging:
        for l in losses + [total_loss]:
            # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
            # training session. This helps the clarity of presentation on
            # tensorboard.
            loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
            tf.summary.scalar(loss_name, l)

    return total_loss


def train():
    """Train MNIST on FLAGS.num_gpus towers with synchronous updates.

    Builds one tower per GPU, averages the tower gradients, applies them with
    a Nesterov momentum optimizer and runs until the input queues raise
    OutOfRangeError. Checkpoints are written to FLAGS.train_dir.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 (FLAGS.batch_size * FLAGS.num_gpus))
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(learning_rate=INITIAL_LEARNING_RATE,
                                        global_step=global_step,
                                        decay_steps=decay_steps,
                                        decay_rate=LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        opt = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True,
                                         use_locking=True)

        # A single module instance is connected once per tower so that every
        # tower computes gradients for the same variables. Creating a new
        # snt.Module per tower would give each tower private variables and
        # leave nothing meaningful to average.
        net = snt.Module(custom_build, name='simple_net')

        # Calculate the gradients for each model tower.
        tower_grads = []
        summaries = []
        with tf.variable_scope(tf.get_variable_scope()):
            # range() instead of xrange(): xrange is not defined on Python 3
            # and this file never imports it.
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                                    '%s_%d' % (TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST
                        # model. The model is built once per tower but the
                        # variables are shared through `net`.
                        tower_total_loss = tower_loss(scope, net)

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this
                        # MNIST tower.
                        grads = opt.compute_gradients(tower_total_loss,
                                                      gate_gradients=2)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add histograms for gradients.
        if FLAGS.tb_logging:
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients', grad))
            # Add a summary to track the learning rate.
            summaries.append(tf.summary.scalar('learning_rate', lr))

        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if FLAGS.tb_logging:
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), sharded=True)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # The op for initializing the variables. Local variables are included
        # because the epoch counters of the input queues are local variables.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=gpu_options))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one training step. The loss fetched here is the loss of
                # the last tower that was built.
                _, loss_value = sess.run([train_op, tower_total_loss])

                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Print an overview fairly often.
                if step % 100 == 0:
                    num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus
                    format_str = (
                        '%s: step %d, epochs %d, loss = %.3f '
                        '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step,
                                        FLAGS.num_epochs,
                                        loss_value,
                                        examples_per_sec, sec_per_batch))
                if FLAGS.tb_logging:
                    if step % 10 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0 or (
                            step + 1) == FLAGS.num_epochs * FLAGS.batch_size:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

                step += 1
        except tf.errors.OutOfRangeError:
            # Raised by the input queues once the epoch limit is reached.
            print('Done training with %d GPUs, for %d epochs, %d steps.' % (
                FLAGS.num_gpus, FLAGS.num_epochs, step))
        finally:
            # When done, ask the threads to stop and wait for them. The
            # writer and the session are released even if training or the
            # join fails.
            coord.request_stop()
            try:
                coord.join(threads)
            finally:
                summary_writer.close()
                sess.close()
523 | 
524 | 
def evaluate():
    """Evaluate the latest checkpoint in FLAGS.train_dir on the MNIST test set.

    Prints the fraction of test images whose top-1 prediction is correct, or
    a message when no checkpoint is found.
    """
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
        # read_data_sets() returns float pixels scaled to [0, 1], whereas the
        # training pipeline (read_and_decode) feeds [-0.5, 0.5]. Shift the
        # test images so that evaluation sees the same input range.
        images = mnist.test.images - 0.5
        labels = mnist.test.labels

        # Build a Graph that computes the logits predictions from the
        # inference model. snt.Module wraps custom_build into a Sonnet
        # module; the name matches the one used for training so that the
        # checkpointed variables can be restored.
        module_with_build_args = snt.Module(custom_build, name='simple_net')
        test_model_outputs = module_with_build_args(images, is_training=False,
                                                    keep_prob=tf.constant(1.0))

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(predictions=test_model_outputs,
                                  targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('No checkpoint file found')
                return

            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)

            # Number of test examples whose label is the top-1 prediction.
            true_count = np.sum(sess.run(top_k_op))

            # Compute precision as a fraction of the evaluated examples
            # rather than printing the raw count.
            precision = float(true_count) / len(labels)
            print('%s: precision = %.3f' % (datetime.now(), precision))
559 | 
560 | 
def main(argv=None):  # pylint: disable=unused-argument
    """Train the model, print how long training took, then evaluate it.

    Args:
      argv: Unused.
    """
    began_at = time.time()
    train()
    elapsed = time.time() - began_at
    print('Total Duration (%.3f sec)' % elapsed)
    evaluate()
567 | 
568 | 
if __name__ == '__main__':
    # tf.app.run() parses the command-line flags and then calls main().
    tf.app.run()
571 | 


--------------------------------------------------------------------------------
/multi_gpu.py:
--------------------------------------------------------------------------------
 1 | # ref: https://raw.githubusercontent.com/kuza55/keras-extras/master/utils/multi_gpu.py @IgnorePep8
 2 | 
 3 | from keras.layers.merge import concatenate
 4 | from keras.layers.core import Lambda
 5 | from keras.models import Model
 6 | from keras import backend as K
 7 | 
 8 | if K.backend() == 'tensorflow':
 9 |     import tensorflow as tf  # @UnresolvedImport
10 |     from tensorflow.python.client import device_lib
11 | 
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names
15 | 
def make_parallel(model, gpu_count):
    """Replicate a Keras model on several GPUs with data parallelism.

    Every input batch is split along axis 0 into `gpu_count` slices, one
    replica of `model` processes each slice on its own GPU, and the results
    are concatenated on the CPU.

    Args:
      model: Keras model to replicate.
      gpu_count: Number of GPUs to spread the batch over.

    Returns:
      A Keras Model with the same inputs as `model` whose outputs are the
      concatenated replica outputs. When `gpu_count` is 1 or less there is
      nothing to parallelize and `model` itself is returned.
    """
    if gpu_count <= 1:
        # A single replica would have to be "concatenated" with nothing,
        # which the merge layer rejects.
        return model

    def get_slice(data, idx, parts):
        """Return slice `idx` out of `parts` along the batch axis of `data`."""
        shape = tf.shape(data)
        batch_size = shape[:1]
        sample_shape = shape[1:]
        step = batch_size // parts
        if idx == parts - 1:
            # The last slice also takes the remainder, so no sample is
            # dropped when the batch size is not a multiple of `parts`.
            size = batch_size - step * idx
        else:
            size = step
        size = tf.concat([size, sample_shape], axis=0)
        # Only the batch axis is offset; every other axis starts at 0.
        stride = tf.concat([step, sample_shape * 0], axis=0)
        start = stride * idx
        return tf.slice(data, start, size)

    # One list of per-GPU results for each model output.
    outputs_all = [[] for _ in model.outputs]

    # Place a copy of the model on each GPU, each getting a slice of the batch
    for i in range(gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i):

                inputs = []
                # Slice each input into a piece for processing on this GPU
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_n = Lambda(get_slice, output_shape=input_shape,
                                     arguments={'idx': i, 'parts': gpu_count})(
                        x)
                    inputs.append(slice_n)

                outputs = model(inputs)

                if not isinstance(outputs, list):
                    outputs = [outputs]

                # Save all the outputs for merging back together later
                for l, output in enumerate(outputs):
                    outputs_all[l].append(output)

    # merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = [concatenate(inputs=tower_outputs, axis=0)
                  for tower_outputs in outputs_all]

        return Model(inputs=model.inputs, outputs=merged)


--------------------------------------------------------------------------------
/network.py:
--------------------------------------------------------------------------------
 1 | """Class that represents the network to be evolved."""
 2 | import random
 3 | import logging
 4 | from train import train_and_score
 5 | 
 6 | class Network():
 7 |     """Represent a network and let us operate on it.
 8 | 
 9 |     Currently only works for an MLP.
10 |     """
11 | 
12 |     def __init__(self, nn_param_choices=None):
13 |         """Initialize our network.
14 | 
15 |         Args:
16 |             nn_param_choices (dict): Parameters for the network, includes:
17 |                 nb_neurons (list): [64, 128, 256]
18 |                 nb_layers (list): [1, 2, 3, 4]
19 |                 activation (list): ['relu', 'elu']
20 |                 optimizer (list): ['rmsprop', 'adam']
21 |         """
22 |         self.accuracy = 0.
23 |         self.nn_param_choices = nn_param_choices
24 |         self.network = {}  # (dic): represents MLP network parameters
25 | 
26 |     def create_random(self):
27 |         """Create a random network."""
28 |         for key in self.nn_param_choices:
29 |             self.network[key] = random.choice(self.nn_param_choices[key])
30 | 
31 |     def create_set(self, network):
32 |         """Set network properties.
33 | 
34 |         Args:
35 |             network (dict): The network parameters
36 | 
37 |         """
38 |         self.network = network
39 | 
40 |     def train(self, dataset):
41 |         """Train the network and record the accuracy.
42 | 
43 |         Args:
44 |             dataset (str): Name of dataset to use.
45 | 
46 |         """
47 |         if self.accuracy == 0.:
48 |             self.accuracy = train_and_score(self.network, dataset)
49 | 
50 |     def print_network(self):
51 |         """Print out a network."""
52 |         logging.info(self.network)
53 |         logging.info("Network accuracy: %.2f%%" % (self.accuracy * 100))
54 | 


--------------------------------------------------------------------------------
/older/mnist_multi_gpu_eval.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the 'License');
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an 'AS IS' BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Evaluation for MNIST.
 17 | 
 18 | Accuracy:
 19 | 
 20 | Speed:
 21 | 
 22 | Usage:
 23 | 
 24 | """
 25 | from __future__ import absolute_import
 26 | from __future__ import division
 27 | from __future__ import print_function
 28 | 
 29 | from datetime import datetime
 30 | import math
 31 | import time
 32 | 
 33 | import numpy as np
 34 | import tensorflow as tf
 35 | 
 36 | import model
 37 | 
 38 | from tensorflow.examples.tutorials.mnist import input_data
 39 | 
# Command-line flags for the evaluation script. The path defaults are
# hard-coded to directories under /home/norman.
# NOTE(review): within this file only `checkpoint_dir` and `data_dir` are
# read; the remaining flags are defined but not referenced here.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('batch_size', 50,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('eval_dir', '/home/norman/MNIST_train',
                           """Directory where to write event logs.""")
tf.app.flags.DEFINE_string('eval_data', 'test',
                           """Either 'test' or 'train_eval'.""")
tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
                           """Path to the MNIST data directory.""")
tf.app.flags.DEFINE_string('checkpoint_dir', '/home/norman/MNIST_train',
                           """Directory where to read model checkpoints.""")
tf.app.flags.DEFINE_integer('eval_interval_secs', 5,
                            """How often to run the eval.""")
tf.app.flags.DEFINE_integer('num_examples', 10000,
                            """Number of examples to run.""")
tf.app.flags.DEFINE_boolean('run_once', False,
                            """Whether to run eval only once.""")
tf.app.flags.DEFINE_boolean('use_fp16', False,
                            """Train the model using fp16.""")
 60 | 
 61 | 
 62 | def eval_once(saver, top_k_op):
 63 |     """Run Eval once.
 64 |   
 65 |     Args:
 66 |       saver: Saver.
 67 |       summary_writer: Summary writer.
 68 |       top_k_op: Top K op.
 69 |     """
 70 |     with tf.Session() as sess:
 71 |         ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
 72 |         if ckpt and ckpt.model_checkpoint_path:
 73 |             # Restores from checkpoint
 74 |             saver.restore(sess, ckpt.model_checkpoint_path)
 75 |             # Assuming model_checkpoint_path looks something like:
 76 |             #   /my-favorite-path/MNIST_train/model.ckpt-0,
 77 |             # extract global_step from it.
 78 |             global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[
 79 |                 -1]
 80 |         else:
 81 |             print('No checkpoint file found')
 82 |             return
 83 | 
 84 |         predictions = np.sum(sess.run([top_k_op]))
 85 | 
 86 |         # Compute precision.
 87 |         print('%s: precision = %.3f' % (datetime.now(), predictions))
 88 | 
 89 | def evaluate():
 90 |     """Eval MNIST for a number of steps."""
 91 |     with tf.Graph().as_default() as g:
 92 |         # Get images and labels for MNIST.
 93 |         mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
 94 |         images = mnist.test.images
 95 |         labels = mnist.test.labels
 96 | 
 97 |         # Build a Graph that computes the logits predictions from the
 98 |         # inference model.
 99 |         logits = model.inference(images, keep_prob=1.0)
100 | 
101 |         # Calculate predictions.
102 |         top_k_op = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
103 | 
104 |         # Create saver to restore the learned variables for eval.
105 |         saver = tf.train.Saver()
106 | 
107 |         eval_once(saver, top_k_op)
108 | 
def main(argv=None):  # pylint: disable=unused-argument
    """Run a single evaluation pass; ``argv`` is accepted but ignored."""
    evaluate()


# Script entry point: hand control to TensorFlow's app runner.
if __name__ == '__main__':
    tf.app.run()
115 | 


--------------------------------------------------------------------------------
/older/mnist_multi_gpu_train.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the 'License');
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an 'AS IS' BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
"""A binary to train MNIST using multiple GPUs with synchronous updates.
 16 | 
 17 | Accuracy:
 18 | mnist_multi_gpu_train.py achieves ~xx% accuracy after 20K steps (xxx
 19 | epochs of data) as judged by mnist_multi_gpu_batching_eval.py.
 20 | 
 21 | Speed: With batch_size 50.
 22 | 
 23 | System        | Step Time (sec/batch)  |     Accuracy
 24 | --------------------------------------------------------------------
 25 | 1 GTX 1080    | 0.08-0.10              | ~xx% at 20K steps  (x hours)
 26 | 2 GTX 1080    | 0.08-0.10              | ~xx% at 20K steps  (x hours)
 27 | 
 28 | Usage:
 29 | Please see the tutorial and website for how to download the MNIST
 30 | data set, compile the program and train the model.
 31 | 
 32 | """
 33 | 
 34 | from __future__ import absolute_import
 35 | from __future__ import division
 36 | from __future__ import print_function
 37 | 
 38 | from datetime import datetime
 39 | import os.path
 40 | import re
 41 | import time
 42 | 
 43 | import numpy as np
 44 | from six.moves import xrange  # pylint: disable=redefined-builtin
 45 | import tensorflow as tf
 46 | 
 47 | import model
 48 | 
 49 | 
 50 | 
# Command-line flags for the multi-GPU training script. The path defaults are
# hard-coded to directories under /home/norman.
# NOTE(review): `data_dir` and `use_fp16` are defined here but not referenced
# anywhere else in this file.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('batch_size', 1000,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
                           """Path to the MNIST data directory.""")
tf.app.flags.DEFINE_boolean('use_fp16', False,
                            """Train the model using fp16.""")
tf.app.flags.DEFINE_string('train_dir', '/home/norman/MNIST_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 20000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_gpus', 2,
                            """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_boolean('tb_logging', False,
                            """Whether to log to Tensorboard.""")
 70 | 
 71 | def tower_loss(scope):
 72 |     """Calculate the total loss on a single tower running the MNIST model.
 73 |   
 74 |     Args:
 75 |       scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'
 76 |   
 77 |     Returns:
 78 |        Tensor of shape [] containing the total loss for a batch of data
 79 |     """
 80 |     # Get images and labels for MSNIT.
 81 |     images, labels = model.inputs(FLAGS.batch_size)
 82 | 
 83 |     # Build inference Graph.
 84 |     logits = model.inference(images, keep_prob=0.5)
 85 | 
 86 |     # Build the portion of the Graph calculating the losses. Note that we will
 87 |     # assemble the total_loss using a custom function below.
 88 |     _ = model.loss(logits, labels)
 89 | 
 90 |     # Assemble all of the losses for the current tower only.
 91 |     losses = tf.get_collection('losses', scope)
 92 | 
 93 |     # Calculate the total loss for the current tower.
 94 |     total_loss = tf.add_n(losses, name='total_loss')
 95 | 
 96 |     # Attach a scalar summary to all individual losses and the total loss; do
 97 |     # the same for the averaged version of the losses.
 98 |     if (FLAGS.tb_logging):
 99 |         for l in losses + [total_loss]:
100 |             # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
101 |             # training session. This helps the clarity of presentation on
102 |             # tensorboard.
103 |             loss_name = re.sub('%s_[0-9]*/' % model.TOWER_NAME, '', l.op.name)
104 |             tf.summary.scalar(loss_name, l)
105 | 
106 |     return total_loss
107 | 
108 | 
def average_gradients(tower_grads):
    """Average each shared variable's gradient over all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: One entry per tower, each a list of (gradient, variable)
        tuples; position k in every inner list is taken to belong to the
        same shared variable.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been
       averaged across all towers.
    """
    averaged = []
    # zip(*...) regroups the per-tower lists into per-variable tuples:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_tower in zip(*tower_grads):
        # Give every gradient a leading 'tower' axis, join along it, then
        # take the mean over that axis.
        stacked = tf.concat(
            [tf.expand_dims(tower_grad, 0) for tower_grad, _ in per_tower], 0)
        mean_grad = tf.reduce_mean(stacked, 0)

        # The variables are shared across towers, so the first tower's
        # reference stands in for all of them.
        shared_var = per_tower[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged
145 | 
def train():
    """Train MNIST for a number of steps.

    Builds one model tower per GPU (FLAGS.num_gpus) with shared variables,
    averages the towers' gradients, applies them with Adam, and then runs
    FLAGS.max_steps training steps, printing progress every 50 steps and
    writing a checkpoint to FLAGS.train_dir every 1000 steps and on the
    final step. Summaries are written only when FLAGS.tb_logging is set.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Use AdamOptimizer.
        opt = tf.train.AdamOptimizer(model.INITIAL_LEARNING_RATE)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                                    '%s_%d' % (model.TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST model.
                        # This function constructs the entire MNIST model but
                        # shares the variables across all towers.
                        # `loss` is rebound on every iteration, so after the
                        # loop it refers to the last tower's loss.
                        loss = tower_loss(scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this
                        # MNIST tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add histograms for gradients.
        if (FLAGS.tb_logging):
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if (FLAGS.tb_logging):
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            # Only the last tower's loss is fetched for reporting.
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 50 == 0:
                # Throughput figures are derived from this single step's
                # wall-clock duration.
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = (
                    '%s: step %d, loss = %.4f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            if (FLAGS.tb_logging):
                if step % 5 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            # The loop index, not the `global_step` variable, is what gets
            # passed to the saver as the checkpoint's step.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
252 | 
253 | 
def main(argv=None):  # pylint: disable=unused-argument
    """Run the training loop; ``argv`` is accepted but ignored."""
    train()


# Script entry point: hand control to TensorFlow's app runner.
if __name__ == '__main__':
    tf.app.run()
260 | 


--------------------------------------------------------------------------------
/older/model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the 'License');
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an 'AS IS' BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Builds the MNIST network.
 16 | 
 17 | Summary of available functions:
 18 | 
 # Fetch the next batch of MNIST training images and labels.
 images, labels = inputs(batch_size)

 # Compute inference on the model inputs to make a prediction.
 predictions = inference(images)
 25 | 
 26 |  # Compute the total loss of the prediction with respect to the labels.
 27 |  loss = loss(predictions, labels)
 28 | 
 29 |  # Create a graph to run one step of training with respect to the loss.
 30 |  train_op = train(loss, global_step)
 31 | """
 32 | 
 33 | from __future__ import absolute_import
 34 | from __future__ import division
 35 | from __future__ import print_function
 36 | 
 37 | import re
 38 | 
 39 | import tensorflow as tf
 40 | from tensorflow.examples.tutorials.mnist import input_data
 41 | 
# Shared handle to the flags defined by whichever script imports this module.
FLAGS = tf.app.flags.FLAGS
# tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
#                            """Path to the MNIST data directory.""")

# Global constants describing the MNIST data set.
IMAGE_SIZE = 28
NUM_CLASSES = 10
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000

# Constants describing the training process.
INITIAL_LEARNING_RATE = 0.0001  # Initial learning rate.

# If a model is trained with multiple GPUs, prefix all Op names with tower_name
# to differentiate the operations. Note that this prefix is removed from the
# names of the summaries when visualizing a model.
TOWER_NAME = 'tower'

# The data set is read when this module is imported, from a hard-coded path;
# FLAGS.data_dir is not consulted here.
mnist = input_data.read_data_sets('/home/norman/MNIST_data', one_hot=False)
 61 | 
 62 | def inputs(batch_size=50):
 63 |   """Construct input for MNIST training using the TensorFlow framework.
 64 | 
 65 |   Returns:
 66 |     images: mnist images
 67 |     labels: mnist labels
 68 | 
 69 |   """
 70 |   images, labels = mnist.train.next_batch(batch_size)
 71 | 
 72 |   return images, labels
 73 | 
 74 | def _variable_with_weight_decay(name, shape, stddev, wd):
 75 |     """Helper to create an initialized Variable with weight decay.
 76 | 
 77 |     Note that the Variable is initialized with a truncated normal distribution.
 78 |     A weight decay is added only if one is specified.
 79 | 
 80 |     Args:
 81 |       name: name of the variable
 82 |       shape: list of ints
 83 |       stddev: standard deviation of a truncated Gaussian
 84 |       wd: add L2Loss weight decay multiplied by this float. If None, weight
 85 |           decay is not added for this Variable.
 86 | 
 87 |     Returns:
 88 |       Variable Tensor
 89 |     """
 90 |     dtype = tf.float32
 91 |     var = _variable_on_cpu(
 92 |         name,
 93 |         shape,
 94 |         tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
 95 |     if wd is not None:
 96 |         weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
 97 |         tf.add_to_collection('losses', weight_decay)
 98 |     return var
 99 | 
100 | 
def _variable_on_cpu(name, shape, initializer):
    """Create (or fetch) a float32 Variable pinned to '/cpu:0'.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable

    Returns:
      Variable Tensor
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer,
                               dtype=tf.float32)
116 | 
117 | 
def _activation_summary(x):
    """Attach a histogram and a sparsity summary to an activation tensor.

    The histogram covers the activation values; the scalar records the
    fraction of them that are zero.

    Args:
      x: Tensor
    Returns:
      nothing
    """
    # Drop any 'tower_[0-9]/' prefix so that multi-GPU runs share one set of
    # summary names on tensorboard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    base_name = re.sub(tower_prefix, '', x.op.name)

    tf.summary.histogram(base_name + '/activations', x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar(base_name + '/sparsity', sparsity)
135 | 
136 | 
def loss(logits, labels):
    """Register the mean cross entropy and return the sum of all losses.

    The batch-mean cross entropy is added to the 'losses' collection, and the
    returned tensor sums every entry currently in that collection (the cross
    entropy plus any weight-decay terms registered earlier).

    Args:
      logits: Logits from inference().
      labels: Labels from MNIST or inputs(). 1-D tensor
              of shape [batch_size]

    Returns:
      Loss tensor of type float.
    """
    int_labels = tf.cast(labels, tf.int32)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=int_labels, logits=logits, name='cross_entropy_per_example')
    mean_cross_entropy = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', mean_cross_entropy)

    collected = tf.get_collection('losses')
    return tf.add_n(collected, name='total_loss')
159 | 
160 | 
def inference(images, keep_prob=1.0):
    """Build the MNIST model.

    The network is conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 (fully connected, 1024 units) -> dropout -> local4 (linear,
    10 units).

    Args:
      images: Images returned from MNIST or inputs(); reshaped to
        [-1, 28, 28, 1] before the first convolution.
      keep_prob: Keep probability handed to tf.nn.dropout in front of the
        final linear layer. The default of 1.0 keeps every unit.

    Returns:
      Logits.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU training
    # runs. If we only ran this model on a single GPU, we could simplify this
    # function by replacing all instances of tf.get_variable()
    # with tf.Variable().

    # Reshape to use within a convolutional neural net.
    # Last dimension is for "features" - there is only one here, since images
    # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
    x_image = tf.reshape(images, [-1, 28, 28, 1])

    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 1, 32],
                                             stddev=5e-2,
                                             wd=0.0)
        biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 32, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, strides=[1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2 (named 'norm2' so that it no longer collides with the first
    # normalization op's name)
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')

    # pool2
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 1024],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [1024],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4 with dropout
    with tf.variable_scope('local4') as scope:
        local4 = tf.nn.dropout(local3, keep_prob, name=scope.name)
        weights = _variable_with_weight_decay('weights', shape=[1024, 10],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
247 | 


--------------------------------------------------------------------------------
/optimizer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Class that holds a genetic algorithm for evolving a network.
  3 | 
  4 | Credit:
    A lot of this code was originally inspired by:
  6 |     http://lethain.com/genetic-algorithms-cool-name-damn-simple/
  7 | """
  8 | from functools import reduce
  9 | from operator import add
 10 | import random
 11 | from network import Network
 12 | 
 13 | class Optimizer():
 14 |     """Class that implements genetic algorithm for MLP optimization."""
 15 | 
 16 |     def __init__(self, nn_param_choices, retain=0.4,
 17 |                  random_select=0.1, mutate_chance=0.2):
 18 |         """Create an optimizer.
 19 | 
 20 |         Args:
 21 |             nn_param_choices (dict): Possible network paremters
 22 |             retain (float): Percentage of population to retain after
 23 |                 each generation
 24 |             random_select (float): Probability of a rejected network
 25 |                 remaining in the population
 26 |             mutate_chance (float): Probability a network will be
 27 |                 randomly mutated
 28 | 
 29 |         """
 30 |         self.mutate_chance = mutate_chance
 31 |         self.random_select = random_select
 32 |         self.retain = retain
 33 |         self.nn_param_choices = nn_param_choices
 34 | 
 35 |     def create_population(self, count):
 36 |         """Create a population of random networks.
 37 | 
 38 |         Args:
 39 |             count (int): Number of networks to generate, aka the
 40 |                 size of the population
 41 | 
 42 |         Returns:
 43 |             (list): Population of network objects
 44 | 
 45 |         """
 46 |         pop = []
 47 |         for _ in range(0, count):
 48 |             # Create a random network.
 49 |             network = Network(self.nn_param_choices)
 50 |             network.create_random()
 51 | 
 52 |             # Add the network to our population.
 53 |             pop.append(network)
 54 | 
 55 |         return pop
 56 | 
 57 |     @staticmethod
 58 |     def fitness(network):
 59 |         """Return the accuracy, which is our fitness function."""
 60 |         return network.accuracy
 61 | 
 62 |     def grade(self, pop):
 63 |         """Find average fitness for a population.
 64 | 
 65 |         Args:
 66 |             pop (list): The population of networks
 67 | 
 68 |         Returns:
 69 |             (float): The average accuracy of the population
 70 | 
 71 |         """
 72 |         summed = reduce(add, (self.fitness(network) for network in pop))
 73 |         return summed / float((len(pop)))
 74 | 
 75 |     def breed(self, mother, father):
 76 |         """Make two children as parts of their parents.
 77 | 
 78 |         Args:
 79 |             mother (dict): Network parameters
 80 |             father (dict): Network parameters
 81 | 
 82 |         Returns:
 83 |             (list): Two network objects
 84 | 
 85 |         """
 86 |         children = []
 87 |         for _ in range(2):
 88 | 
 89 |             child = {}
 90 | 
 91 |             # Loop through the parameters and pick params for the kid.
 92 |             for param in self.nn_param_choices:
 93 |                 child[param] = random.choice(
 94 |                     [mother.network[param], father.network[param]]
 95 |                 )
 96 | 
 97 |             # Now create a network object.
 98 |             network = Network(self.nn_param_choices)
 99 |             network.create_set(child)
100 | 
101 |             children.append(network)
102 | 
103 |         return children
104 | 
105 |     def mutate(self, network):
106 |         """Randomly mutate one part of the network.
107 | 
108 |         Args:
109 |             network (dict): The network parameters to mutate
110 | 
111 |         Returns:
112 |             (Network): A randomly mutated network object
113 | 
114 |         """
115 |         # Choose a random key.
116 |         mutation = random.choice(list(self.nn_param_choices.keys()))
117 | 
118 |         # Mutate one of the params.
119 |         network.network[mutation] = random.choice(self.nn_param_choices[mutation])
120 | 
121 |         return network
122 | 
123 |     def evolve(self, pop):
124 |         """Evolve a population of networks.
125 | 
126 |         Args:
127 |             pop (list): A list of network parameters
128 | 
129 |         Returns:
130 |             (list): The evolved population of networks
131 | 
132 |         """
133 |         # Get scores for each network.
134 |         graded = [(self.fitness(network), network) for network in pop]
135 | 
136 |         # Sort on the scores.
137 |         graded = [x[1] for x in sorted(graded, key=lambda x: x[0], reverse=True)]
138 | 
139 |         # Get the number we want to keep for the next gen.
140 |         retain_length = int(len(graded)*self.retain)
141 | 
142 |         # The parents are every network we want to keep.
143 |         parents = graded[:retain_length]
144 | 
145 |         # For those we aren't keeping, randomly keep some anyway.
146 |         for individual in graded[retain_length:]:
147 |             if self.random_select > random.random():
148 |                 parents.append(individual)
149 | 
150 |         # Randomly mutate some of the networks we're keeping.
151 |         for individual in parents:
152 |             if self.mutate_chance > random.random():
153 |                 individual = self.mutate(individual)
154 | 
155 |         # Now find out how many spots we have left to fill.
156 |         parents_length = len(parents)
157 |         desired_length = len(pop) - parents_length
158 |         children = []
159 | 
160 |         # Add children, which are bred from two remaining networks.
161 |         while len(children) < desired_length:
162 | 
163 |             # Get a random mom and dad.
164 |             male = random.randint(0, parents_length-1)
165 |             female = random.randint(0, parents_length-1)
166 | 
167 |             # Assuming they aren't the same network...
168 |             if male != female:
169 |                 male = parents[male]
170 |                 female = parents[female]
171 | 
172 |                 # Breed them.
173 |                 babies = self.breed(male, female)
174 | 
175 |                 # Add the children one at a time.
176 |                 for baby in babies:
177 |                     # Don't grow larger than desired length.
178 |                     if len(children) < desired_length:
179 |                         children.append(baby)
180 | 
181 |         parents.extend(children)
182 | 
183 |         return parents
184 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility used by the Network class to actually train.
  3 | 
  4 | Based on:
  5 |     https://github.com/fchollet/keras/blob/master/examples/mnist_mlp.py
  6 | 
  7 | """
  8 | from keras.datasets import mnist, cifar10
  9 | from keras.models import Sequential
 10 | from keras.layers import Dense, Dropout
 11 | from keras.utils.np_utils import to_categorical
 12 | from keras.callbacks import EarlyStopping
 13 | 
# Helper: Early stopping.
# Created once at import time and passed as the only callback to the
# model.fit() call in train_and_score().
# NOTE(review): patience=5 presumably means "stop after 5 epochs without
# improvement", per the Keras EarlyStopping docs; confirm against the
# installed Keras version.
early_stopper = EarlyStopping(patience=5)
 16 | 
 17 | def get_cifar10():
 18 |     """Retrieve the CIFAR dataset and process the data."""
 19 |     # Set defaults.
 20 |     nb_classes = 10
 21 |     batch_size = 64
 22 |     input_shape = (3072,)
 23 | 
 24 |     # Get the data.
 25 |     (x_train, y_train), (x_test, y_test) = cifar10.load_data()
 26 |     x_train = x_train.reshape(50000, 3072)
 27 |     x_test = x_test.reshape(10000, 3072)
 28 |     x_train = x_train.astype('float32')
 29 |     x_test = x_test.astype('float32')
 30 |     x_train /= 255
 31 |     x_test /= 255
 32 | 
 33 |     # convert class vectors to binary class matrices
 34 |     y_train = to_categorical(y_train, nb_classes)
 35 |     y_test = to_categorical(y_test, nb_classes)
 36 | 
 37 |     return (nb_classes, batch_size, input_shape, x_train, x_test, y_train, y_test)
 38 | 
 39 | def get_mnist():
 40 |     """Retrieve the MNIST dataset and process the data."""
 41 |     # Set defaults.
 42 |     nb_classes = 10
 43 |     batch_size = 128
 44 |     input_shape = (784,)
 45 | 
 46 |     # Get the data.
 47 |     (x_train, y_train), (x_test, y_test) = mnist.load_data()
 48 |     x_train = x_train.reshape(60000, 784)
 49 |     x_test = x_test.reshape(10000, 784)
 50 |     x_train = x_train.astype('float32')
 51 |     x_test = x_test.astype('float32')
 52 |     x_train /= 255
 53 |     x_test /= 255
 54 | 
 55 |     # convert class vectors to binary class matrices
 56 |     y_train = to_categorical(y_train, nb_classes)
 57 |     y_test = to_categorical(y_test, nb_classes)
 58 | 
 59 |     return (nb_classes, batch_size, input_shape, x_train, x_test, y_train, y_test)
 60 | 
 61 | def compile_model(network, nb_classes, input_shape):
 62 |     """Compile a sequential model.
 63 | 
 64 |     Args:
 65 |         network (dict): the parameters of the network
 66 | 
 67 |     Returns:
 68 |         a compiled network.
 69 | 
 70 |     """
 71 |     # Get our network parameters.
 72 |     nb_layers = network['nb_layers']
 73 |     nb_neurons = network['nb_neurons']
 74 |     activation = network['activation']
 75 |     optimizer = network['optimizer']
 76 | 
 77 |     model = Sequential()
 78 | 
 79 |     # Add each layer.
 80 |     for i in range(nb_layers):
 81 | 
 82 |         # Need input shape for first layer.
 83 |         if i == 0:
 84 |             model.add(Dense(nb_neurons, activation=activation, input_shape=input_shape))
 85 |         else:
 86 |             model.add(Dense(nb_neurons, activation=activation))
 87 | 
 88 |         model.add(Dropout(0.2))  # hard-coded dropout
 89 | 
 90 |     # Output layer.
 91 |     model.add(Dense(nb_classes, activation='softmax'))
 92 | 
 93 |     model.compile(loss='categorical_crossentropy', optimizer=optimizer,
 94 |                   metrics=['accuracy'])
 95 | 
 96 |     return model
 97 | 
 98 | def train_and_score(network, dataset):
 99 |     """Train the model, return test loss.
100 | 
101 |     Args:
102 |         network (dict): the parameters of the network
103 |         dataset (str): Dataset to use for training/evaluating
104 | 
105 |     """
106 |     if dataset == 'cifar10':
107 |         nb_classes, batch_size, input_shape, x_train, \
108 |             x_test, y_train, y_test = get_cifar10()
109 |     elif dataset == 'mnist':
110 |         nb_classes, batch_size, input_shape, x_train, \
111 |             x_test, y_train, y_test = get_mnist()
112 | 
113 |     model = compile_model(network, nb_classes, input_shape)
114 | 
115 |     model.fit(x_train, y_train,
116 |               batch_size=batch_size,
117 |               epochs=10000,  # using early stopping, so no real limit
118 |               verbose=0,
119 |               validation_data=(x_test, y_test),
120 |               callbacks=[early_stopper])
121 | 
122 |     score = model.evaluate(x_test, y_test, verbose=0)
123 | 
124 |     return score[1]  # 1 is accuracy. 0 is loss.
125 | 


--------------------------------------------------------------------------------