├── .gitignore
├── LICENSE.md
├── README.md
├── __init__.py
├── brute.py
├── convert_to_records.py
├── images
├── Parallelism.png
└── mnist_graph.png
├── main.py
├── mnist_multi_gpu_batching_train.py
├── mnist_multi_gpu_keras.py
├── mnist_multi_gpu_sonnet.py
├── multi_gpu.py
├── network.py
├── older
├── mnist_multi_gpu_eval.py
├── mnist_multi_gpu_train.py
└── model.py
├── optimizer.py
└── train.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /mnist_with_summaries.zip
2 | /mnist_data.zip
3 | /mnist_data/train-labels-idx1-ubyte.gz
4 | /mnist_data/train-images-idx3-ubyte.gz
5 | /mnist_data/t10k-labels-idx1-ubyte.gz
6 | /mnist_data/t10k-images-idx3-ubyte.gz
7 | /.idea
8 | *.pyc
9 | .DS_Store
10 | /logs/test
11 | /logs/train
12 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "{}"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright 2017 Norman Heckscher
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MNIST Multi GPU with TensorFlow
2 | A ConvNet for MNIST digit classification.
3 |
4 | Multi GPU example with TensorFlow utilising local tower architecture for each GPU.
5 |
6 | Several different examples are included, utilising both batching and direct feed.
7 |
8 | Keras performs the best. It utilises the MultiGPU code from: https://github.com/kuza55/keras-extras
9 |
10 | ## Training a Model Using Multiple GPU Cards
11 |
12 | Modern workstations may contain multiple GPUs for scientific computation.
13 | TensorFlow can leverage this environment to run the training operation
14 | concurrently across multiple cards.
15 |
16 | Training a model in a parallel, distributed fashion requires
17 | coordinating training processes. For what follows we term *model replica*
18 | to be one copy of a model training on a subset of data.
19 |
20 | Naively employing asynchronous updates of model parameters
21 | leads to sub-optimal training performance
22 | because an individual model replica might be trained on a stale
23 | copy of the model parameters. Conversely, employing fully synchronous
24 | updates will be as slow as the slowest model replica.
25 |
26 | In a workstation with multiple GPU cards, each GPU will have similar speed
27 | and contain enough memory to run an entire MNIST model. Thus, we opt to
28 | design our training system in the following manner:
29 |
30 | * Place an individual model replica on each GPU.
31 | * Update model parameters synchronously by waiting for all GPUs to finish
32 | processing a batch of data.
33 |
34 | Here is a diagram of this model:
35 |
36 |
37 | ![Diagram of synchronous multi-GPU training](images/Parallelism.png)
38 |
39 |
40 | Note that each GPU computes inference as well as the gradients for a unique
41 | batch of data. This setup effectively permits dividing up a larger batch
42 | of data across the GPUs.
43 |
44 | This setup requires that all GPUs share the model parameters. A well-known
45 | fact is that transferring data to and from GPUs is quite slow. For this
46 | reason, we decide to store and update all model parameters on the CPU (see
47 | green box). A fresh set of model parameters is transferred to the GPU
48 | when a new batch of data is processed by all GPUs.
49 |
50 | The GPUs are synchronized in operation. All gradients are accumulated from
51 | the GPUs and averaged (see green box). The model parameters are updated with
52 | the gradients averaged across all model replicas.
53 |
54 | ### Model Prediction
55 |
56 | The prediction part of the model is constructed by the `inference()` function
57 | which adds operations to compute the *logits* of the predictions. That part of
58 | the model is organized as follows:
59 |
60 | Layer Name | Description
61 | --- | ---
62 | `conv1` | `tf.nn.conv2d` convolution and `tf.nn.relu` rectified linear activation.
63 | `pool1` | `tf.nn.max_pool` max pooling.
64 | `norm1` | `tf.nn.local_response_normalization` local response normalization.
65 | `conv2` | `tf.nn.conv2d` convolution and `tf.nn.relu` rectified linear activation.
66 | `norm2` | `tf.nn.local_response_normalization` local response normalization.
67 | `pool2` | `tf.nn.max_pool` max pooling.
68 | `local3` | fully connected layer with rectified linear activation.
69 | `local4` | fully connected layer with rectified linear activation.
70 | `softmax_linear` | linear transformation to produce logits.
71 |
72 | Here is a graph generated from TensorBoard describing the inference operation:
73 |
74 |
75 | ![TensorBoard graph of the MNIST inference operation](images/mnist_graph.png)
76 |
77 |
78 |
79 | # Evolve a neural network with a genetic algorithm
80 |
81 | Taken from https://github.com/harvitronix/neural-network-genetic-algorithm
82 |
83 | `train.py`
84 | `optimizer.py`
85 | `network.py`
86 | `main.py`
87 | `brute.py`
88 |
89 | This is an example of how we can use a genetic algorithm in an attempt to find the optimal network parameters for classification tasks.
90 |
91 | It's currently limited to only MLPs (i.e. fully connected networks) and uses the Keras library to build, train and validate.
92 |
93 | On the easy MNIST dataset, we are able to quickly find a network that reaches > 98% accuracy. On the more challenging CIFAR10 dataset, we get to 56% after 10 generations (with population 20).
94 |
95 | For more, see this blog post:
96 | https://medium.com/@harvitronix/lets-evolve-a-neural-network-with-a-genetic-algorithm-code-included-8809bece164
97 |
98 | ## To run
99 |
100 | To run the brute force algorithm:
101 |
102 | ```python3 brute.py```
103 |
104 | To run the genetic algorithm:
105 |
106 | ```python3 main.py```
107 |
108 | You can set your network parameter choices by editing each of those files first. You can also choose whether to use the MNIST or CIFAR10 datasets. Simply set `dataset` to either `mnist` or `cifar10`.
109 |
110 |
111 | # Contribution
112 | Your comments (issues) and PRs are always welcome.
113 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
--------------------------------------------------------------------------------
/brute.py:
--------------------------------------------------------------------------------
1 | """Iterate over every combination of hyperparameters."""
2 | import logging
3 | from network import Network
4 | from tqdm import tqdm
5 |
# Send every record (DEBUG and above) to brute-log.txt, stamped with a
# month/day/year 12-hour timestamp.
logging.basicConfig(
    filename='brute-log.txt',
    level=logging.DEBUG,
    datefmt='%m/%d/%Y %I:%M:%S %p',
    format='%(asctime)s - %(levelname)s - %(message)s',
)
13 |
def train_networks(networks, dataset):
    """Train every network, then log the five most accurate ones.

    Args:
        networks (list): Networks to train. Each one is expected to provide
            ``train(dataset)``, ``print_network()`` and an ``accuracy``
            attribute that can be read after training.
        dataset (str): Dataset name handed through to each ``train`` call.
    """
    # The context manager closes the progress bar even when a training run
    # raises, so a failure does not leave a half-drawn bar behind.
    with tqdm(total=len(networks)) as pbar:
        for network in networks:
            network.train(dataset)
            network.print_network()
            pbar.update(1)

    # Rank best-first and report the top five.
    ranked = sorted(networks, key=lambda net: net.accuracy, reverse=True)
    print_networks(ranked[:5])
33 |
def print_networks(networks):
    """Log a divider line, then have each network print itself.

    Args:
        networks (list): Networks to report; each must provide
            ``print_network()``.

    """
    divider = '-' * 80
    logging.info(divider)
    for member in networks:
        member.print_network()
44 |
def generate_network_list(nn_param_choices):
    """Build one Network for every combination of hyperparameters.

    Args:
        nn_param_choices (dict): Maps each of 'nb_neurons', 'nb_layers',
            'activation' and 'optimizer' to the values to try for it.

    Returns:
        list: One Network object per combination, ordered with
        'nb_neurons' varying slowest and 'optimizer' varying fastest.

    """
    import itertools  # local import so the block brings its own dependency

    # Key order fixes the iteration order of the cartesian product.
    keys = ('nb_neurons', 'nb_layers', 'activation', 'optimizer')
    choice_lists = [nn_param_choices[key] for key in keys]

    networks = []
    for combination in itertools.product(*choice_lists):
        # Instantiate a network object with this combination of parameters.
        network_obj = Network()
        network_obj.create_set(dict(zip(keys, combination)))
        networks.append(network_obj)

    return networks
78 |
def main():
    """Train every hyperparameter combination on the CIFAR10 dataset."""
    search_space = {
        'nb_neurons': [64, 128, 256, 512, 768, 1024],
        'nb_layers': [1, 2, 3, 4],
        'activation': ['relu', 'elu', 'tanh', 'sigmoid'],
        'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad',
                      'adadelta', 'adamax', 'nadam'],
    }
    dataset = 'cifar10'

    logging.info("***Brute forcing networks***")

    # Enumerate the full grid first, then train it in one pass.
    candidates = generate_network_list(search_space)
    train_networks(candidates, dataset)


if __name__ == '__main__':
    main()
99 |
--------------------------------------------------------------------------------
/convert_to_records.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Converts MNIST data to TFRecords file format with Example protos."""
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import argparse
22 | import os
23 | import sys
24 |
25 | import tensorflow as tf
26 |
27 | from tensorflow.contrib.learn.python.learn.datasets import mnist
28 |
29 | FLAGS = None
30 |
31 |
def _int64_feature(value):
    """Wrap a single value in a tf.train.Feature backed by an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
34 |
35 |
def _bytes_feature(value):
    """Wrap a single value in a tf.train.Feature backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)
38 |
39 |
def convert_to(data_set, name):
    """Write a dataset to ``<FLAGS.directory>/<name>.tfrecords``.

    Each example is serialized as a ``tf.train.Example`` with the features
    'height', 'width', 'depth', 'label' and 'image_raw'.

    Args:
        data_set: Object exposing ``images``, ``labels`` and
            ``num_examples``. ``images`` is indexed up to ``shape[3]``, so it
            must have at least four dimensions (presumably
            ``(N, rows, cols, depth)`` -- confirm against the caller).
        name (str): Base name of the output file, without extension.

    Raises:
        ValueError: If ``images.shape[0]`` differs from ``num_examples``.
    """
    images = data_set.images
    labels = data_set.labels
    num_examples = data_set.num_examples

    if images.shape[0] != num_examples:
        raise ValueError('Images size %d does not match label size %d.' %
                         (images.shape[0], num_examples))
    rows = images.shape[1]
    cols = images.shape[2]
    depth = images.shape[3]

    filename = os.path.join(FLAGS.directory, name + '.tfrecords')
    print('Writing', filename)
    # The context manager closes the writer even if serialization fails
    # part-way, so the file handle is never leaked.
    with tf.python_io.TFRecordWriter(filename) as writer:
        for index in range(num_examples):
            # tobytes() replaces the deprecated ndarray.tostring() alias
            # (removed in NumPy 2.0); the bytes produced are identical.
            image_raw = images[index].tobytes()
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': _int64_feature(rows),
                'width': _int64_feature(cols),
                'depth': _int64_feature(depth),
                'label': _int64_feature(int(labels[index])),
                'image_raw': _bytes_feature(image_raw)}))
            writer.write(example.SerializeToString())
66 |
67 |
def main(unused_argv):
    """Read the MNIST data sets and write each split to a TFRecords file."""
    data_sets = mnist.read_data_sets(
        FLAGS.directory,
        dtype=tf.uint8,
        reshape=False,
        validation_size=FLAGS.validation_size)

    # One output file per split, named after the split itself.
    for split in ('train', 'validation', 'test'):
        convert_to(getattr(data_sets, split), split)
79 |
80 |
if __name__ == '__main__':
    # Only the flags declared here are parsed; anything unrecognised is
    # forwarded to tf.app.run together with the program name.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--directory', type=str, default='/home/norman/MNIST_data',
        help='Directory to download data files and write the converted result')
    parser.add_argument(
        '--validation_size', type=int, default=5000,
        help="""\
Number of examples to separate from the training data for the validation
set.\
""")
    FLAGS, unparsed = parser.parse_known_args()
    forwarded_argv = [sys.argv[0]] + unparsed
    tf.app.run(main=main, argv=forwarded_argv)
--------------------------------------------------------------------------------
/images/Parallelism.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/normanheckscher/mnist-multi-gpu/3bbd69d852c9029bd3f86ca83786d33b15a54a8d/images/Parallelism.png
--------------------------------------------------------------------------------
/images/mnist_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/normanheckscher/mnist-multi-gpu/3bbd69d852c9029bd3f86ca83786d33b15a54a8d/images/mnist_graph.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | """Entry point to evolving the neural network. Start here."""
2 | import logging
3 | from optimizer import Optimizer
4 | from tqdm import tqdm
5 |
# Send every record (DEBUG and above) to log.txt, stamped with a
# month/day/year 12-hour timestamp.
logging.basicConfig(
    filename='log.txt',
    level=logging.DEBUG,
    datefmt='%m/%d/%Y %I:%M:%S %p',
    format='%(asctime)s - %(levelname)s - %(message)s',
)
13 |
def train_networks(networks, dataset):
    """Train each network in place, showing a progress bar.

    Args:
        networks (list): Current population of networks; each must provide
            ``train(dataset)``.
        dataset (str): Dataset to use for training/evaluating
    """
    # The context manager closes the progress bar even when a training run
    # raises, so a failure does not leave a half-drawn bar behind.
    with tqdm(total=len(networks)) as pbar:
        for network in networks:
            network.train(dataset)
            pbar.update(1)
26 |
def get_average_accuracy(networks):
    """Get the average accuracy for a group of networks.

    Args:
        networks (list): Networks exposing an ``accuracy`` attribute.

    Returns:
        float: The mean of the networks' ``accuracy`` values, or 0.0 when
        the list is empty (previously this raised ZeroDivisionError).

    """
    count = len(networks)
    if count == 0:
        # An empty population has no meaningful average; report zero rather
        # than crashing the caller that only wants to log the figure.
        return 0.0

    return sum(network.accuracy for network in networks) / count
42 |
def generate(generations, population, nn_param_choices, dataset):
    """Evolve a population of networks and log the five most accurate.

    Args:
        generations (int): Number of times to evolve the population
        population (int): Number of networks in each generation
        nn_param_choices (dict): Parameter choices for networks
        dataset (str): Dataset to use for training/evaluating

    """
    optimizer = Optimizer(nn_param_choices)
    networks = optimizer.create_population(population)

    final_generation = generations - 1
    for generation in range(generations):
        logging.info("***Doing generation %d of %d***" %
                     (generation + 1, generations))

        # Score the current population and report its mean accuracy.
        train_networks(networks, dataset)
        mean_accuracy = get_average_accuracy(networks)
        logging.info("Generation average: %.2f%%" % (mean_accuracy * 100))
        logging.info('-'*80)

        # Breed the next population; the final one is kept for ranking.
        if generation != final_generation:
            networks = optimizer.evolve(networks)

    # Rank the final population best-first and report the top five.
    best_first = sorted(networks, key=lambda net: net.accuracy, reverse=True)
    print_networks(best_first[:5])
81 |
def print_networks(networks):
    """Log a divider line, then have each network print itself.

    Args:
        networks (list): Networks to report; each must provide
            ``print_network()``.

    """
    divider = '-' * 80
    logging.info(divider)
    for member in networks:
        member.print_network()
92 |
def main():
    """Evolve a network on the MNIST dataset."""
    dataset = 'mnist'
    population = 20  # Number of networks in each generation.
    generations = 10  # Number of times to evolve the population.

    nn_param_choices = {
        'nb_neurons': [64, 128, 256, 512, 768, 1024],
        'nb_layers': [1, 2, 3, 4],
        'activation': ['relu', 'elu', 'tanh', 'sigmoid'],
        'optimizer': ['rmsprop', 'adam', 'sgd', 'adagrad',
                      'adadelta', 'adamax', 'nadam'],
    }

    banner = "***Evolving %d generations with population %d***" % (
        generations, population)
    logging.info(banner)

    generate(generations, population, nn_param_choices, dataset)


if __name__ == '__main__':
    main()
114 |
--------------------------------------------------------------------------------
/mnist_multi_gpu_batching_train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """A binary to train MNIST using multiple GPUs with synchronous updates.
16 |
17 | Accuracy:
18 | Should achieve ~99.2% accuracy after 20K steps; unfortunately, it does not
19 | reach that at the moment.
20 |
21 | Speed: With batch_size 50.
22 |
23 | System | Step Time (sec/batch) | Accuracy
24 | -------------------------------------------------------------------------
25 | 1 GTX 1080 | 0.018-0.022 | ~xx.xx% at 20K steps (x hours)
26 | 2 GTX 1080 | 0.012-0.015 | ~xx.xx% at 20K steps (x hours)
27 |
28 | Usage:
29 | Please see the TensorFlow website for how to download the MNIST
30 | data set, compile and train models.
31 |
32 | """
33 |
34 | from __future__ import absolute_import
35 | from __future__ import division
36 | from __future__ import print_function
37 |
38 | import os.path
39 | import re
40 | import time
41 | import numpy as np
42 | from datetime import datetime
43 |
44 | from tensorflow.examples.tutorials.mnist import input_data
45 |
46 | import tensorflow as tf
47 |
# Names of the TFRecord files written by convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
# Every op built for a tower is prefixed with this name (tower_0, tower_1, ...)
# so the per-GPU copies can be told apart; the prefix is stripped again from
# summary names before they are shown in TensorBoard.
TOWER_NAME = 'tower'
# Number of pixels in one flattened MNIST image.
IMAGE_PIXELS = 784

# Training hyper-parameters.
MOVING_AVERAGE_DECAY = 0.9999  # Decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0  # Epochs between learning-rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Multiplier applied at each decay.
INITIAL_LEARNING_RATE = 0.1  # Learning rate at step 0.

# Data-set sizes.
# NOTE(review): the run logs kept in this file (8593 steps of 64 images for 10
# epochs) suggest the training file holds ~55000 examples, not 50000 - confirm
# against convert_to_records before relying on this for the decay schedule.
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000

FLAGS = tf.app.flags.FLAGS

# Command-line flags; the defaults point at the original author's machine.
tf.app.flags.DEFINE_integer(
    'batch_size', 64,
    'Number of images to process in a batch.')
tf.app.flags.DEFINE_string(
    'data_dir', '/home/norman/MNIST_data',
    'Path to the MNIST data directory.')
tf.app.flags.DEFINE_string(
    'train_dir', '/home/norman/MNIST_train',
    'Directory where to write event logs '
    'and checkpoint.')
tf.app.flags.DEFINE_integer(
    'num_gpus', 2,
    'How many GPUs to use.')
tf.app.flags.DEFINE_boolean(
    'log_device_placement', False,
    'Whether to log device placement.')
tf.app.flags.DEFINE_boolean(
    'tb_logging', False,
    'Whether to log to Tensorboard.')
tf.app.flags.DEFINE_integer(
    'num_epochs', 10,
    'Number of epochs to run trainer.')
84 | # 17/4/17
85 | # 1 gpu
86 | # Done training for 20 epochs, 22000 steps.
87 | # Total Duration (474.817 sec)
88 | # 2017-04-17 15:24:51.190879: precision = 9743.000
89 | # Done training for 20 epochs, 22000 steps.
90 | # Total Duration (497.690 sec)
91 | # 2017-04-17 15:35:10.070366: precision = 9305.000
92 | # 2 gpu
93 | # Done training for 20 epochs, 22000 steps.
94 | # Total Duration (687.583 sec)
95 | # 2017-04-17 15:14:28.793936: precision = 9472.000
96 | # Done training for 20 epochs, 22000 steps.
97 | # Total Duration (672.720 sec)
98 | # 2017-04-17 15:52:16.096935: precision = 9672.000
99 | # 17/4/17
100 |
101 | # 18/4/17
102 | # 2 GPU
103 | # Done training for 10 epochs, 8593 steps.
104 | # Total Duration (339.430 sec)
105 | # 2017-04-18 10:50:53.269983: precision = 9677.000
106 | # Done training for 10 epochs, 8593 steps.
107 | # Total Duration (335.611 sec)
108 | # 2017-04-18 11:14:26.685982: precision = 9674.000
109 | # Done training for 10 epochs, 8593 steps.
110 | # Total Duration (349.731 sec)
111 | # 2017-04-18 12:48:15.148828: precision = 9267.000
112 | # Done training for 10 epochs, 8593 steps.
113 | # Total Duration (350.593 sec)
114 | # 2017-04-18 13:14:51.974247: precision = 9270.000
115 | # Done training for 10 epochs, 8593 steps.
116 | # Total Duration (361.926 sec)
117 | # 2017-04-18 13:58:02.775474: precision = 9507.000
118 | # Done training for 10 epochs, 8593 steps.
119 | # Total Duration (346.119 sec)
120 | # 2017-04-18 14:46:51.579685: precision = 9471.000
121 | # Done training for 10 epochs, 8593 steps.
122 | # Total Duration (334.561 sec)
123 | # 2017-04-18 14:58:06.942195: precision = 9781.000
124 |
125 | # 1 GPU
126 | # Done training for 10 epochs, 8593 steps.
127 | # Total Duration (238.033 sec)
128 | # 2017-04-18 11:02:06.403359: precision = 9679.000
129 | # Done training for 10 epochs, 8593 steps.
130 | # Total Duration (256.169 sec)
131 | # 2017-04-18 11:20:54.328206: precision = 9362.000
132 | # Done training for 10 epochs, 8593 steps.
133 | # Total Duration (257.144 sec)
134 | # 2017-04-18 12:30:53.954074: precision = 8989.000
135 | # Done training for 10 epochs, 8593 steps.
136 | # Total Duration (250.306 sec)
137 | # 2017-04-18 12:40:26.649277: precision = 9512.000
138 | # Done training for 10 epochs, 8593 steps.
139 | # Total Duration (257.795 sec)
140 | # 2017-04-18 13:22:48.300705: precision = 9692.000
141 | # Done training for 10 epochs, 8593 steps.
142 | # Total Duration (254.077 sec)
143 | # 2017-04-18 13:35:26.700627: precision = 9391.000
144 | # Done training for 10 epochs, 8593 steps.
145 | # Total Duration (253.215 sec)
146 | # 2017-04-18 13:41:46.708623: precision = 9734.000
147 |
148 |
149 |
def read_and_decode(filename_queue):
    """Read one serialized example from the queue and decode it.

    Args:
      filename_queue: queue of TFRecord file names handed to
        tf.TFRecordReader.read().

    Returns:
      A tuple (image, label) where image is a float32 tensor of shape
      [IMAGE_PIXELS] with values in [-0.5, 0.5] and label is an int32
      scalar tensor.
    """
    record_reader = tf.TFRecordReader()
    _, serialized_example = record_reader.read(filename_queue)

    # Both keys are required, so no default values are supplied.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example,
                                     features=feature_spec)

    # The image is stored as a raw byte string; decode it to uint8 and pin
    # the static shape to the flattened image length.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([IMAGE_PIXELS])

    # No reshape or distortion happens here: the model reshapes the flat
    # vector itself.  Rescale [0, 255] -> [-0.5, 0.5].
    image = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5

    # The label is stored as int64; the rest of the pipeline uses int32.
    label = tf.cast(parsed['label'], tf.int32)

    return image, label
179 |
def inputs(train, batch_size, num_epochs):
    """Build a shuffled, batched input pipeline over a TFRecord file.

    Args:
      train: True to read the training file, False for the validation file.
      batch_size: Number of examples per returned batch.
      num_epochs: Number of passes over the file; 0 or None means read
        forever.

    Returns:
      A tuple (images, labels), where:
      * images is a float tensor with shape [batch_size, IMAGE_PIXELS]
        in the range [-0.5, 0.5].
      * labels is an int32 tensor with shape [batch_size] holding the true
        class of each image.
      Queue runners are added to the graph as a side effect, so the caller
      must start them, e.g. with tf.train.start_queue_runners().
    """
    # string_input_producer treats None (not 0) as "cycle forever".
    epoch_limit = num_epochs if num_epochs else None
    records_name = TRAIN_FILE if train else VALIDATION_FILE
    records_path = os.path.join(FLAGS.data_dir, records_name)

    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            [records_path], num_epochs=epoch_limit)

        # A single filename queue feeds the reader even though batching
        # below runs in several threads.
        image, label = read_and_decode(filename_queue)

        # shuffle_batch keeps at least `min_after_dequeue` examples buffered
        # so that consecutive batches are reasonably well mixed; two threads
        # fill the buffer so input does not become the bottleneck.
        images, sparse_labels = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=2,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000)

    return images, sparse_labels
218 |
def inference(images):
    """Build the MNIST model.

    The network is conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 -> local4 -> softmax_linear.  All variables are created through
    tf.get_variable() (via the helpers below) so that they can be shared
    between the towers of a multi-GPU run.

    Args:
      images: Flattened images (anything tf.reshape can turn into
        [-1, 28, 28, 1]), e.g. the batch returned by inputs().

    Returns:
      Logits: a [batch, 10] tensor of unscaled class scores.
    """
    # Reshape to use within a convolutional neural net.  The last dimension
    # is the number of channels: 1 because the images are grayscale.
    x_image = tf.reshape(images, [-1, 28, 28, 1])

    # conv1: 5x5 convolution, 1 -> 32 channels.
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 1, 32],
                                             stddev=5e-2,
                                             wd=0.0)
        biases = _variable_on_cpu('biases', [32],
                                  tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1: 2x2 max-pool with stride 2 (28x28 -> 14x14).
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2: 5x5 convolution, 32 -> 64 channels.
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 32, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        biases = _variable_on_cpu('biases', [64],
                                  tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2 (previously mis-named 'norm1', a copy-paste slip; the op owns no
    # variables, so renaming it does not affect checkpoints).
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')

    # pool2: 2x2 max-pool with stride 2 (14x14 -> 7x7).
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3: fully connected, 7*7*64 -> 1024.
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix
        # multiply.
        reshape = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 1024],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [1024],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4: fully connected, 1024 -> 10, followed by ReLU.
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[1024, 10],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)
        _activation_summary(local4)

    # Linear layer (WX + b), 10 -> 10.  Softmax is not applied here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled
    # logits and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [10, 10],
                                              stddev=1 / 192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [10],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
316 |
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal initialized variable with optional L2 decay.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of the truncated Gaussian initializer
      wd: multiplier for the variable's L2 loss.  When it is not None the
        scaled loss is added to the 'losses' collection; None skips it.

    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)

    if wd is None:
        return var

    # Register the penalty so that loss()/tower_loss() pick it up from the
    # 'losses' collection.
    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
342 |
def _variable_on_cpu(name, shape, initializer):
    """Get or create a float32 variable pinned to '/cpu:0'.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable

    Returns:
      Variable Tensor
    """
    # tf.get_variable (rather than tf.Variable) lets the towers share one
    # copy of each variable through the enclosing variable scope.
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape, initializer=initializer,
                               dtype=tf.float32)
358 |
def _activation_summary(x):
    """Attach activation summaries to a tensor when TensorBoard logging is on.

    Adds a histogram of the activations and a scalar with the fraction of
    zero entries.  Does nothing unless FLAGS.tb_logging is set.

    Args:
      x: Tensor
    Returns:
      nothing
    """
    if not FLAGS.tb_logging:
        return

    # Drop the 'tower_N/' prefix so every tower reports under the same name
    # in TensorBoard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    tensor_name = re.sub(tower_prefix, '', x.op.name)

    tf.summary.histogram(tensor_name + '/activations', x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar(tensor_name + '/sparsity', sparsity)
377 |
def loss(logits, labels):
    """Add the batch cross entropy to 'losses' and return the summed total.

    Args:
      logits: Logits from inference().
      labels: Labels from inputs(). 1-D tensor of shape [batch_size]

    Returns:
      Loss tensor of type float: the sum of everything currently in the
      'losses' collection (mean cross entropy plus weight-decay terms).
    """
    int_labels = tf.cast(labels, tf.int64)

    # Mean cross entropy over the batch.
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=int_labels, logits=logits, name='cross_entropy_per_example')
    batch_mean = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', batch_mean)

    # Total loss = cross entropy + all registered weight-decay (L2) terms.
    all_losses = tf.get_collection('losses')
    return tf.add_n(all_losses, name='total_loss')
400 |
def tower_loss(scope):
    """Build one model tower and return its total loss.

    Each call creates its own input pipeline, the inference graph and the
    loss terms.

    Args:
      scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'

    Returns:
      Tensor of shape [] containing the total loss for a batch of data
    """
    images, labels = inputs(train=True,
                            batch_size=FLAGS.batch_size,
                            num_epochs=FLAGS.num_epochs)
    logits = inference(images)

    # Called for its side effect of registering the loss terms; the total is
    # rebuilt below from this tower's entries only, so the result is unused.
    loss(logits, labels)

    # Restrict the collection to the losses created under this tower's scope.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    if not FLAGS.tb_logging:
        return total_loss

    # One scalar summary per individual loss plus one for the total, with
    # the 'tower_N/' prefix removed for readability in TensorBoard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    for tensor in tower_losses + [total_loss]:
        summary_name = re.sub(tower_prefix, '', tensor.op.name)
        tf.summary.scalar(summary_name, tensor)

    return total_loss
437 |
def average_gradients(tower_grads):
    """Average each shared variable's gradient over all towers.

    Because every tower's gradient is needed before the mean can be taken,
    this acts as the synchronization point between towers.

    Args:
      tower_grads: List with one entry per tower; each entry is that tower's
        list of (gradient, variable) tuples, in the same variable order.
    Returns:
      List of pairs of (gradient, variable) where the gradient has been
      averaged across all towers.
    """
    averaged = []
    # zip(*...) regroups the data per variable:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_variable in zip(*tower_grads):
        # Give each gradient a leading 'tower' axis, join them along it and
        # take the mean over that axis.
        with_tower_axis = [tf.expand_dims(g, 0) for g, _ in per_variable]
        joined = tf.concat(with_tower_axis, 0)
        mean_grad = tf.reduce_mean(joined, 0)

        # The variable is shared between towers, so the first tower's
        # reference is as good as any.
        shared_var = per_variable[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged
474 |
def train():
    """Train the MNIST model on FLAGS.num_gpus towers with synchronous updates.

    Builds one tower per GPU (variables live on the CPU and are shared),
    averages the tower gradients and applies them with a Nesterov momentum
    optimizer.  Training runs until the input queues raise OutOfRangeError,
    i.e. after FLAGS.num_epochs passes over the data.  Checkpoints are written
    to FLAGS.train_dir periodically and once more when training finishes, so
    that evaluate() restores the final weights.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Counts the number of applied (averaged) gradient updates; it is
        # incremented by apply_gradients() below.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        opt = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True,
                                         use_locking=True)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            # range (not xrange) keeps this working on Python 3 as well.
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (TOWER_NAME, i)) as scope:
                        # Build the whole MNIST model for this tower; the
                        # variables are shared across all towers.
                        loss = tower_loss(scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(
                            tf.GraphKeys.SUMMARIES, scope)

                        # Calculate the gradients for the batch of data on
                        # this MNIST tower.
                        grads = opt.compute_gradients(loss, gate_gradients=0)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add histograms for gradients.
        if FLAGS.tb_logging:
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients',
                                             grad))
            # Add a summary to track the learning rate.
            summaries.append(tf.summary.scalar('learning_rate', lr))

        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if FLAGS.tb_logging:
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), sharded=True)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Local variables must be initialized too: the epoch counter of
        # string_input_producer(num_epochs=...) is a local variable.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one training step; the value of `train_op` is
                # discarded, the loss of the last tower is kept for logging.
                _, loss_value = sess.run([train_op, loss])

                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Print an overview fairly often.
                if step % 100 == 0:
                    num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))
                if FLAGS.tb_logging:
                    if step % 10 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                # NOTE(review): the second condition compares the step count
                # with num_epochs * batch_size, which is not the last step;
                # it is kept as-is and the real final save happens below.
                if step % 1000 == 0 or (
                        step + 1) == FLAGS.num_epochs * FLAGS.batch_size:
                    saver.save(sess, checkpoint_path, global_step=step)

                step += 1
        except tf.errors.OutOfRangeError:
            print('Done training for %d epochs, %d steps.' % (
                FLAGS.num_epochs, step))
            # The input queues are exhausted: save the final weights so that
            # evaluate() does not fall back to the last periodic checkpoint.
            saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            # Flush pending events and release the event file.
            summary_writer.close()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()
621 |
def evaluate():
    """Evaluate the latest checkpoint in FLAGS.train_dir on the MNIST test set.

    Prints the number of correctly classified test images (labelled
    'precision' in the output).  Prints a message and returns early when no
    checkpoint is found.
    """
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
        # read_data_sets returns float32 pixels scaled to [0, 1], whereas the
        # training pipeline (read_and_decode) feeds the model values in
        # [-0.5, 0.5].  Shift the test images so evaluation sees the same
        # input range the model was trained on.
        images = mnist.test.images - 0.5
        labels = mnist.test.labels

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = inference(images)

        # Per-example boolean: is the true label the top-1 prediction?
        top_k_op = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                print('No checkpoint file found')
                return

            # Summing the booleans gives the count of correct predictions.
            predictions = np.sum(sess.run([top_k_op]))

            print('%s: precision = %.3f' % (datetime.now(), predictions))
653 |
def main(argv=None):  # pylint: disable=unused-argument
    """Train the model, report the wall-clock training time, then evaluate."""
    began = time.time()
    train()
    elapsed = time.time() - began
    print('Total Duration (%.3f sec)' % elapsed)
    evaluate()


# tf.app.run() parses the command-line flags and then calls main().
if __name__ == '__main__':
    tf.app.run()
663 |
--------------------------------------------------------------------------------
/mnist_multi_gpu_keras.py:
--------------------------------------------------------------------------------
1 | '''Trains a simple convnet on the MNIST dataset.
2 | Gets to 99.25% test accuracy after 12 epochs
3 | (there is still a lot of margin for parameter tuning).
4 | 16 seconds per epoch on a GRID K520 GPU.
5 | '''
6 |
7 | from __future__ import print_function
8 | import numpy as np
9 | np.random.seed(1337) # for reproducibility
10 |
11 | from tensorflow.contrib.keras.api.keras.datasets import mnist
12 | from keras.models import Sequential
13 | from keras.layers import Dense, Dropout, Activation, Flatten
14 | from keras.layers import Convolution2D, MaxPooling2D, BatchNormalization
15 | from keras.utils import np_utils
16 | from keras import backend as K
17 | from keras.callbacks import TensorBoard, ModelCheckpoint
18 | tensorboard = TensorBoard(log_dir='/home/norman/MNIST_train', histogram_freq=1,
19 | write_graph=True, write_images=False, embeddings_freq=1)
20 | import time
21 | import argparse
22 | parser = argparse.ArgumentParser()
23 | parser.add_argument('--extras', help='(absolute) path to keras-extras')
24 | parser.add_argument('--gpus', help='number of GPUs')
25 | parser.print_help()
26 | args = parser.parse_args()
27 |
28 | import sys
29 | sys.path.append(args.extras)
30 |
31 | from multi_gpu import make_parallel
32 |
33 | #ngpus = int(args.gpus)
34 | ngpus = int(2)
35 | print("Using %i GPUs" %ngpus)
36 |
37 | batch_size = 128
38 | nb_classes = 10
39 | nb_epoch = 12
40 |
41 | # input image dimensions
42 | img_rows, img_cols = 28, 28
43 | # number of convolutional filters to use
44 | nb_filters = 32
45 | # size of pooling area for max pooling
46 | pool_size = (2, 2)
47 | # convolution kernel size
48 | kernel_size = (3, 3)
49 |
50 | # the data, shuffled and split between train and test sets
51 | (X_train, y_train), (X_test, y_test) = mnist.load_data()
52 |
53 | if K.image_dim_ordering() == 'th':
54 | X_train = X_train.reshape(X_train.shape[0], 1, img_rows, img_cols)
55 | X_test = X_test.reshape(X_test.shape[0], 1, img_rows, img_cols)
56 | input_shape = (1, img_rows, img_cols)
57 | else:
58 | X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, 1)
59 | X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, 1)
60 | input_shape = (img_rows, img_cols, 1)
61 |
62 | X_train = X_train.astype('float32')
63 | X_test = X_test.astype('float32')
64 | X_train /= 255
65 | X_test /= 255
66 | print('X_train shape:', X_train.shape)
67 | print(X_train.shape[0], 'train samples')
68 | print(X_test.shape[0], 'test samples')
69 |
70 | # convert class vectors to binary class matrices
71 | Y_train = np_utils.to_categorical(y_train, nb_classes)
72 | Y_test = np_utils.to_categorical(y_test, nb_classes)
73 |
74 | model = Sequential()
75 |
76 | model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]),
77 | padding='valid',
78 | input_shape=input_shape))
79 | model.add(Activation('relu'))
80 | model.add(Convolution2D(256, (kernel_size[0], kernel_size[1])))
81 | model.add(Activation('relu'))
82 | model.add(BatchNormalization())
83 | model.add(Convolution2D(128, (kernel_size[0], kernel_size[1])))
84 | model.add(Activation('relu'))
85 | model.add(MaxPooling2D(pool_size=pool_size))
86 | model.add(Dropout(0.25))
87 | model.add(Flatten())
88 | model.add(Dense(128))
89 | model.add(Activation('relu'))
90 | model.add(Dropout(0.5))
91 | model.add(Dense(nb_classes))
92 | model.add(Activation('softmax'))
93 |
94 | if ngpus > 1:
95 | model = make_parallel(model,ngpus)
96 |
97 | model.compile(loss='categorical_crossentropy',
98 | optimizer='adadelta',
99 | metrics=['accuracy'])
100 |
101 | start_time = time.time()
102 | model.fit(X_train, Y_train, batch_size=batch_size*ngpus, epochs=nb_epoch,
103 | verbose=1, validation_data=(X_test, Y_test))#, callbacks=[tensorboard])
104 | score = model.evaluate(X_test, Y_test, verbose=0)
105 | print('Test score:', score[0])
106 | print('Test accuracy:', score[1])
107 | duration = time.time() - start_time
108 | print('Total Duration (%.3f sec)' % duration)
109 |
--------------------------------------------------------------------------------
/mnist_multi_gpu_sonnet.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """A binary to train MNIST using multiple GPUs with synchronous updates.
16 |
17 | Accuracy:
18 |
19 | Done training with 2 GPUs, for 20 epochs, 11000 steps.
20 | Total Duration (327.396 sec)
21 | 2017-04-21 20:46:18.466392: precision = 9848.000
22 | Done training with 1 GPUs, for 20 epochs, 22000 steps.
23 | Total Duration (500.122 sec)
24 | 2017-04-21 20:56:40.639580: precision = 9884.000
25 |
26 | Speed: With batch_size 50.
27 |
28 | System | Step Time (sec/batch) | Accuracy
29 | -------------------------------------------------------------------------
30 | 1 GTX 1080 | 258.136 sec | ~94.58% at 11K steps
31 | 2 GTX 1080 | 189.572 sec | ~94.59% at 11K steps
32 |
33 | Usage:
34 | Please see the TensorFlow website for how to download the MNIST
35 | data set, compile and train models.
36 |
37 | """
38 |
39 | from __future__ import absolute_import
40 | from __future__ import division
41 | from __future__ import print_function
42 |
43 | import os.path
44 | import re
45 | import time
46 | import numpy as np
47 | from datetime import datetime
48 |
49 | from tensorflow.examples.tutorials.mnist import input_data
50 |
51 | import tensorflow as tf
52 | import sonnet as snt
53 |
# Names of the TFRecord files written by convert_to_records.
TRAIN_FILE = 'train.tfrecords'
VALIDATION_FILE = 'validation.tfrecords'
# Every op built for a tower is prefixed with this name (tower_0, tower_1, ...)
# so the per-GPU copies can be told apart; the prefix is stripped again from
# summary names before they are shown in TensorBoard.
TOWER_NAME = 'tower'
# Number of pixels in one flattened MNIST image.
IMAGE_PIXELS = 784

# Training hyper-parameters.
MOVING_AVERAGE_DECAY = 0.9999  # Decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 20.0  # Epochs between learning-rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Multiplier applied at each decay.
INITIAL_LEARNING_RATE = 0.1  # Learning rate at step 0.

# Data-set sizes.
# NOTE(review): the run log in this file's docstring (22000 steps of 50 images
# for 20 epochs) suggests ~55000 training examples, not 50000 - confirm
# against convert_to_records.
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000

FLAGS = tf.app.flags.FLAGS

# Command-line flags; the defaults point at the original author's machine.
tf.app.flags.DEFINE_integer(
    'batch_size', 50,
    'Number of images to process in a batch.')
tf.app.flags.DEFINE_string(
    'data_dir', '/home/norman/MNIST_data',
    'Path to the MNIST data directory.')
tf.app.flags.DEFINE_string(
    'train_dir', '/home/norman/MNIST_train',
    'Directory where to write event logs '
    'and checkpoint.')
tf.app.flags.DEFINE_integer(
    'num_gpus', 2,
    'How many GPUs to use.')
tf.app.flags.DEFINE_boolean(
    'log_device_placement', False,
    'Whether to log device placement.')
tf.app.flags.DEFINE_boolean(
    'tb_logging', False,
    'Whether to log to Tensorboard.')
tf.app.flags.DEFINE_integer(
    'num_epochs', 20,
    'Number of epochs to run trainer.')
90 |
91 |
def read_and_decode(filename_queue):
    """Read one serialized example from the queue and decode it.

    Args:
      filename_queue: queue of TFRecord file names handed to
        tf.TFRecordReader.read().

    Returns:
      A tuple (image, label) where image is a float32 tensor of shape
      [IMAGE_PIXELS] with values in [-0.5, 0.5] and label is an int32
      scalar tensor.
    """
    record_reader = tf.TFRecordReader()
    _, serialized_example = record_reader.read(filename_queue)

    # Both keys are required, so no default values are supplied.
    feature_spec = {
        'image_raw': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example,
                                     features=feature_spec)

    # The image is stored as a raw byte string; decode it to uint8 and pin
    # the static shape to the flattened image length.
    pixels = tf.decode_raw(parsed['image_raw'], tf.uint8)
    pixels.set_shape([IMAGE_PIXELS])

    # No reshape or distortion happens here: the model reshapes the flat
    # vector itself.  Rescale [0, 255] -> [-0.5, 0.5].
    image = tf.cast(pixels, tf.float32) * (1. / 255) - 0.5

    # The label is stored as int64; the rest of the pipeline uses int32.
    label = tf.cast(parsed['label'], tf.int32)

    return image, label
121 |
122 |
def inputs(train, batch_size, num_epochs):
    """Build a shuffled, batched input pipeline over a TFRecord file.

    Args:
      train: True to read the training file, False for the validation file.
      batch_size: Number of examples per returned batch.
      num_epochs: Number of passes over the file; 0 or None means read
        forever.

    Returns:
      A tuple (images, labels), where:
      * images is a float tensor with shape [batch_size, IMAGE_PIXELS]
        in the range [-0.5, 0.5].
      * labels is an int32 tensor with shape [batch_size] holding the true
        class of each image.
      Queue runners are added to the graph as a side effect, so the caller
      must start them, e.g. with tf.train.start_queue_runners().
    """
    # string_input_producer treats None (not 0) as "cycle forever".
    epoch_limit = num_epochs if num_epochs else None
    records_name = TRAIN_FILE if train else VALIDATION_FILE
    records_path = os.path.join(FLAGS.data_dir, records_name)

    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            [records_path], num_epochs=epoch_limit)

        # A single filename queue feeds the reader even though batching
        # below runs in several threads.
        image, label = read_and_decode(filename_queue)

        # shuffle_batch keeps at least `min_after_dequeue` examples buffered
        # so that consecutive batches are reasonably well mixed; two threads
        # fill the buffer so input does not become the bottleneck.
        images, sparse_labels = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=2,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000)

    return images, sparse_labels
161 |
162 |
def custom_build(inputs, is_training, keep_prob):
    """A custom build method to wrap into a sonnet Module.

    Three Conv2D -> BatchNorm -> ReLU stages (the first two followed by 2x2
    max pooling), then flatten, dropout and a 10-way linear layer.

    Args:
        inputs: batch of flattened images; reshaped here to [-1, 28, 28, 1].
        is_training: forwarded to every `snt.BatchNorm` layer.
        keep_prob: keep probability of the dropout applied before the final
            linear layer.

    Returns:
        The output of the final `snt.Linear(output_size=10)` layer (logits).
    """
    x_inputs = tf.reshape(inputs, [-1, 28, 28, 1])
    outputs = snt.Conv2D(output_channels=32, kernel_shape=4, stride=2)(x_inputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = tf.nn.max_pool(outputs, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')
    outputs = snt.Conv2D(output_channels=64, kernel_shape=4, stride=2)(outputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = tf.nn.max_pool(outputs, ksize=[1, 2, 2, 1],
                             strides=[1, 2, 2, 1], padding='SAME')
    # 1x1 convolution used as a wide per-position feature layer.
    outputs = snt.Conv2D(output_channels=1024, kernel_shape=1, stride=1)(outputs)
    outputs = snt.BatchNorm()(outputs, is_training=is_training)
    outputs = tf.nn.relu(outputs)
    outputs = snt.BatchFlatten()(outputs)
    outputs = tf.nn.dropout(outputs, keep_prob=keep_prob)
    outputs = snt.Linear(output_size=10)(outputs)
    return outputs
184 |
185 |
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal-initialised Variable, optionally L2-regularised.

    Args:
        name: name of the variable
        shape: list of ints
        stddev: standard deviation of the truncated Gaussian initializer
        wd: multiplier for the variable's L2 loss. When not None, the scaled
            L2 loss is added to the 'losses' collection; when None, no weight
            decay term is created.

    Returns:
        Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(
        stddev=stddev, dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)

    if wd is None:
        return var

    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
211 |
212 |
def _variable_on_cpu(name, shape, initializer):
    """Create (or fetch) a float32 Variable pinned to '/cpu:0'.

    Args:
        name: name of the variable
        shape: list of ints
        initializer: initializer for the Variable

    Returns:
        Variable Tensor
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(
            name, shape, initializer=initializer, dtype=tf.float32)
228 |
229 |
def _activation_summary(x):
    """Attach histogram and sparsity summaries to an activation tensor.

    Does nothing unless FLAGS.tb_logging is set.

    Args:
        x: Tensor
    Returns:
        nothing
    """
    if not FLAGS.tb_logging:
        return

    # Strip the 'tower_[0-9]/' prefix so that all towers of a multi-GPU run
    # report under the same name on tensorboard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    tensor_name = re.sub(tower_prefix, '', x.op.name)

    tf.summary.histogram(tensor_name + '/activations', x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar(tensor_name + '/sparsity', sparsity)
248 |
249 |
def loss(logits, labels):
    """Add the batch cross-entropy to 'losses' and return the collection sum.

    Args:
        logits: unscaled class scores, one row per example.
        labels: class indices for the batch, passed as the sparse labels of
            the softmax cross-entropy. 1-D tensor of shape [batch_size].

    Returns:
        Loss tensor: the sum of everything currently in the 'losses'
        collection (the mean cross-entropy plus any weight decay terms that
        were registered there).
    """
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='cross_entropy_per_example')
    batch_mean = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', batch_mean)

    all_losses = tf.get_collection('losses')
    return tf.add_n(all_losses, name='total_loss')
272 |
273 |
def average_gradients(tower_grads):
    """Calculate average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over towers; each inner list is what
            `opt.compute_gradients` returned for that tower. Gradients may be
            None for variables a tower's loss does not depend on.
    Returns:
        List of pairs of (gradient, variable) where the gradient has been
        averaged across the towers that produced one. A variable for which
        no tower produced a gradient is returned as (None, variable), the
        same convention `compute_gradients` uses.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))

        # The Variables are redundant because they are shared across towers,
        # so the first tower's pointer to the Variable is returned.
        v = grad_and_vars[0][1]

        # Identity test, not `!=`: comparing a Tensor with `!=` is not a
        # reliable None check.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars
                 if g is not None]
        if not grads:
            # Nothing to average; tf.concat([]) would fail.
            average_grads.append((None, v))
            continue

        # Stack along a new leading 'tower' dimension and average over it.
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        average_grads.append((grad, v))
    return average_grads
319 |
320 |
def tower_loss(scope, model=None):
    """Calculate the total loss on a single tower running the MNIST model.

    Args:
        scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'
        model: optional sonnet module wrapping `custom_build`. Pass the SAME
            instance for every tower so that all towers share one set of
            variables. When omitted, a new module is created for this call.

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
    """
    # Every tower owns an input pipeline, so the requested epochs are split
    # between the towers. Integer division keeps the epoch limit an int, and
    # the floor of 1 stops a small num_epochs from turning into "0 epochs".
    # A falsy num_epochs still means "read forever" (see inputs()).
    epochs = FLAGS.num_epochs
    if epochs:
        epochs = max(1, epochs // FLAGS.num_gpus)
    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
                            num_epochs=epochs)

    # Build inference Graph. snt.Module wraps custom_build into a sonnet
    # Module; connecting the same module instance again reuses its variables.
    if model is None:
        model = snt.Module(custom_build, name='simple_net')

    train_model_outputs = model(images, is_training=True,
                                keep_prob=tf.constant(0.5))

    # Build the portion of the Graph calculating the losses. The returned
    # total is ignored: it would sum the losses of every tower built so far.
    _ = loss(train_model_outputs, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss.
    if FLAGS.tb_logging:
        for l in losses + [total_loss]:
            # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
            # training session. This helps the clarity of presentation on
            # tensorboard.
            loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
            tf.summary.scalar(loss_name, l)

    return total_loss


def train():
    """Train the MNIST model on FLAGS.num_gpus towers with synchronous updates.

    One tower is built per GPU, all sharing the variables of a single sonnet
    module. The tower gradients are averaged and applied with a Nesterov
    momentum optimizer under an exponentially decayed learning rate. Training
    runs until the input queues are exhausted; a checkpoint is written every
    1000 steps and once more when training finishes.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Step counter; incremented by opt.apply_gradients below.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 (FLAGS.batch_size * FLAGS.num_gpus))
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(learning_rate=INITIAL_LEARNING_RATE,
                                        global_step=global_step,
                                        decay_steps=decay_steps,
                                        decay_rate=LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        opt = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True,
                                         use_locking=True)

        # One module instance for all towers. A fresh snt.Module per tower
        # gets its own variables, in which case only the first tower's
        # variables would ever receive gradients.
        model = snt.Module(custom_build, name='simple_net')

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST
                        # model, sharing the variables across all towers.
                        tower_total_loss = tower_loss(scope, model=model)

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on
                        # this MNIST tower.
                        grads = opt.compute_gradients(tower_total_loss,
                                                      gate_gradients=2)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        if FLAGS.tb_logging:
            # Add histograms for gradients.
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients', grad))
            # Add a summary to track the learning rate.
            summaries.append(tf.summary.scalar('learning_rate', lr))

        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if FLAGS.tb_logging:
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), sharded=True)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # The op for initializing the variables. Local variables are included
        # because the epoch counter of the input producer is a local variable.
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have
        # GPU implementations.
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=gpu_options))
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')

        try:
            step = 0
            while not coord.should_stop():
                start_time = time.time()

                # Run one training step. The reported loss is the loss of
                # the last tower that was built.
                _, loss_value = sess.run([train_op, tower_total_loss])

                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                # Print an overview fairly often.
                if step % 100 == 0:
                    num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / FLAGS.num_gpus
                    format_str = (
                        '%s: step %d, epochs %d, loss = %.3f '
                        '(%.1f examples/sec; %.3f sec/batch)')
                    print(format_str % (datetime.now(), step,
                                        FLAGS.num_epochs,
                                        loss_value,
                                        examples_per_sec, sec_per_batch))
                if FLAGS.tb_logging:
                    if step % 10 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0:
                    saver.save(sess, checkpoint_path, global_step=step)

                step += 1
        except tf.errors.OutOfRangeError:
            # The input queues are exhausted: the requested epochs are done.
            print('Done training with %d GPUs, for %d epochs, %d steps.' % (
                FLAGS.num_gpus, FLAGS.num_epochs, step))
            # The last step is not known in advance, so the final weights
            # are saved here; evaluate() reads the newest checkpoint.
            saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # When done, ask the threads to stop and wait for them; release
            # the writer and the session even if the loop raised.
            coord.request_stop()
            coord.join(threads)
            summary_writer.close()
            sess.close()
523 |
524 |
def evaluate():
    """Evaluate the newest checkpoint on the MNIST test set.

    Builds the model in inference mode over the whole test set, restores the
    latest checkpoint found in FLAGS.train_dir and prints the fraction of
    test examples whose top-1 prediction matches the label. Prints a notice
    and returns when no checkpoint exists.
    """
    with tf.Graph().as_default():
        # Get images and labels for MNIST.
        mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
        # read_data_sets scales float32 pixels to [0, 1] by default, while
        # the training pipeline (read_and_decode) feeds [-0.5, 0.5]; shift
        # the test images so that both see the same input range.
        images = mnist.test.images - 0.5
        labels = mnist.test.labels

        # Build a Graph that computes the logits predictions from the
        # inference model. snt.Module wraps custom_build into a sonnet Module.
        module_with_build_args = snt.Module(custom_build, name='simple_net')
        test_model_outputs = module_with_build_args(images, is_training=False,
                                                    keep_prob=tf.constant(1.0))

        # One boolean per test example: is the label the top-1 prediction?
        top_k_op = tf.nn.in_top_k(predictions=test_model_outputs,
                                  targets=labels, k=1)

        # Create saver to restore the learned variables for eval.
        saver = tf.train.Saver()

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
            if not (ckpt and ckpt.model_checkpoint_path):
                print('No checkpoint file found')
                return

            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            correct = sess.run(top_k_op)

        # Precision is the fraction of correct predictions, not their count.
        precision = np.mean(correct)
        print('%s: precision = %.3f' % (datetime.now(), precision))
559 |
560 |
def main(argv=None):  # pylint: disable=unused-argument
    """Train the model, print the elapsed training time, then evaluate it."""
    start_time = time.time()
    train()
    duration = time.time() - start_time
    print('Total Duration (%.3f sec)' % duration)
    evaluate()


if __name__ == '__main__':
    # tf.app.run() parses the command-line flags and then calls main().
    tf.app.run()
571 |
--------------------------------------------------------------------------------
/multi_gpu.py:
--------------------------------------------------------------------------------
1 | # ref: https://raw.githubusercontent.com/kuza55/keras-extras/master/utils/multi_gpu.py @IgnorePep8
2 |
3 | from keras.layers.merge import concatenate
4 | from keras.layers.core import Lambda
5 | from keras.models import Model
6 | from keras import backend as K
7 |
8 | if K.backend() == 'tensorflow':
9 | import tensorflow as tf # @UnresolvedImport
10 | from tensorflow.python.client import device_lib
11 |
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    return [device.name
            for device in device_lib.list_local_devices()
            if device.device_type == 'GPU']
15 |
def make_parallel(model, gpu_count):
    """Wrap a Keras model so each batch is split across several GPUs.

    A copy of `model` (sharing its weights) is connected on every GPU, each
    fed one slice of the batch along axis 0; the per-GPU outputs are
    concatenated back along axis 0 on the CPU.

    Args:
        model: the Keras model to replicate.
        gpu_count: number of GPUs ('/gpu:0' .. '/gpu:<gpu_count - 1>').

    Returns:
        A Keras `Model` with the same inputs as `model` and the merged
        outputs.
    """
    def get_slice(data, idx, parts):
        """Return the `idx`-th of `parts` slices of `data` along axis 0."""
        shape = tf.shape(data)
        batch_size = shape[:1]
        step = batch_size // parts
        if idx == parts - 1:
            # The last slice also takes the remainder, so no sample is
            # dropped when the batch size is not a multiple of `parts`.
            size = batch_size - step * idx
        else:
            size = step
        size = tf.concat([size, shape[1:]], axis=0)
        # Offset only along the batch axis; every other axis starts at 0.
        start = tf.concat([step * idx, shape[1:] * 0], axis=0)
        return tf.slice(data, start, size)

    # One list of per-GPU tensors for every model output.
    outputs_all = [[] for _ in model.outputs]

    # Place a copy of the model on each GPU, each getting a slice of the batch
    for i in range(gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i):

                inputs = []
                # Slice each input into a piece for processing on this GPU
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_n = Lambda(get_slice, output_shape=input_shape,
                                     arguments={'idx': i,
                                                'parts': gpu_count})(x)
                    inputs.append(slice_n)

                outputs = model(inputs)

                if not isinstance(outputs, list):
                    outputs = [outputs]

                # Save all the outputs for merging back together later
                for collected, output in zip(outputs_all, outputs):
                    collected.append(output)

    # merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = [concatenate(inputs=outputs, axis=0)
                  for outputs in outputs_all]

        return Model(inputs=model.inputs, outputs=merged)
--------------------------------------------------------------------------------
/network.py:
--------------------------------------------------------------------------------
1 | """Class that represents the network to be evolved."""
2 | import random
3 | import logging
4 | from train import train_and_score
5 |
class Network():
    """Hold one candidate network's parameters and its measured accuracy.

    Currently only works for an MLP.
    """

    def __init__(self, nn_param_choices=None):
        """Start with no parameters chosen and an accuracy of zero.

        Args:
            nn_param_choices (dict): Maps each parameter name to the list of
                values it may take, e.g.:
                nb_neurons (list): [64, 128, 256]
                nb_layers (list): [1, 2, 3, 4]
                activation (list): ['relu', 'elu']
                optimizer (list): ['rmsprop', 'adam']
        """
        self.network = {}  # (dic): represents MLP network parameters
        self.nn_param_choices = nn_param_choices
        self.accuracy = 0.

    def create_random(self):
        """Pick a random value for every parameter in nn_param_choices."""
        choices = self.nn_param_choices
        for name in choices:
            self.network[name] = random.choice(choices[name])

    def create_set(self, network):
        """Replace the network parameters with the given dict.

        Args:
            network (dict): The network parameters

        """
        self.network = network

    def train(self, dataset):
        """Train the network and record the accuracy.

        Training is skipped when a non-zero accuracy is already recorded.

        Args:
            dataset (str): Name of dataset to use.

        """
        if self.accuracy != 0.:
            return
        self.accuracy = train_and_score(self.network, dataset)

    def print_network(self):
        """Log the network parameters and the accuracy as a percentage."""
        logging.info(self.network)
        percent = self.accuracy * 100
        logging.info("Network accuracy: %.2f%%" % percent)
54 |
--------------------------------------------------------------------------------
/older/mnist_multi_gpu_eval.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Evaluation for MNIST.
17 |
18 | Accuracy:
19 |
20 | Speed:
21 |
22 | Usage:
23 |
24 | """
25 | from __future__ import absolute_import
26 | from __future__ import division
27 | from __future__ import print_function
28 |
29 | from datetime import datetime
30 | import math
31 | import time
32 |
33 | import numpy as np
34 | import tensorflow as tf
35 |
36 | import model
37 |
38 | from tensorflow.examples.tutorials.mnist import input_data
39 |
# Command-line flags of the evaluation binary.
# NOTE(review): within this file only `data_dir` and `checkpoint_dir` are
# read; the remaining flags may be consumed by the imported `model` module --
# confirm before removing any of them.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('batch_size', 50,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('eval_dir', '/home/norman/MNIST_train',
                           """Directory where to write event logs.""")
tf.app.flags.DEFINE_string('eval_data', 'test',
                           """Either 'test' or 'train_eval'.""")
tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
                           """Path to the MNIST data directory.""")
tf.app.flags.DEFINE_string('checkpoint_dir', '/home/norman/MNIST_train',
                           """Directory where to read model checkpoints.""")
tf.app.flags.DEFINE_integer('eval_interval_secs', 5,
                            """How often to run the eval.""")
tf.app.flags.DEFINE_integer('num_examples', 10000,
                            """Number of examples to run.""")
tf.app.flags.DEFINE_boolean('run_once', False,
                            """Whether to run eval only once.""")
tf.app.flags.DEFINE_boolean('use_fp16', False,
                            """Train the model using fp16.""")
60 |
61 |
def eval_once(saver, top_k_op):
    """Restore the newest checkpoint, run the eval op once, print precision.

    Prints a notice and returns when FLAGS.checkpoint_dir holds no checkpoint.

    Args:
        saver: Saver used to restore the model variables.
        top_k_op: Top K op; evaluates to one boolean per example telling
            whether the prediction was correct.
    """
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not (ckpt and ckpt.model_checkpoint_path):
            print('No checkpoint file found')
            return

        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        correct = sess.run(top_k_op)

    # Precision is the fraction of correct predictions, not their count.
    precision = np.mean(correct)
    print('%s: precision = %.3f' % (datetime.now(), precision))
88 |
def evaluate():
    """Build the inference graph over the MNIST test set and evaluate it once."""
    with tf.Graph().as_default():
        # Test split of MNIST with integer (not one-hot) labels.
        test_set = input_data.read_data_sets(
            FLAGS.data_dir, one_hot=False).test

        # Logits of the inference model with dropout disabled.
        logits = model.inference(test_set.images, keep_prob=1.0)

        # True for every example whose label is the top-1 prediction.
        top_k_op = tf.nn.in_top_k(
            predictions=logits, targets=test_set.labels, k=1)

        # The saver restores the learned variables inside eval_once().
        eval_once(tf.train.Saver(), top_k_op)
108 |
def main(argv=None):  # pylint: disable=unused-argument
    """Entry point invoked by tf.app.run(); runs a single evaluation."""
    evaluate()


if __name__ == '__main__':
    # tf.app.run() parses the command-line flags and then calls main().
    tf.app.run()
115 |
--------------------------------------------------------------------------------
/older/mnist_multi_gpu_train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """A binary to train MNIST using multiple GPU's with synchronous updates.
16 |
17 | Accuracy:
18 | mnist_multi_gpu_train.py achieves ~xx% accuracy after 20K steps (xxx
19 | epochs of data) as judged by mnist_multi_gpu_batching_eval.py.
20 |
21 | Speed: With batch_size 50.
22 |
23 | System | Step Time (sec/batch) | Accuracy
24 | --------------------------------------------------------------------
25 | 1 GTX 1080 | 0.08-0.10 | ~xx% at 20K steps (x hours)
26 | 2 GTX 1080 | 0.08-0.10 | ~xx% at 20K steps (x hours)
27 |
28 | Usage:
29 | Please see the tutorial and website for how to download the MNIST
30 | data set, compile the program and train the model.
31 |
32 | """
33 |
34 | from __future__ import absolute_import
35 | from __future__ import division
36 | from __future__ import print_function
37 |
38 | from datetime import datetime
39 | import os.path
40 | import re
41 | import time
42 |
43 | import numpy as np
44 | from six.moves import xrange # pylint: disable=redefined-builtin
45 | import tensorflow as tf
46 |
47 | import model
48 |
49 |
50 |
# Command-line flags of the training binary. The directory defaults are
# absolute paths on the original author's machine; override them on the
# command line.
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_integer('batch_size', 1000,
                            """Number of images to process in a batch.""")
tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
                           """Path to the MNIST data directory.""")
tf.app.flags.DEFINE_boolean('use_fp16', False,
                            """Train the model using fp16.""")
tf.app.flags.DEFINE_string('train_dir', '/home/norman/MNIST_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 20000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_gpus', 2,
                            """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_boolean('tb_logging', False,
                            """Whether to log to Tensorboard.""")
70 |
def tower_loss(scope):
    """Build one MNIST tower and return its total loss.

    Args:
        scope: unique prefix string identifying the MNIST tower, e.g. 'tower_0'

    Returns:
        Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for MNIST, then build the inference graph.
    images, labels = model.inputs(FLAGS.batch_size)
    logits = model.inference(images, keep_prob=0.5)

    # model.loss() registers its terms in the 'losses' collection; its return
    # value is not used because the total is assembled per tower below.
    model.loss(logits, labels)

    # Only the losses created under this tower's scope.
    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    if not FLAGS.tb_logging:
        return total_loss

    # Scalar summary for every individual loss and for the total. The
    # 'tower_[0-9]/' prefix is removed so that all towers of a multi-GPU
    # session report under the same name on tensorboard.
    tower_prefix = '%s_[0-9]*/' % model.TOWER_NAME
    for loss_tensor in tower_losses + [total_loss]:
        loss_name = re.sub(tower_prefix, '', loss_tensor.op.name)
        tf.summary.scalar(loss_name, loss_tensor)

    return total_loss
107 |
108 |
def average_gradients(tower_grads):
    """Average each shared variable's gradient over all towers.

    This is the synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list is over towers; each inner list holds that tower's
            (gradient, variable) pairs, in the same variable order.
    Returns:
        List of pairs of (gradient, variable) where the gradient has been
        averaged across all towers.
    """
    averaged = []
    # zip(*...) regroups the pairs per variable:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_tower in zip(*tower_grads):
        # Give every gradient a leading 'tower' dimension, join them along
        # it and average over it.
        stacked = tf.concat(
            [tf.expand_dims(g, 0) for g, _ in per_tower], 0)
        mean_grad = tf.reduce_mean(stacked, 0)

        # The variables are shared across towers, so the first tower's
        # pointer to the Variable stands for all of them.
        shared_var = per_tower[0][1]
        averaged.append((mean_grad, shared_var))
    return averaged
145 |
def train():
    """Train MNIST for a number of steps.

    Builds one model tower per GPU (FLAGS.num_gpus) with shared variables,
    averages the tower gradients, applies them with Adam and runs
    FLAGS.max_steps training steps. Progress is printed every 50 steps and a
    checkpoint is written to FLAGS.train_dir every 1000 steps and on the last
    step.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):

        # Create a variable to count the number of train() calls. This equals
        # the number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Use AdamOptimizer.
        opt = tf.train.AdamOptimizer(model.INITIAL_LEARNING_RATE)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (model.TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the MNIST model.
                        # This function constructs the entire MNIST model but
                        # shares the variables across all towers.
                        loss = tower_loss(scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this
                        # MNIST tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add histograms for gradients.
        if (FLAGS.tb_logging):
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        train_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        if (FLAGS.tb_logging):
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be
        # set to True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        # NOTE(review): no Coordinator is used and neither the session nor the
        # summary writer is closed at the end -- confirm this is intended.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            # `loss` is the loss tensor of the last tower built above.
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            # Print throughput statistics every 50 steps.
            if step % 50 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = (
                    '%s: step %d, loss = %.4f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))
            # Write Tensorboard summaries every 5 steps when enabled.
            if (FLAGS.tb_logging):
                if step % 5 == 0:
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
252 |
253 |
def main(argv=None):  # pylint: disable=unused-argument
    """Run the multi-GPU MNIST training loop.

    Args:
        argv: command-line arguments handed over by the app runner; not used.
    """
    del argv  # Accepted only to satisfy the runner's calling convention.
    train()


if __name__ == '__main__':
    # Hand control to TensorFlow's app runner, which invokes main().
    tf.app.run()
260 |
--------------------------------------------------------------------------------
/older/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Norman Heckscher. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the 'License');
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Builds the MNIST network.
16 |
17 | Summary of available functions:
18 |
19 | # Fetch the next batch of input images and labels for training.
20 | # inputs() draws the batch from the MNIST training set.
21 | images, labels = inputs()
22 |
23 | # Compute inference on the model inputs to make a prediction.
24 | predictions = inference(inputs)
25 |
26 | # Compute the total loss of the prediction with respect to the labels.
27 | loss = loss(predictions, labels)
28 |
29 | # Create a graph to run one step of training with respect to the loss.
30 | train_op = train(loss, global_step)
31 | """
32 |
33 | from __future__ import absolute_import
34 | from __future__ import division
35 | from __future__ import print_function
36 |
37 | import re
38 |
39 | import tensorflow as tf
40 | from tensorflow.examples.tutorials.mnist import input_data
41 |
42 | FLAGS = tf.app.flags.FLAGS
43 | # tf.app.flags.DEFINE_string('data_dir', '/home/norman/MNIST_data',
44 | # """Path to the MNIST data directory.""")
45 |
46 | # Global constants describing the MNIST data set.
47 | IMAGE_SIZE = 28
48 | NUM_CLASSES = 10
49 | NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
50 | NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000
51 |
52 | # Constants describing the training process.
53 | INITIAL_LEARNING_RATE = 0.0001 # Initial learning rate.
54 |
55 | # If a model is trained with multiple GPUs, prefix all Op names with tower_name
56 | # to differentiate the operations. Note that this prefix is removed from the
57 | # names of the summaries when visualizing a model.
58 | TOWER_NAME = 'tower'
59 |
60 | mnist = input_data.read_data_sets('/home/norman/MNIST_data', one_hot=False)
61 |
def inputs(batch_size=50):
    """Fetch the next batch of MNIST training examples.

    The batch is drawn from the module-level ``mnist`` data set, which is
    loaded once when this module is imported.

    Args:
        batch_size: number of examples to request from the training split.

    Returns:
        images: mnist images
        labels: mnist labels

    """
    batch_images, batch_labels = mnist.train.next_batch(batch_size)
    return batch_images, batch_labels
73 |
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a CPU-hosted variable and optionally register an L2 penalty.

    The variable is initialized from a truncated normal distribution. When
    `wd` is given, `wd * l2_loss(var)` is added to the 'losses' collection so
    that it becomes part of the total loss.

    Args:
        name: name of the variable
        shape: list of ints
        stddev: standard deviation of a truncated Gaussian
        wd: multiplier for the L2Loss weight decay term. If None, no weight
            decay term is registered for this Variable.

    Returns:
        Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)

    # Guard clause: nothing to register when weight decay is disabled.
    if wd is None:
        return var

    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
99 |
100 |
def _variable_on_cpu(name, shape, initializer):
    """Create (or look up) a float32 variable pinned to the CPU.

    Args:
        name: name of the variable
        shape: list of ints
        initializer: initializer for Variable

    Returns:
        Variable Tensor
    """
    # tf.get_variable (rather than tf.Variable) lets the enclosing variable
    # scope decide whether an existing variable is reused.
    with tf.device('/cpu:0'):
        return tf.get_variable(name, shape,
                               initializer=initializer, dtype=tf.float32)
116 |
117 |
def _activation_summary(x):
    """Attach TensorBoard summaries describing the activations in `x`.

    Two summaries are created: a histogram of the activation values and a
    scalar holding the fraction of values that are zero (sparsity).

    Args:
        x: Tensor
    Returns:
        nothing
    """
    # Strip the 'tower_[0-9]/' prefix so that every tower of a multi-GPU
    # session reports under the same tag on tensorboard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    tag = re.sub(tower_prefix, '', x.op.name)

    tf.summary.histogram(tag + '/activations', x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar(tag + '/sparsity', sparsity)
135 |
136 |
def loss(logits, labels):
    """Build the total loss: mean cross entropy plus registered loss terms.

    The batch-mean cross entropy is appended to the 'losses' collection and
    the sum of everything in that collection is returned. The collection is
    read without a scope filter, so every term registered in the graph so far
    (e.g. weight decay terms) is included.

    Args:
        logits: Logits from inference().
        labels: Labels from MNIST or inputs(). 1-D tensor
            of shape [batch_size]

    Returns:
        Loss tensor of type float.
    """
    # Sparse cross entropy expects integer class ids.
    int_labels = tf.cast(labels, tf.int32)
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=int_labels, logits=logits, name='cross_entropy_per_example')
    mean_xent = tf.reduce_mean(per_example, name='cross_entropy')
    tf.add_to_collection('losses', mean_xent)

    # Total loss = cross entropy + all weight decay (L2) terms.
    all_terms = tf.get_collection('losses')
    return tf.add_n(all_terms, name='total_loss')
159 |
160 |
def inference(images, keep_prob=1.0):
    """Build the MNIST model.

    Layer order: conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 (fully connected, ReLU) -> dropout -> local4 (linear logits).

    Args:
        images: Images returned from MNIST or inputs(). They are reshaped
            here to [-1, 28, 28, 1].
        keep_prob: keep probability handed to the dropout applied to the
            local3 activations. The default of 1.0 disables dropout.

    Returns:
        Logits: the linear output of the last layer with 10 columns
        (no softmax is applied here).
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU training
    # runs. If we only ran this model on a single GPU, we could simplify this
    # function by replacing all instances of tf.get_variable()
    # with tf.Variable().

    # Reshape to use within a convolutional neural net.
    # Last dimension is for "features" - there is only one here, since images
    # are grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
    x_image = tf.reshape(images, [-1, 28, 28, 1])

    # conv1: 5x5 kernels, 1 -> 32 channels (weight decay factor is 0.0).
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 1, 32],
                                             stddev=5e-2,
                                             wd=0.0)
        biases = _variable_on_cpu('biases', [32], tf.constant_initializer(0.0))
        conv = tf.nn.conv2d(x_image, kernel, strides=[1, 1, 1, 1],
                            padding='SAME')
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1: stride-2 max pooling halves the spatial size (28 -> 14).
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2: 5x5 kernels, 32 -> 64 channels (weight decay factor is 0.0).
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 32, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, strides=[1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2 (the op used to be named 'norm1', clashing with the first LRN op)
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')

    # pool2: second stride-2 max pooling (14 -> 7).
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        # 7 * 7 * 64 = spatial size after two poolings times conv2 channels.
        reshape = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 1024],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [1024],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4 with dropout
    with tf.variable_scope('local4') as scope:
        # keep_prob = tf.placeholder(tf.float32, name="keep_prob")
        local4 = tf.nn.dropout(local3, keep_prob, name=scope.name)
        weights = _variable_with_weight_decay('weights', shape=[1024, 10],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [10], tf.constant_initializer(0.1))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
247 |
--------------------------------------------------------------------------------
/optimizer.py:
--------------------------------------------------------------------------------
1 | """
2 | Class that holds a genetic algorithm for evolving a network.
3 |
4 | Credit:
5 | A lot of this code was originally inspired by:
6 | http://lethain.com/genetic-algorithms-cool-name-damn-simple/
7 | """
8 | from functools import reduce
9 | from operator import add
10 | import random
11 | from network import Network
12 |
class Optimizer():
    """Class that implements genetic algorithm for MLP optimization."""

    def __init__(self, nn_param_choices, retain=0.4,
                 random_select=0.1, mutate_chance=0.2):
        """Create an optimizer.

        Args:
            nn_param_choices (dict): Possible network parameters; maps each
                parameter name to the list of values it may take
            retain (float): Percentage of population to retain after
                each generation
            random_select (float): Probability of a rejected network
                remaining in the population
            mutate_chance (float): Probability a network will be
                randomly mutated

        """
        self.mutate_chance = mutate_chance
        self.random_select = random_select
        self.retain = retain
        self.nn_param_choices = nn_param_choices

    def create_population(self, count):
        """Create a population of random networks.

        Args:
            count (int): Number of networks to generate, aka the
                size of the population

        Returns:
            (list): Population of network objects

        """
        pop = []
        for _ in range(0, count):
            # Create a random network.
            network = Network(self.nn_param_choices)
            network.create_random()

            # Add the network to our population.
            pop.append(network)

        return pop

    @staticmethod
    def fitness(network):
        """Return the accuracy, which is our fitness function."""
        return network.accuracy

    def grade(self, pop):
        """Find average fitness for a population.

        Args:
            pop (list): The population of networks

        Returns:
            (float): The average accuracy of the population, or 0.0 when
                the population is empty

        """
        # An empty population has no meaningful average; report 0.0 instead
        # of failing on the reduction / division by zero.
        if not pop:
            return 0.0

        summed = sum(self.fitness(network) for network in pop)
        return summed / float(len(pop))

    def breed(self, mother, father):
        """Make two children as parts of their parents.

        Each child parameter is picked at random from one of the two parents.

        Args:
            mother (Network): First parent; its `network` dict is read
            father (Network): Second parent; its `network` dict is read

        Returns:
            (list): Two network objects

        """
        children = []
        for _ in range(2):

            child = {}

            # Loop through the parameters and pick params for the kid.
            for param in self.nn_param_choices:
                child[param] = random.choice(
                    [mother.network[param], father.network[param]]
                )

            # Now create a network object.
            network = Network(self.nn_param_choices)
            network.create_set(child)

            children.append(network)

        return children

    def mutate(self, network):
        """Randomly mutate one part of the network.

        The network is modified in place: one randomly chosen parameter is
        replaced with a random value from its list of choices.

        Args:
            network (Network): The network whose parameters to mutate

        Returns:
            (Network): The same network object, after mutation

        """
        # Choose a random key.
        mutation = random.choice(list(self.nn_param_choices.keys()))

        # Mutate one of the params.
        network.network[mutation] = random.choice(self.nn_param_choices[mutation])

        return network

    def evolve(self, pop):
        """Evolve a population of networks.

        The fittest `retain` fraction survives, each rejected network
        survives with probability `random_select`, survivors are mutated in
        place with probability `mutate_chance`, and the remaining slots are
        filled with children bred from the survivors.

        Args:
            pop (list): The current population of network objects

        Returns:
            (list): The evolved population of networks

        """
        # Sort the networks on their scores, best first.
        graded = sorted(pop, key=self.fitness, reverse=True)

        # Get the number we want to keep for the next gen.
        retain_length = int(len(graded) * self.retain)

        # The parents are every network we want to keep.
        parents = graded[:retain_length]

        # For those we aren't keeping, randomly keep some anyway.
        for individual in graded[retain_length:]:
            if self.random_select > random.random():
                parents.append(individual)

        # A tiny population (or a small `retain`) can leave no survivors at
        # all, which previously made the parent selection below fail. Keep
        # the best network so breeding always has something to work with.
        if not parents and graded:
            parents.append(graded[0])

        # Randomly mutate some of the networks we're keeping (in place).
        for individual in parents:
            if self.mutate_chance > random.random():
                self.mutate(individual)

        # Now find out how many spots we have left to fill.
        parents_length = len(parents)
        desired_length = len(pop) - parents_length
        children = []

        # Add children, which are bred from two remaining networks.
        while len(children) < desired_length:

            if parents_length > 1:
                # Get a random mom and dad.
                male = random.randint(0, parents_length - 1)
                female = random.randint(0, parents_length - 1)

                # They must be two different networks; draw again otherwise.
                if male == female:
                    continue
            else:
                # Only one survivor: requiring two distinct parents would
                # never terminate, so the survivor breeds with itself.
                male = female = 0

            # Breed them.
            babies = self.breed(parents[male], parents[female])

            # Add the children one at a time.
            for baby in babies:
                # Don't grow larger than desired length.
                if len(children) < desired_length:
                    children.append(baby)

        parents.extend(children)

        return parents
184 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility used by the Network class to actually train.
3 |
4 | Based on:
5 | https://github.com/fchollet/keras/blob/master/examples/mnist_mlp.py
6 |
7 | """
8 | from keras.datasets import mnist, cifar10
9 | from keras.models import Sequential
10 | from keras.layers import Dense, Dropout
11 | from keras.utils.np_utils import to_categorical
12 | from keras.callbacks import EarlyStopping
13 |
14 | # Helper: Early stopping.
15 | early_stopper = EarlyStopping(patience=5)
16 |
def get_cifar10():
    """Load CIFAR-10 and prepare it for a dense (MLP) network.

    Images are flattened to 3072-element float32 rows and divided by 255;
    labels are converted to one-hot (binary class) matrices.

    Returns:
        tuple: (nb_classes, batch_size, input_shape,
                x_train, x_test, y_train, y_test)

    """
    # Fixed settings for this dataset.
    nb_classes, batch_size, input_shape = 10, 64, (3072,)

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    # Flatten every image into one row and switch to float32.
    x_train = x_train.reshape(50000, 3072).astype('float32')
    x_test = x_test.reshape(10000, 3072).astype('float32')

    # Rescale in place (presumably from 0-255 pixel values to 0-1).
    x_train /= 255
    x_test /= 255

    # One-hot encode the class vectors.
    y_train = to_categorical(y_train, nb_classes)
    y_test = to_categorical(y_test, nb_classes)

    return (nb_classes, batch_size, input_shape,
            x_train, x_test, y_train, y_test)
38 |
def get_mnist():
    """Load MNIST and prepare it for a dense (MLP) network.

    Images are flattened to 784-element float32 rows and divided by 255;
    labels are converted to one-hot (binary class) matrices.

    Returns:
        tuple: (nb_classes, batch_size, input_shape,
                x_train, x_test, y_train, y_test)

    """
    # Fixed settings for this dataset.
    nb_classes, batch_size, input_shape = 10, 128, (784,)

    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # Flatten every image into one row and switch to float32.
    x_train = x_train.reshape(60000, 784).astype('float32')
    x_test = x_test.reshape(10000, 784).astype('float32')

    # Rescale in place (presumably from 0-255 pixel values to 0-1).
    x_train /= 255
    x_test /= 255

    # One-hot encode the class vectors.
    y_train = to_categorical(y_train, nb_classes)
    y_test = to_categorical(y_test, nb_classes)

    return (nb_classes, batch_size, input_shape,
            x_train, x_test, y_train, y_test)
60 |
def compile_model(network, nb_classes, input_shape):
    """Build and compile a sequential dense model from network parameters.

    Args:
        network (dict): the parameters of the network; the keys
            'nb_layers', 'nb_neurons', 'activation' and 'optimizer' are read
        nb_classes (int): number of units in the softmax output layer
        input_shape (tuple): input shape given to the first hidden layer

    Returns:
        a compiled network.

    """
    depth = network['nb_layers']
    width = network['nb_neurons']
    activation = network['activation']
    optimizer = network['optimizer']

    model = Sequential()

    for layer_idx in range(depth):
        # Only the first layer needs to be told the input shape.
        extra = {'input_shape': input_shape} if layer_idx == 0 else {}
        model.add(Dense(width, activation=activation, **extra))

        # Every hidden layer is followed by a hard-coded dropout.
        model.add(Dropout(0.2))

    # Output layer.
    model.add(Dense(nb_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])

    return model
97 |
def train_and_score(network, dataset):
    """Train the model, return test accuracy.

    Args:
        network (dict): the parameters of the network
        dataset (str): Dataset to use for training/evaluating; either
            'cifar10' or 'mnist'

    Returns:
        The accuracy of the trained model on the test split.

    Raises:
        ValueError: if `dataset` is not one of the supported names.

    """
    # Pick the loader first so that an unsupported name fails right away
    # with a clear message instead of an unbound-variable error later on.
    if dataset == 'cifar10':
        load_data = get_cifar10
    elif dataset == 'mnist':
        load_data = get_mnist
    else:
        raise ValueError(
            "Unknown dataset %r; expected 'cifar10' or 'mnist'" % (dataset,))

    nb_classes, batch_size, input_shape, x_train, \
        x_test, y_train, y_test = load_data()

    model = compile_model(network, nb_classes, input_shape)

    # NOTE: the test split doubles as validation data, so early stopping and
    # the reported score are based on the same examples.
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=10000,  # using early stopping, so no real limit
              verbose=0,
              validation_data=(x_test, y_test),
              callbacks=[early_stopper])

    score = model.evaluate(x_test, y_test, verbose=0)

    return score[1]  # 1 is accuracy. 0 is loss.
125 |
--------------------------------------------------------------------------------