├── text
    └── 1_overview
    │   ├── images
    │       ├── figures.pptx
    │       ├── releases.png
    │       ├── basic-arch.png
    │       └── platform-mode.png
    │   ├── 1.4_conclusion.md
    │   ├── 1.0_overview.md
    │   ├── 1.3_architecture.md
    │   ├── 1.2_objectives.md
    │   └── 1.1_introduction.md
├── code
    ├── 4_data_io
    │   ├── 4.3_flags_demo.py
    │   ├── 4.3_demo.py
    │   ├── 4.3_flags_mnist.py
    │   ├── 4.3_new_demo.py
    │   ├── 4.1_reader.py
    │   ├── 4.1_writer.py
    │   └── 4.1_best_practice.py
    ├── 5_control_flow_analysis
    │   ├── 5.2_trainer.py
    │   ├── 5.1_best_practice.py
    │   └── 5.2_best_practice.py
    ├── 6_tensorboard
    │   ├── 6.2_best_practice.py
    │   ├── 6.4_best_practice.py
    │   ├── 6.3_mnist_softmax_scalar.py
    │   ├── 6.3_mnist_softmax_histogram.py
    │   └── 6.3_best_practice.py
    ├── 9_cnn_models
    │   ├── 9.2_data_factory.py
    │   ├── 9.2_nets_factory.py
    │   ├── 9.2_alexnet.py
    │   ├── 9.2_train_image_classifier.py
    │   └── 9.2_model_deploy.py
    ├── 3_basic_concepts
    │   └── 3.6_best_practice.py
    ├── 11_rnn_models
    │   ├── 11.2_reader.py
    │   └── 11.2_ptb_word_lm.py
    ├── 7_tf_serving
    │   └── 7.4_mnist_saved_model.py
    └── 10_gan_models
    │   └── 10.3_model.py
├── preface.md
├── recommendations.md
├── README.md
└── contents.md


/text/1_overview/images/figures.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DjangoPeng/tensorflow-in-depth/HEAD/text/1_overview/images/figures.pptx


--------------------------------------------------------------------------------
/text/1_overview/images/releases.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DjangoPeng/tensorflow-in-depth/HEAD/text/1_overview/images/releases.png


--------------------------------------------------------------------------------
/text/1_overview/images/basic-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DjangoPeng/tensorflow-in-depth/HEAD/text/1_overview/images/basic-arch.png


--------------------------------------------------------------------------------
/text/1_overview/images/platform-mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DjangoPeng/tensorflow-in-depth/HEAD/text/1_overview/images/platform-mode.png


--------------------------------------------------------------------------------
/code/4_data_io/4.3_flags_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import tensorflow as tf
 3 | 
 4 | flags = tf.app.flags
 5 | flags.DEFINE_string("data_dir", "/tmp/mnist-data",
 6 |                     "Direcotry for storing mnist data")
 7 | 
 8 | FLAGS = flags.FLAGS
 9 | def main(_):
10 |     print(FLAGS.data_dir)
11 | 
12 | if __name__ == "__main__":
13 |     tf.app.run()
14 | 


--------------------------------------------------------------------------------
/text/1_overview/1.4_conclusion.md:
--------------------------------------------------------------------------------
1 | ## 1.4 小结
2 | 
3 | TensorFlow作为一套优秀的深度学习计算库，在新一波人工智能浪潮中脱颖而出。它源于Google公司内部基于海量数据开展感知和预测类应用的需求，并通过围棋大战向公众一展雄姿。依托Google团队雄厚的科研实力，同时借助开源社区的集体智慧，TensorFlow已经成为了一套运算性能强劲、功能灵活多样、语言接口丰富，并支持生产环境部署和端云协同计算的通用人工智能基础平台软件。TensorFlow的架构设计灵活而开放，有助于适应多样的应用场景并吸引第三方开发者贡献特性。我们相信，在当今人工智能理论进步与应用落地并举的时代，TensorFlow势必会引领相关研究与工程领域的高速发展。
4 | 
5 | #
6 | 
7 | **Prev：**[1.3 TensorFlow的基本架构](1.3_architecture.md)


--------------------------------------------------------------------------------
/code/4_data_io/4.3_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import argparse 
 3 | parser = argparse.ArgumentParser(prog='demo', description='A demo program', epilog='The end of usage')
 4 | 
 5 | parser.add_argument('name')
 6 | parser.add_argument('-a', '--age', type=int, required=True)
 7 | parser.add_argument('-s', '--status', choices=['alpha', 'beta', 'released'], type=str, dest='myStatus')
 8 | 
 9 | args = parser.parse_args() # 将名字空间赋值给args
10 | print(args) # 输出名字空间


--------------------------------------------------------------------------------
/text/1_overview/1.0_overview.md:
--------------------------------------------------------------------------------
1 | # 第1章 TensorFlow系统概述
2 | 
3 | 人工智能和深度学习的热潮将TensorFlow推向了至高的地位，媒体的追捧和业界的宣传也为这一源自Google的开源软件增添了传奇的色彩。对于技术从业者或爱好者而言，我们初识TensorFlow时有必要拨开表象看本质。本章作为引子，首先从技术视角概括性地介绍TensorFlow的产生背景、独特价值、版本变迁，以及它与其他主流深度学习框架的异同。同时，本章从灵活通用性、异构支持性和性能高效性三个视角解析TensorFlow的设计目标，展示TensorFlow作为一款兼具深度学习库、人工智能引擎和基础平台软件身份的开源产品的优势所在。最后，我们将简单介绍TensorFlow的工作形态和组件结构，帮助读者快速建立对TensorFlow软件架构的第一印象。
4 | 
5 | #
6 | 
7 | **Prev：**[前言](preface.md)
8 | 
9 | **Next：**[1.1 TensorFlow简介](1.1_introduction.md)


--------------------------------------------------------------------------------
/code/4_data_io/4.3_flags_mnist.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from tensorflow.examples.tutorials.mnist import input_data
 3 | import tensorflow as tf
 4 | 
 5 | flags = tf.app.flags
 6 | flags.DEFINE_string("data_dir", "/tmp/mnist-data",
 7 |                     "Directory for storing mnist data")
 8 | FLAGS = flags.FLAGS
 9 | def main(_):
10 |   # 导入数据
11 |   mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
12 |   # ...省略中间步骤...
13 | 
14 | if __name__ == "__main__":
15 |   tf.app.run()
16 | 


--------------------------------------------------------------------------------
/code/4_data_io/4.3_new_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import argparse
 3 | parser = argparse.ArgumentParser(prog='demo')
 4 | parser.add_argument('name')
 5 | parser.add_argument('-a', '--age', type=int, required=True)
 6 | parser.add_argument('-s', '--status', choices=['alpha', 'beta', 'released'], type=str, dest='myStatus')
 7 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')
 8 | 
 9 | args, unparsed = parser.parse_known_args() # 将解析器中未定义的参数返回给unparsed
10 | print('args=%s, unparsed=%s' % (args, unparsed))


--------------------------------------------------------------------------------
/code/4_data_io/4.1_reader.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import tensorflow as tf
 3 | # 创建文件名队列filename_queue
 4 | filename_queue = tf.train.string_input_producer(['stat.tfrecord'])
 5 | # 创建读取TFRecords文件的reader
 6 | reader = tf.TFRecordReader()
 7 | # 取出stat.tfrecord文件中的一条序列化的样例serialized_example
 8 | _, serialized_example = reader.read(filename_queue)
 9 | # 将一条序列化的样例转换为其包含的所有特征张量
10 | features = tf.parse_single_example(
11 |         serialized_example,
12 |         features={
13 |             'id': tf.FixedLenFeature([], tf.int64),
14 |             'age': tf.FixedLenFeature([], tf.int64),
15 |             'income': tf.FixedLenFeature([], tf.float32),
16 |             'outgo': tf.FixedLenFeature([], tf.float32),
17 |         }
18 | )


--------------------------------------------------------------------------------
/code/4_data_io/4.1_writer.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import tensorflow as tf
 3 | # 创建向TFRecords文件写数据记录的writer
 4 | writer = tf.python_io.TFRecordWriter('stat.tfrecord')
 5 | # 2轮循环构造输入样例
 6 | for i in range(1,3):
 7 |   # 创建example.proto中定义的样例
 8 |   example = tf.train.Example(
 9 |       features = tf.train.Features(
10 |           feature = {
11 |             'id': tf.train.Feature(int64_list =
12 |                 tf.train.Int64List(value=[i])),
13 |             'age': tf.train.Feature(int64_list =
14 |                 tf.train.Int64List(value=[i*24])),
15 |             'income': tf.train.Feature(float_list =
16 |                 tf.train.FloatList(value=[i*2048.0])),
17 |             'outgo': tf.train.Feature(float_list =
18 |                 tf.train.FloatList(value=[i*1024.0]))
19 |           }
20 |       )
21 |   )
22 |   # 将样例序列化为字符串后，写入stat.tfrecord文件
23 |   writer.write(example.SerializeToString())
24 | # 关闭输出流
25 | writer.close()


--------------------------------------------------------------------------------
/code/5_control_flow_analysis/5.2_trainer.py:
--------------------------------------------------------------------------------
 1 | """trainer.py"""
 2 | # -*- coding: utf-8 -*-
 3 | from tensorflow import flags
 4 | import tensorflow as tf
 5 | 
 6 | # 定义TensorFlow集群参数
 7 | flags.DEFINE_integer("task_index", None,
 8 |                      "Worker task index, should be >= 0. task_index=0 is "
 9 |                      "the master worker task the performs the variable "
10 |                      "initialization.")
11 | flags.DEFINE_string("ps_hosts", None,
12 |                     "Comma-separated list of hostname:port pairs")
13 | flags.DEFINE_string("worker_hosts", None,
14 |                     "Comma-separated list of hostname:port pairs")
15 | flags.DEFINE_string("job_name", None, "job name: worker or PS")
16 | def main(unused_argv):
17 |   # 解析集群参数ps_hosts和worker_hosts
18 |   PS_spec = FLAGS.ps_hosts.split(",")
19 |   worker_spec = FLAGS.worker_hosts.split(",")
20 |   # 定义TensorFlow集群
21 |   cluster = tf.train.ClusterSpec({
22 |       "PS": PS_spec,
23 |       "worker": worker_spec})
24 | 
25 |   server = tf.train.Server(
26 |       cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
27 |   # 启动PS，开始监听各worker的请求
28 |   if FLAGS.job_name == "PS":
29 |     server.join()
30 |   # 将任务编号为0的worker设置为chief worker
31 |   is_chief = (FLAGS.task_index == 0)


--------------------------------------------------------------------------------
/code/6_tensorboard/6.2_best_practice.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | from __future__ import absolute_import
 3 | from __future__ import division
 4 | from __future__ import print_function
 5 | 
 6 | import tensorflow as tf
 7 | 
 8 | from tensorflow.examples.tutorials.mnist import input_data
 9 | 
mnist = input_data.read_data_sets('/tmp/data/mnist', one_hot=True)

# Placeholders for the input images and their labels.
with tf.name_scope('input'):
  x = tf.placeholder(tf.float32, shape=[None, 784], name='x-input')
  y_ = tf.placeholder(tf.float32, shape=[None, 10], name='y-input')

# Softmax layer; each part lives in its own name scope.
with tf.name_scope('softmax_layer'):
  with tf.name_scope('weights'):
    weights = tf.Variable(tf.zeros([784, 10]))
  with tf.name_scope('biases'):
    biases = tf.Variable(tf.zeros([10]))
  with tf.name_scope('Wx_plus_b'):
    y = tf.matmul(x, weights) + biases

# Loss: mean softmax cross entropy between labels and logits.
with tf.name_scope('cross_entropy'):
  diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
  with tf.name_scope('total'):
    cross_entropy = tf.reduce_mean(diff)

with tf.name_scope('train'):
  train_step = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)

with tf.name_scope('accuracy'):
  with tf.name_scope('correct_prediction'):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
  with tf.name_scope('accuracy'):
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Write the graph to the summary directory, initialize variables, then close
# the writer.
sess = tf.InteractiveSession()
writer = tf.summary.FileWriter('/tmp/summary/mnist', graph=sess.graph)
tf.global_variables_initializer().run()

writer.close()
44 | 


--------------------------------------------------------------------------------
/code/5_control_flow_analysis/5.1_best_practice.py:
--------------------------------------------------------------------------------
 1 | """5.1_best_practice.py""" 
 2 | # -*- coding: utf-8 -*-
 3 | from tensorflow.examples.tutorials.mnist import input_data
 4 | import tensorflow as tf
 5 | 
 6 | flags = tf.app.flags
 7 | flags.DEFINE_string("data_dir", "/tmp/mnist-data",
 8 |                     "Directory for storing mnist data")
 9 | flags.DEFINE_float("learning_rate", 0.5, "Learning rate")
10 | FLAGS = flags.FLAGS
11 | 
12 | def main(_):
13 |   # 创建MNIST数据集实例
14 |   mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
15 |   # 创建模型
16 |   x = tf.placeholder(tf.float32, [None, 784])	# 图像数据
17 |   W = tf.Variable(tf.zeros([784, 10]))			  # 模型权重
18 |   b = tf.Variable(tf.zeros([10]))				      # 模型偏置
19 |   y = tf.matmul(x, W) + b						          # 推理操作
20 |   y_ = tf.placeholder(tf.float32, [None, 10]) # 图像标签
21 |   # 使用交叉熵作为损失值
22 |   cross_entropy = tf.reduce_mean(
23 |       tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
24 |   # 创建梯度下降优化器
25 |   optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
26 |   # 定义单步训练操作
27 |   train_op = optimizer.minimize(cross_entropy)
28 |   # 创建Saver
29 |   saver = tf.train.Saver()
30 |   sess = tf.InteractiveSession()
31 |   tf.global_variables_initializer().run()
32 |   # 最大训练步数
33 |   for i in range(1000): 
34 |     batch_xs, batch_ys = mnist.train.next_batch(100)
35 |     sess.run(train_op, feed_dict={x: batch_xs, y_: batch_ys})
36 |      # 每100步保存一次模型参数
37 |     if i % 100 == 0:
38 |       saver.save(sess, 'mnist.ckpt')
39 |   correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
40 |   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
41 |   print('acc=%s' % sess.run(accuracy, 
42 |                             feed_dict={x: mnist.test.images,
43 |                                        y_: mnist.test.labels}))
44 | 
45 | if __name__ == '__main__':
46 |   tf.app.run()


--------------------------------------------------------------------------------
/code/9_cnn_models/9.2_data_factory.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """A factory-pattern class which returns classification image/label pairs."""
16 | 
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | 
21 | from datasets import cifar10
22 | from datasets import flowers
23 | from datasets import imagenet
24 | from datasets import mnist
# Dataset map: the four datasets below are currently supported. Each one
# corresponds to its own module, and each module provides a get_split method.
datasets_map = {
    'cifar10': cifar10,
    'flowers': flowers,
    'imagenet': imagenet,
    'mnist': mnist,
}
32 | 
33 | 
def get_dataset(name, split_name, dataset_dir, file_pattern=None, reader=None):
  """Look up a dataset module by name and return the requested split.

  Args:
    name: String, the name of the dataset; must be a key of `datasets_map`.
    split_name: A train/test split name.
    dataset_dir: The directory where the dataset files are stored.
    file_pattern: The file pattern to use for matching the dataset source files.
    reader: The subclass of tf.ReaderBase. If left as `None`, then the default
      reader defined by each dataset is used.

  Returns:
    Whatever the selected module's `get_split` returns (a `Dataset`).

  Raises:
    ValueError: If the dataset `name` is unknown.
  """
  if name in datasets_map:
    dataset_module = datasets_map[name]
    return dataset_module.get_split(
        split_name, dataset_dir, file_pattern, reader)
  raise ValueError('Name of dataset unknown %s' % name)


--------------------------------------------------------------------------------
/code/4_data_io/4.1_best_practice.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
# The class label occupies 1 byte.
LABEL_BYTES = 1
# Image side length is 32 (used for both height and width below).
IMAGE_SIZE = 32
# Images have 3 channels (RGB).
IMAGE_DEPTH = 3
# Image data is 32 x 32 x 3 = 3072 bytes.
IMAGE_BYTES = IMAGE_SIZE * IMAGE_SIZE * IMAGE_DEPTH
# 10 class labels.
NUM_CLASSES = 10
12 | 
13 | import tensorflow as tf
14 | 
def read_cifar10(data_file, batch_size, num_threads=16):
  """Read batches of examples from CIFAR-10 data files.

  Args:
    data_file: CIFAR-10 data file; passed to tf.gfile.Glob to build the list
      of file names.
    batch_size: Number of examples per batch.
    num_threads: Number of copies of the enqueue op handed to the queue
      runner. Defaults to 16, the value that was previously hard-coded.

  Returns:
    images: Image batch shaped [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3].
    labels: Label batch shaped [batch_size, NUM_CLASSES].
  """
  # One record is 1 + 3072 = 3073 bytes.
  record_bytes = LABEL_BYTES + IMAGE_BYTES
  # List of file names.
  data_files = tf.gfile.Glob(data_file)
  # Queue of file names.
  file_queue = tf.train.string_input_producer(data_files, shuffle=True)
  # Reader for fixed-length binary records; reads one record of
  # record_bytes at a time from the file-name queue.
  reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
  _, value = reader.read(file_queue)
  # Split the record into the class label and the image.
  record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
  label = tf.cast(tf.slice(record, [0], [LABEL_BYTES]), tf.int32)
  # Reshape the [depth * height * width] bytes into a [depth, height, width]
  # image tensor.
  depth_major = tf.reshape(tf.slice(record, [LABEL_BYTES], [IMAGE_BYTES]),
                           [IMAGE_DEPTH, IMAGE_SIZE, IMAGE_SIZE])
  # Reorder the dimensions from [depth, height, width] to
  # [height, width, depth].
  image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
  # Example queue.
  example_queue = tf.RandomShuffleQueue(
      capacity=16 * batch_size,
      min_after_dequeue=8 * batch_size,
      dtypes=[tf.float32, tf.int32],
      shapes=[[IMAGE_SIZE, IMAGE_SIZE, IMAGE_DEPTH], [1]])
  # Enqueue op for the example queue.
  example_enqueue_op = example_queue.enqueue([image, label])
  # Register num_threads copies of the enqueue op with a queue runner.
  tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
      example_queue, [example_enqueue_op] * num_threads))

  # Dequeue a batch of images and labels from the example queue.
  images, labels = example_queue.dequeue_many(batch_size)
  labels = tf.reshape(labels, [batch_size, 1])
  indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
  # Pair each row index with its label and expand to a dense
  # [batch_size, NUM_CLASSES] tensor holding 1.0 at those positions.
  labels = tf.sparse_to_dense(
      tf.concat(values=[indices, labels], axis=1),
      [batch_size, NUM_CLASSES], 1.0, 0.0)

  # Show the structure of images and labels.
  assert len(images.get_shape()) == 4
  assert images.get_shape()[0] == batch_size
  assert images.get_shape()[-1] == 3
  assert len(labels.get_shape()) == 2
  assert labels.get_shape()[0] == batch_size
  assert labels.get_shape()[1] == NUM_CLASSES

  return images, labels


--------------------------------------------------------------------------------
/text/1_overview/1.3_architecture.md:
--------------------------------------------------------------------------------
 1 | ## 1.3 TensorFlow的基本架构
 2 | 
 3 | 在展开介绍TensorFlow的使用方法和设计原理之前，我们有必要建立对TensorFlow基本架构的直观认识。本节从工作形态和组件结构两个角度对TensorFlow进行概要性的说明。读者可以以此为切入点，逐步理顺学习TensorFlow的脉络。
 4 | 
 5 | ### 1.3.1 TensorFlow的工作形态
 6 | 
 7 | 基础平台层软件的设计模式多种多样，它们对应用层开发者体现出的工作形态也有所差别。在众多平台设计模式中，存在两类基础而典型的模式，即图1-2所示的库模式和框架模式。在库模式下，平台层软件以静态或动态的开发库（如.a、.so文件）形式存在，应用层开发者需要编写程序调用这些库提供的函数，实现计算逻辑。程序的入口（如main函数）及整体流程控制权把握在应用层开发者手中。在框架模式下，平台层软件以可执行文件形式存在，并以前端交互式程序或后端守护进程方式独立运行。应用层开发者需要遵从平台规定的接口约束，开发包含计算逻辑在内的子程序，交由框架性质的平台层软件调度执行。程序的入口及整体流程控制权由框架把握。
 8 | 
 9 | ![platform-mode](images/platform-mode.png)
10 | 
11 | 图1-2  平台层软件的典型设计模式
12 | 
13 | 在高性能与大数据计算领域，典型的库模式软件有用于计算的Eigen、NumPy，以及用于通信的MPI、ZeroMQ等。基于这些库开发应用时，编程方式比较灵活，部署模式也相对轻量。应用开发者具有较大的自由度，但不得不编写业务逻辑之外的不少“脚手架”代码，以便将算法代码片段转变为完整可用的软件。典型的框架模式软件有大数据计算平台Hadoop、Spark，以及基于SQL和类SQL语言的数据库、数据仓库等。使用这些框架开发应用时，开发者的工作相对轻松，只需要编写与业务逻辑密切相关的算法代码，不用关心运行时机制的复杂性。不过，程序的灵活性将受制于框架的约束。
14 | 
15 | TensorFlow的设计采用了库模式。之所以如此，是出于灵活通用、端云结合及高性能等设计目标的考虑。库模式的平台层软件便于与各种既有的框架协同工作，不对软件的运行时组件添加新的约束，应用范围也不受制约。除了依赖最基本的编程语言库和操作系统调用，这类平台层软件同其他环境因素解耦，从而可以做到高度的可移植性。在单机和终端等场景下，由于没有守护进程和调度框架的开销，有效计算逻辑的资源利用率也会提高，进而有助于性能优化。
16 | 
17 | 综上，TensorFlow的工作形态是由用户编写主程序代码，调用Python或其他语言函数库提供的接口以实现计算逻辑。用户部署和使用TensorFlow系统时，不需要启动专门的守护进程，也不需要调用特殊的启动工具，只需要像编写普通的本地应用程序那样即可上手。用户也不用担心库模式的开发所必需的那些“脚手架”代码，因为TensorFlow已经提供了多种高级抽象，尽可能地最小化了核心计算逻辑之外的开发工作。
18 | 
19 | ### 1.3.2 TensorFlow的组件结构
20 | 
21 | TensorFlow作为一套包含数十万行代码的大型软件，其组件结构较为复杂。不过，由于其代码组织合理，文档资料充分，我们很容易将它的软件结构进行不同抽象程度的宏观呈现。初识TensorFlow的新手只需要从最高层的抽象视角观察其组件构成。图1-3给出了一幅粗粒度的TensorFlow组件结构示意图，展示了TensorFlow的主要内部结构及其与周边环境的关系。
22 | 
23 | ![basic-arch](images/basic-arch.png)
24 | 
25 | 图1-3  TensorFlow的组件结构示意图
26 | 
27 | 构成TensorFlow的主体是其运行时核心库。对于普通的Python应用层开发者而言，这个核心库就是指通过pip命令等方式安装TensorFlow之后，部署到site-packages或类似目录中的动态链接库文件。生成这个库的C++源代码大致分为3个层次：分布式运行时、公共运行时和算子核函数。其中，公共运行时实现了数据流图计算的基本逻辑，分布式运行时在此基础上实现了数据流图的跨进程协同计算逻辑，算子核函数则包含图上具体操作节点的算法实现代码。
28 | 
29 | TensorFlow运行时核心库导出的函数接口基于C和C++语言。为了使用其他语言进行应用开发，TensorFlow提供了多语言的API层。Python应用层开发者在代码中调用“`import tensorflow as tf`”时，导入的便是TensorFlow安装在Python第三方库目录下的API层模块（本书后文沿用这种Python包导入惯例，使用“tf”作为“tensorflow”命名空间的缩写）。API层对用户屏蔽了TensorFlow核心库的动态链接逻辑，使得用户可以使用自己熟悉的语言编写算法模型。
30 | 
31 | 为了简化经典模型的开发，使得TensorFlow成为一套“开箱即用”的工具，Google官方团队及开源贡献者们在TensorFlow社区开设了若干算法模型库及人工智能应用程序项目。用户可以复用这些项目的成果，加快自己的项目开发进度；也可以学习它们的实现原理，提升自己的模型与应用设计水平。这些外围项目中的部分代码（如Keras）已被认为具有较高的共性价值，因此逐步被加入到TensorFlow主项目之中。
32 | 
33 | TensorFlow运行时核心库底层对接的是各种计算库和通信库。这些库有的是外部组件（如用于CPU代数计算的Eigen库），有的则作为TensorFlow源代码的一部分集成在核心库内部（如用于GPU并行计算的StreamExecutor库）。用户在开发应用程序时看不到这些库的细节，只需要按照软件文档安装好必要的外部依赖包即可。
34 | 
35 | 上面所有组件均运行在本地操作系统和硬件基础设施之上。在服务器端运行场景，最常见的宿主操作系统是Linux，硬件一般为x86 CPU和NVIDIA GPU。在移动终端运行场景，宿主操作系统可以是Android、iOS等，硬件一般为ARM CPU和专用的人工智能芯片。TensorFlow不仅支持原生的物理环境，它对虚拟机和容器也完全兼容，这构成了云计算环境下的最佳实践。
36 | 
37 | #
38 | 
39 | **Prev：**[1.2 TensorFlow的设计目标](1.2_objectives.md)
40 | 
41 | **Next：**[1.4 小结](1.4_conclusion.md)
42 | 
43 | 


--------------------------------------------------------------------------------
/code/6_tensorboard/6.4_best_practice.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import argparse
 3 | import sys
 4 | import os
 5 | 
 6 | import numpy as np
 7 | import tensorflow as tf
 8 | 
 9 | from tensorflow.contrib.tensorboard.plugins import projector
10 | from tensorflow.examples.tutorials.mnist import input_data
11 | 
12 | FLAGS = None
13 | 
def main(_):
    """Export MNIST test images as an embedding for the TensorBoard projector.

    Recreates FLAGS.log_dir, stores the first FLAGS.max_nums test images in a
    non-trainable variable named 'embedding', saves it to a checkpoint, writes
    one label per line to metadata.tsv and writes the projector configuration.

    The single positional argument is supplied by tf.app.run() and is unused.
    """
    # Create the log directory, removing any previous contents.
    if tf.gfile.Exists(FLAGS.log_dir):
        tf.gfile.DeleteRecursively(FLAGS.log_dir)
    tf.gfile.MakeDirs(FLAGS.log_dir)

    # Read the MNIST data set.
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      one_hot=True,
                                      fake_data=FLAGS.fake_data)

    # Embedding variable holding the test-set images. The row count follows
    # FLAGS.max_nums (default 10000) so that it always matches the number of
    # label rows written to the metadata file below.
    embedding_var = tf.Variable(tf.stack(mnist.test.images[:FLAGS.max_nums]),
                                trainable=False, name='embedding')
    # Create an interactive session and initialize the global variables.
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()

    # Create the saver and save the embedding variable.
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(FLAGS.log_dir, 'model.ckpt'))

    # Column index of the non-zero entry in each one-hot label row, i.e. the
    # digit class. Computed once instead of once per written line.
    class_ids = np.nonzero(mnist.test.labels)[1][:FLAGS.max_nums]

    # Create the metadata file and write one label per line.
    metadata_file = os.path.join(FLAGS.log_dir, 'metadata.tsv')
    with open(metadata_file, 'w') as f:
        for c in class_ids:
            f.write('{}\n'.format(c))

    # Create the FileWriter.
    writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

    # Create the projector configuration.
    config = projector.ProjectorConfig()
    embeddings = config.embeddings.add()
    embeddings.tensor_name = 'embedding:0'
    embeddings.metadata_path = metadata_file

    # Set the sprite image path and the size of a single digit image.
    embeddings.sprite.image_path = '/tmp/summary/images/mnist_10k_sprite.png'
    embeddings.sprite.single_image_dim.extend([28, 28])

    # Write the configuration to a newly created projector config file;
    # TensorBoard loads that file automatically on start-up.
    projector.visualize_embeddings(writer, config)
    # Flush and close the event file that holds the graph.
    writer.close()
59 | 
60 | 
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # NOTE(review): type=bool turns any non-empty string into True, so
    # "--fake_data False" still enables fake data; only omitting the flag
    # yields False.
    parser.add_argument('--fake_data', nargs='?', const=True, type=bool,
                        default=False,
                        help='If true, uses fake data for unit testing.')
    # main() uses max_nums as the number of labels written to metadata.tsv;
    # the help text previously described it as a trainer step count.
    parser.add_argument('--max_nums', type=int, default=10000,
                        help='Number of test-set labels to write to the metadata file.')
    parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    parser.add_argument('--log_dir', type=str, default='/tmp/summary/embeddings',
                        help='Summaries log directory')
    # Known arguments populate FLAGS; the rest are forwarded to tf.app.run().
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
74 | 


--------------------------------------------------------------------------------
/code/6_tensorboard/6.3_mnist_softmax_scalar.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | from __future__ import absolute_import
 3 | from __future__ import division
 4 | from __future__ import print_function
 5 | 
 6 | import tensorflow as tf
 7 | 
 8 | from tensorflow.examples.tutorials.mnist import input_data
 9 | 
mnist = input_data.read_data_sets('/tmp/data/mnist', one_hot=True)

# Placeholders for the input images and their labels.
with tf.name_scope('input'):
  x = tf.placeholder(tf.float32, shape=[None, 784], name='x-input')
  y_ = tf.placeholder(tf.float32, shape=[None, 10], name='y-input')

with tf.name_scope('softmax_layer'):
  weights = tf.Variable(tf.zeros([784, 10]))
  biases = tf.Variable(tf.zeros([10]))
  y = tf.matmul(x, weights) + biases

# Loss, recorded as a scalar summary.
with tf.name_scope('cross_entropy'):
  diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
  with tf.name_scope('total'):
    cross_entropy = tf.reduce_mean(diff)
    tf.summary.scalar('cross_entropy', cross_entropy)

with tf.name_scope('train'):
  train_step = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)

# Accuracy, recorded as a scalar summary.
with tf.name_scope('accuracy'):
  with tf.name_scope('correct_prediction'):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
  with tf.name_scope('accuracy'):
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

merged = tf.summary.merge_all()
sess = tf.InteractiveSession()

# Separate writers for train and test; only the train writer gets the graph.
summary_root = '/tmp/summary/mnist1'
train_writer = tf.summary.FileWriter(summary_root + '/train', sess.graph)
test_writer = tf.summary.FileWriter(summary_root + '/test')
tf.global_variables_initializer().run()
44 | 
def feed_dict(train):
  """Build the feed dict mapping the x and y_ placeholders to data.

  A truthy `train` feeds the next training batch of 100 examples; otherwise
  the full test images and labels are fed.
  """
  if not train:
    return {x: mnist.test.images, y_: mnist.test.labels}
  batch_xs, batch_ys = mnist.train.next_batch(100, fake_data=False)
  return {x: batch_xs, y_: batch_ys}
52 | 
for step in range(1000):
  if step % 10 == 0:
    # Every 10th step: record summaries and test-set accuracy, no training.
    summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
    test_writer.add_summary(summary, step)
    print('Accuracy at step %s: %s' % (step, acc))
  elif step % 100 == 99:
    # Train one step and also record execution stats.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    summary, _ = sess.run([merged, train_step],
                          feed_dict=feed_dict(True),
                          options=run_options,
                          run_metadata=run_metadata)
    train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
    train_writer.add_summary(summary, step)
    print('Adding run metadata for', step)
  else:
    # Train one step and record the train-set summary.
    summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
    train_writer.add_summary(summary, step)
train_writer.close()
test_writer.close()
74 | 


--------------------------------------------------------------------------------
/code/6_tensorboard/6.3_mnist_softmax_histogram.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | from __future__ import absolute_import
 3 | from __future__ import division
 4 | from __future__ import print_function
 5 | 
 6 | import tensorflow as tf
 7 | 
 8 | from tensorflow.examples.tutorials.mnist import input_data
 9 | 
mnist = input_data.read_data_sets('/tmp/data/mnist', one_hot=True)

# Placeholders for the input images and their labels.
with tf.name_scope('input'):
  x = tf.placeholder(tf.float32, shape=[None, 784], name='x-input')
  y_ = tf.placeholder(tf.float32, shape=[None, 10], name='y-input')

# Softmax layer; the weights are recorded as a histogram summary.
with tf.name_scope('softmax_layer'):
  with tf.name_scope('weights'):
    weights = tf.Variable(tf.zeros([784, 10]))
    tf.summary.histogram('weights', weights)
  with tf.name_scope('biases'):
    biases = tf.Variable(tf.zeros([10]))
  with tf.name_scope('Wx_plus_b'):
    y = tf.matmul(x, weights) + biases

# Loss, recorded as a scalar summary.
with tf.name_scope('cross_entropy'):
  diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
  with tf.name_scope('total'):
    cross_entropy = tf.reduce_mean(diff)
    tf.summary.scalar('cross_entropy', cross_entropy)

with tf.name_scope('train'):
  train_step = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)

# Accuracy, recorded as a scalar summary.
with tf.name_scope('accuracy'):
  with tf.name_scope('correct_prediction'):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
  with tf.name_scope('accuracy'):
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

merged = tf.summary.merge_all()
sess = tf.InteractiveSession()

# Separate writers for train and test; only the train writer gets the graph.
summary_root = '/tmp/summary/mnist/histogram'
train_writer = tf.summary.FileWriter(summary_root + '/train', sess.graph)
test_writer = tf.summary.FileWriter(summary_root + '/test')
tf.global_variables_initializer().run()
48 | 
def feed_dict(train):
  """Build the feed dict mapping the x and y_ placeholders to data.

  A truthy `train` feeds the next training batch of 100 examples; otherwise
  the full test images and labels are fed.
  """
  if not train:
    return {x: mnist.test.images, y_: mnist.test.labels}
  batch_xs, batch_ys = mnist.train.next_batch(100, fake_data=False)
  return {x: batch_xs, y_: batch_ys}
56 | 
for step in range(1000):
  if step % 10 == 0:
    # Every 10th step: record summaries and test-set accuracy, no training.
    summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
    test_writer.add_summary(summary, step)
    print('Accuracy at step %s: %s' % (step, acc))
  elif step % 100 == 99:
    # Train one step and also record execution stats.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    summary, _ = sess.run([merged, train_step],
                          feed_dict=feed_dict(True),
                          options=run_options,
                          run_metadata=run_metadata)
    train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
    train_writer.add_summary(summary, step)
    print('Adding run metadata for', step)
  else:
    # Train one step and record the train-set summary.
    summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
    train_writer.add_summary(summary, step)
train_writer.close()
test_writer.close()
78 | 


--------------------------------------------------------------------------------
/code/6_tensorboard/6.3_best_practice.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | import tensorflow as tf
 3 | 
 4 | from tensorflow.examples.tutorials.mnist import input_data
 5 | 
 6 | mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data', one_hot=True)
 7 | 
 8 | with tf.name_scope('input'):
 9 |   x = tf.placeholder(tf.float32, [None, 784], name='x-input')
10 |   y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
11 | 
12 | with tf.name_scope('input_reshape'):
13 |   # Reshape the input images x into a rank-4 tensor
14 |   image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
15 |   # Add an image summary of the handwritten digits, emitting at most 10 images
16 |   tf.summary.image('input', image_shaped_input, 10)
17 | 
18 | with tf.name_scope('softmax_layer'):
19 |   with tf.name_scope('weights'):
20 |     weights = tf.Variable(tf.zeros([784, 10]))
21 |     # Add a histogram summary of the model weights
22 |     tf.summary.histogram('weights', weights)
23 |   with tf.name_scope('biases'):
24 |     biases = tf.Variable(tf.zeros([10]))
25 |     # Add a histogram summary of the model biases
26 |     tf.summary.histogram('biases', biases)
27 |   with tf.name_scope('Wx_plus_b'):
28 |     y = tf.matmul(x, weights) + biases
29 |   
30 | with tf.name_scope('cross_entropy'):
31 |   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
32 |   with tf.name_scope('total'):
33 |     cross_entropy = tf.reduce_mean(diff)
34 |     # Add a scalar summary of the cross entropy
35 |     tf.summary.scalar('cross_entropy', cross_entropy)
36 | 
37 | with tf.name_scope('train'):
38 |   train_step = tf.train.AdamOptimizer(0.001).minimize(
39 |       cross_entropy)
40 | 
41 | with tf.name_scope('accuracy'):
42 |   with tf.name_scope('correct_prediction'):
43 |     correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
44 |   with tf.name_scope('accuracy'):
45 |     accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
46 |     # Add a scalar summary of the accuracy
47 |     tf.summary.scalar('accuracy', accuracy)
48 | 
49 | merged = tf.summary.merge_all()  # one op covering every summary registered above
50 | sess = tf.InteractiveSession()
51 | 
52 | train_writer = tf.summary.FileWriter('/tmp/summary/mnist' + '/train', sess.graph)  # only this writer is given the graph
53 | test_writer = tf.summary.FileWriter('/tmp/summary/mnist' + '/test')
54 | tf.global_variables_initializer().run()
55 | 
56 | def feed_dict(train):
57 |   """填充训练数据或测试数据的方法"""
58 |   if train:
59 |     xs, ys = mnist.train.next_batch(100, fake_data=False)
60 |   else:
61 |     xs, ys = mnist.test.images, mnist.test.labels
62 |   return {x: xs, y_: ys}
63 | 
64 | for i in range(1000):
65 |   if i % 10 == 0:  # 写汇总数据和测试集的准确率
66 |     summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
67 |     test_writer.add_summary(summary, i)
68 |     print('Accuracy at step %s: %s' % (i, acc))
69 |   else:  # Record train set summaries, and train
70 |     if i % 100 == 99:  # 写运行时的事件数据
71 |       run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
72 |       run_metadata = tf.RunMetadata()
73 |       summary, _ = sess.run([merged, train_step],
74 |                             feed_dict=feed_dict(True),
75 |                             options=run_options,
76 |                             run_metadata=run_metadata)
77 |       train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
78 |       train_writer.add_summary(summary, i)
79 |       print('Adding run metadata for', i)
80 |     else:  # 写汇总数据
81 |       summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
82 |       train_writer.add_summary(summary, i)
83 | 
84 | train_writer.close()
85 | test_writer.close()
86 | 


--------------------------------------------------------------------------------
/preface.md:
--------------------------------------------------------------------------------
 1 | # 前言
 2 | 
 3 | ## 缘起
 4 | 
 5 | 2016年的某个中午，我在知乎上回答了名为"如何高效的学习 TensorFlow 代码?"的问题。简单介绍了我在TensorFlow开源社区的贡献，以及TensorFlow的学习路线和方法。此回答引起了一些圈内人的共鸣。人民邮电出版社图灵社区也在第一时间找到我，希望能够让我写一本TensorFlow相关的书籍。于是，便有了您手中阅读的这本书。
 6 | 
 7 | ## 为什么写本书
 8 | 
 9 | 在ImageNet的带动下，深度学习的研究热潮已席卷全球。随着AlphaGo的横空出世，资本市场对人工智能的产业化也表现出了空前的兴趣。2017年7月《国务院关于印发新一代人工智能发展规划的通知》的出台，标志着国家层面对人工智能发展的高度重视，明确了我国新一代人工智能发展的战略目标。未来10年，我们将见证人工智能全面升级改造传统行业。在这场深刻变革中，TensorFlow将迸发出巨大的能量。
10 | 
11 | TensorFlow推出短短一个月，就成为了机器学习和深度学习项目中最受欢迎的开源框架。究其原因，离不开Google在人工智能与数据处理领域的深厚积淀及其在业界的强大号召力。TensorFlow自2015年11月开源以来，已经发布了30多个版本。尽管TensorFlow整个生态系统是开源的，但由于它版本升级过快，且算子种类众多，大部分公司，尤其是一些中小型或创业公司，难以在有限的时间内快速掌握TensorFlow的设计思想和基本原理。TensorFlow开源的PS-Worker分布式架构也在快速迭代，与其内部基于Borg调度的分布式架构也并不相同。自TensorFlow开源以来，不断有人撰写博客解释其各组件的实现与原理。但遗憾的是，能够深入剖析TensorFlow内部实现细节与设计思想的书籍少之又少，而本书则尝试弥补这一缺憾。
12 | 
13 | 本书以TensorFlow 1.2为基础，从基本概念、内部实现和最佳实践等方面深入剖析了TensorFlow。本书重点介绍了以数据流图为核心的机器学习编程框架的设计原则与核心实现，并且还介绍了TensorFlow生态系统中的两大重要工具：TensorBoard可视化工具与TensorFlow Serving模型托管工具。同时，本书还将TensorFlow与深度学习相结合，从理论基础和程序实现两个方面系统介绍了卷积神经网络（CNN）、生成对抗网络（GAN）和循环神经网络（RNN）等经典模型。本书不仅按照由浅入深的学习顺序全面介绍了TensorFlow的使用方法，而且结合源代码进行了深入剖析，使读者可以快速、系统地学习TensorFlow的架构设计与实现原理。
14 | 
15 | ## 读者对象
16 | 
17 | （1）TensorFlow二次开发人员
18 | 
19 | 由于在高效性、多平台、多语言、稳定性等方面的诸多优点，TensorFlow已被国内外越来越多的公司采用并部署到生产环境。而为了解决特定场景下的特定问题，大部分公司选择在开源TensorFlow的基础上进行二次开发。对于这部分TensorFlow二次开发人员，深入而又全面地了解TensorFlow的设计原则和实现细节是修改TensorFlow内核的前提，而本书可以帮助这部分读者快速而又全面地了解TensorFlow实现原理。
20 | 
21 | （2）数据科学家和算法工程师
22 | 
23 | 如果要使用TensorFlow解决生产和生活中的实际问题，仅掌握TensorFlow基本使用方法是远远不够的，必须对TensorFlow的设计理念、架构和运作机制有一定的了解。尤其是对于分布式训练任务，更需要深入了解TensorFlow分布式的架构设计与多种并行模式的实现原理。对这部分读者来说，本书将带领他们走入TensorFlow架构师的内心世界，使其系统、深入地理解TensorFlow和数据流图，提高开发水平，从而编写出更加高效的深度学习和机器学习模型。
24 | 
25 | （3）人工智能方向研究生
26 | 
27 | 对于一名人工智能专业的研究生来说，除了对人工智能理论有极高的造诣外，还应当熟练掌握一门算法模型编程框架，才能将研究课题中的问题快速落实到实际的代码上来。而TensorFlow便是当下最受欢迎的机器学习和深度学习框架。通过阅读本书，人工智能方向研究生可以全面提升复现论文实验结果和开发全新模型的效率，并深入理解TensorFlow的设计思想和实现细节。
28 | 
29 | （4）开源软件爱好者
30 | 
31 | TensorFlow是全世界最受欢迎的开源机器学习和深度学习框架，它在设计和实现的过程中参考了Google第一代分布式机器学习框架DistBelief的实践经验，同时又加入了很多值得学习的创新。值得一提的是，本书分析TensorFlow架构设计和实现原理的方式也许值得许多开源软件爱好者进行学习和借鉴，这部分读者不仅能够领略到开源软件的优秀设计，还可以掌握分析开源软件源代码的方法和技巧，从而进一步提高使用开源软件的效率和质量。
32 | 
33 | ## 如何阅读本书
34 | 
35 | 本书分为五大部分（不包括附录）：
36 | 
37 | 第一部分为基础篇（第1～3章），简单地介绍了TensorFlow设计目标、基本架构、环境准备和基础概念，包括数据流图的设计与使用，以及TensorFlow运行环境和训练机制，帮助读者快速入门TensorFlow，迅速上手使用。
38 | 
39 | 第二部分为关键模块篇（第4～7章），着重讲解了使用TensorFlow端到端解决人工智能问题涉及的关键模块，包括数据处理、编程框架、可视化工具和模型托管工具，帮助读者进一步提升开发效率，快速落地模型应用。
40 | 
41 | 第三部分为算法模型篇（第8～11章），在熟练掌握TensorFlow后，该部分将深度学习与TensorFlow有机结合，系统介绍了深度学习的发展历史与应用场景，并结合理论与代码实现深入讲解了CNN、GAN和RNN等经典模型。
42 | 
43 | 第四部分为核心揭秘篇（第12～14章），深入剖析了TensorFlow运行时核心、通信原理和数据流图计算的原理与实现，聚焦C++核心层的揭秘，帮助读者进一步理解TensorFlow底层的设计思想与实现细节，TensorFlow二次开发人员需重点关注该部分内容。
44 | 
45 | 第五部分为生态发展篇（第15章），全面介绍了TensorFlow生态系统发展，并重点介绍了Keras深度学习算法库，以及TensorFlow与云原生社区Kubernetes生态的结合、与大数据社区Spark生态的结合，并介绍了TensorFlow通信优化技术、TPU及NNVM模块化深度学习技术，帮助读者进一步全面地了解深度学习生态发展现状。
46 | 
47 | ## 勘误和支持
48 | 
49 | 感谢您在茫茫书海中选择了我们的作品。尽管我们在写作过程中力求以精确的语言传达正确的信息，然而出于专业水平与表达能力限制，书中难免仍有不周之处。加之深度学习相关理论与技术演进飞速，书中部分内容在呈现于读者之时或许已不合时宜。我们诚挚地希望您能够指正本书存在的问题，并对我们提出宝贵的意见和建议。
50 | 
51 | 读者可以通过本书在图灵社区的页面（<http://www.ituring.com.cn/book/2397>）提交或查看勘误。本书的样章与部分示例代码也已托管在 GitHub 项目（<https://github.com/DjangoPeng/tensorflow-in-depth>）中，欢迎读者访问并提出意见与建议。
52 | 
53 | ## 致谢
54 | 
55 | 感谢我的合作者林健博士，他在计算机系统方面学识渊博，使我在合作著书的过程中不断进步。同时，林健博士对本书的审稿和校稿工作也作出了重要贡献。
56 | 
57 | 感谢我的另一位合作者白小龙博士，他对深度学习理论和落地的丰富经验使我受益匪浅。
58 | 
59 | 感谢人民邮电出版社图灵社区的王军花编辑在这一年多的时间中始终支持我们的写作，她的鼓励和帮助使我们顺利完成了本书。
60 | 
61 | 感谢丁泽震、辛现银、张震宇、李鹏飞、尉建等朋友在写书和校稿过程中给我们提供的修改建议与各种帮助。
62 | 
63 | 谨以此书献给我们最亲爱的家人，以及众多热爱TensorFlow和深度学习的朋友们！
64 | 
65 | 彭靖田
66 | 
67 | 于杭州
68 | 
69 | #
70 | 
71 | **Prev：**[推荐语](recommendations.md)
72 | 
73 | **Next：**[第一章 TensorFlow系统概述](text/1_overview/1.0_overview.md)


--------------------------------------------------------------------------------
/code/3_basic_concepts/3.6_best_practice.py:
--------------------------------------------------------------------------------
 1 | # -*- coding=utf-8 -*-
 2 | import tensorflow as tf
 3 | import matplotlib.pyplot as plt
 4 | import numpy as np
 5 | 
 6 | # 打印日志的步长
 7 | log_step = 50
 8 | # ================ 1.定义超参数 ================
 9 | # 学习率
10 | learning_rate = 0.01
11 | # 最大训练步数
12 | max_train_steps = 1000
13 | # ================ 2.输入数据 ================
14 | # 构造训练数据
15 | train_X = np.array([[3.3],[4.4],[5.5],[6.71],[6.93],[4.168],[9.779],[6.182],[7.59],[2.167],[7.042],[10.791],[5.313],[7.997],[5.654],[9.27],[3.1]], dtype=np.float32)
16 | train_Y = np.array([[1.7],[2.76],[2.09],[3.19],[1.694],[1.573],[3.366],[2.596],[2.53],[1.221],[2.827],[3.465],[1.65],[2.904],[2.42],[2.94],[1.3]], dtype=np.float32)
17 | total_samples = train_X.shape[0]
18 | # ================ 3.构建模型 ================
19 | # 输入数据
20 | X = tf.placeholder(tf.float32, [None, 1])
21 | # 模型参数
22 | W = tf.Variable(tf.random_normal([1, 1]), name="weight")
23 | b = tf.Variable(tf.zeros([1]), name="bias")
24 | # 推理值
25 | Y = tf.matmul(X, W) + b
26 | # ================ 4.定义损失函数 ================
27 | # 实际值
28 | Y_ = tf.placeholder(tf.float32, [None, 1])
29 | # 均方差
30 | loss = tf.reduce_sum(tf.pow(Y-Y_, 2))/(2*total_samples)
31 | # ================ 5.创建优化器 ================
32 | # 随机梯度下降优化器
33 | optimizer = tf.train.GradientDescentOptimizer(learning_rate)
34 | # ================ 6.定义单步训练操作 ================
35 | # 最小化损失值
36 | train_op = optimizer.minimize(loss)
37 | # ================ 7.创建会话 ================
38 | with tf.Session() as sess:
39 |     # 初始化全局变量
40 |     sess.run(tf.global_variables_initializer()) 
41 | # ================ 8.迭代训练 ================
42 |     print("Start training:")
43 |     for step in xrange(max_train_steps):
44 |         sess.run(train_op, feed_dict={X: train_X, Y_: train_Y})
45 |         # 每隔log_step步打印一次日志
46 |         if step % log_step == 0:
47 |             c = sess.run(loss, feed_dict={X: train_X, Y_:train_Y})
48 |             print("Step:%d, loss==%.4f, W==%.4f, b==%.4f" % 
49 |                     (step, c, sess.run(W), sess.run(b)))
50 |     # 计算训练完毕的模型在训练集上的损失值，作为指标输出
51 |     final_loss = sess.run(loss, feed_dict={X: train_X, Y_: train_Y})
52 |     # 计算训练完毕的模型参数W和b
53 |     weight, bias = sess.run([W, b])
54 |     print("Step:%d, loss==%.4f, W==%.4f, b==%.4f" % 
55 |             (max_train_steps, final_loss, sess.run(W), sess.run(b)))
56 |     print("Linear Regression Model: Y==%.4f*X+%.4f" % (weight, bias))
57 | # ================ 模型可视化 ================
58 |     # 初始化Matplotlib后端
59 |     %matplotlib
60 |     # 根据训练数据X和Y，添加对应的红色圆点
61 |     plt.plot(train_X, train_Y, 'ro', label='Training data')
62 |     # 根据模型参数和训练数据，添加蓝色（缺省色）拟合直线
63 |     plt.plot(train_X, weight * train_X + bias, label='Fitted line')
64 |     # 添加图例说明
65 |     plt.legend()
66 |     # 画出上面定义的图案
67 |     plt.show()
68 | 
69 | '''
70 | 输出：
71 | Start training:
72 | Step:0, loss==2.8679, W==0.0054, b==0.0411
73 | Step:50, loss==0.1045, W==0.3457, b==0.1317
74 | Step:100, loss==0.1013, W==0.3402, b==0.1710
75 | Step:150, loss==0.0985, W==0.3350, b==0.2080
76 | Step:200, loss==0.0961, W==0.3301, b==0.2428
77 | Step:250, loss==0.0939, W==0.3254, b==0.2755
78 | Step:300, loss==0.0919, W==0.3211, b==0.3064
79 | Step:350, loss==0.0902, W==0.3170, b==0.3354
80 | Step:400, loss==0.0887, W==0.3131, b==0.3627
81 | Step:450, loss==0.0874, W==0.3095, b==0.3884
82 | Step:500, loss==0.0862, W==0.3061, b==0.4126
83 | Step:550, loss==0.0851, W==0.3029, b==0.4353
84 | Step:600, loss==0.0842, W==0.2999, b==0.4567
85 | Step:650, loss==0.0833, W==0.2970, b==0.4769
86 | Step:700, loss==0.0826, W==0.2944, b==0.4959
87 | Step:750, loss==0.0820, W==0.2918, b==0.5137
88 | Step:800, loss==0.0814, W==0.2895, b==0.5305
89 | Step:850, loss==0.0809, W==0.2872, b==0.5463
90 | Step:900, loss==0.0804, W==0.2851, b==0.5612
91 | Step:950, loss==0.0800, W==0.2832, b==0.5752
92 | Step:1000, loss==0.0797, W==0.2814, b==0.5881
93 | Linear Regression Model: Y==0.2814*X+0.5881
94 | '''


--------------------------------------------------------------------------------
/recommendations.md:
--------------------------------------------------------------------------------
 1 | ## 推荐序1
 2 | 
 3 | 人工智能迎来了继上世纪九十年代以来的又一次大发展，人工智能和深度学习无疑是近年来最受追捧的热点。2016年和2017年AlphaGo两次战胜韩国和中国围棋国手，更是让人工智能成为街头巷尾的热谈。为什么人工智能在经历了两起两落后再一次迎来了复兴？我认为，这一次的爆发主要是由理论、应用、硬件和软件四个方面的原因促成的，貌似偶然实则必然。
 4 | 
 5 | Geoff Hinton等人在2006年发现了训练高层神经网络的有效算法，为深度学习的研究打开了新局面，也是人工智能得以重燃的导火线。经过后续研究人员的努力，尤其是CNN和RNN的出现，深度学习、神经网络方法在图像和语音识别方面显示出非常好的效果，大大超越之前的理论和方法，甚至能够突破人类极限。与计算机视觉、机器人、自然语言理解、信息检索等技术相结合，深度学习的应用也从单纯的图像和语音识别扩展到自动驾驶、图像增强与风格替换、文本语音间转换和推荐系统等。使用深度学习技术的应用和初创企业如雨后春笋般冒了出来。尤其需要指出的是，互联网和大数据的广泛应用是深度学习发展的必要条件。一般而言，数据量越大且数据质量越高，由深度学习训练出来的模型精度也就越好。计算机硬件的发展也直接推动了深度学习的发展。GPU和其他专用加速器件的出现，大大提高了深度学习的计算效率。在语音识别场景下，GPU可将数十亿样本的训练时间从数年缩短到数天。而专用的ASIC加速芯片相比GPU的能效更有数量级以上提升，也让深度学习从服务器端走向手机端，进一步拓展了其应用范围。
 6 | 
 7 | 开放的软件生态和易用的软件形态是形成人工智能和深度学习产业链至关重要的两个方面。没有软件的支撑，理论很难与应用相结合，新硬件也很难为应用提速。从大数据软件的发展历程可以想见，如果没有开源的Hadoop生态系统，以及受其设计思想影响的新型大规模并行处理数据库系统，我们现在可能还在为如何管理和处理PB乃至EB级的数据发愁。开源TensorFlow的出现解决了类似的问题，一下子拉近了深度学习理论与实际应用的距离。同时，TensorFlow也具备迈向成功生态系统的必要条件，即差异化的软件功能，刚需的典型应用和活跃的社区支持，发展前景可期。但是，原生TensorFlow的软件形态尚不足以支持深度学习的全流程生产化应用，欠缺诸如数据管理和预处理，模型训练、管理和运行，资源管理、任务调度和运行时监控等能力，导致最终用户形成生产力的成本过高。深度学习是计算密集型重资产类应用，如果有能够提供异构高性能计算资源并能够集成上述平台化功能的深度学习公有云服务，可降低TensorFlow的使用门槛并提升用户体验，客观上会与开源效应叠加起到倍速产业发展的作用。
 8 | 
 9 | 很高兴能够在这个时候看到一本讲授如何使用TensorFlow的专业书籍。作者是深谙计算机系统之道的一线工程师，带给读者的是产生自实战经验基础上的理解。非常难得的是，本书除了讲解如何使用TensorFlow还加入了对系统设计原理方面的剖析，有助于读者做针对性的应用和系统优化。相信本书对从事深度学习方面研究和开发的读者定会有所裨益。
10 | 
11 | 查礼
12 | 
13 | 中国科学院计算技术研究所 副研究员
14 | 
15 | 中国大数据技术大会（BDTC） 发起人
16 | 
17 | 2018年元旦于北京
18 | 
19 | 
20 | 
21 | ## 推荐序2
22 | 
23 | 如果说要评选出一个在过去一年中，学术界和产业界最热门的词汇，无疑非人工智能莫属。作为新一轮产业变革的核心驱动力，人工智能正重构生产、分配、交换、消费等经济活动各环节，形成从宏观到微观各领域的智能化新需求、新产品、新技术、新业态，引发经济结构重大演化，实现社会生产力的整体跃升。
24 | 
25 | 人工智能系统的智能有三种来源：一是依靠人类设计者的知识输入，为系统建立人工特征、知识库和推理机制，传统意义上的专家系统属于这一范畴。二是通过数据驱动的归纳式学习，近年来大火的深度学习即属于这一类，与依赖于人工经验、通过手工构建的知识特征不同，深度学习以端到端方式进行特征学习，其基本动机在于构建多层网络来学习隐含在数据内部的关系，从而使学习得到的特征具有更强、更泛化的表达能力。三是智能体通过与环境的交互，学习经验和知识并更新知识表示。近年来，以深度学习为代表的数据驱动方法在图像识别、语音识别、机器翻译、自然语言理解等任务中取得了一系列突破。人脸识别、自然语言理解在一系列国际评测中展示了超越人类能力的水平，语音识别和机器翻译也达到了一个前所未有的高度。
26 | 
27 | 在我看来，各行各业还会在相当长的一段时间内享受到基于大数据的深度学习红利。将深度学习红利释放到读者您所在的行业会是提升行业智能水平的一条捷径。而要做到这一点，核心关键在于要降低大数据深度学习技术的使用门槛。要使看起来高深的深度学习技术，早日达到“旧时王谢堂前燕，飞入寻常百姓家”的程度。而要做到这一点，正如IT业前面多次证明过的，开源软件生态社区会起到巨大的推动作用， TensorFlow则无疑正是在这方面的佼佼者。深度学习对于张量计算性能、算子灵活性、自动微分能力、分布式训练、自动调参数、可视化和端侧部署等都有很强的诉求，而TensorFlow 的设计也充分考虑到了这些因素，这使得它成为了当前业界很流行的一个深度学习引擎。
28 | 
29 | TensorFlow还是一个较新的技术，但是发展极为迅猛，在这时候出现一本深入浅出讲解TensorFlow理论与应用的书籍，对于广大希望学习和应用大数据深度学习技术的读者而言，诚“如大旱之望云霓”。本书理论与实践并重，理论上讲清楚了一些本质的东西，并加入了作者对系统设计原理方面的深刻理解，并通过实际案例，引导读者掌握针对性的系统优化的技能。
30 | 
31 | 本书第一作者是我的学生，12年入学时进入了浙大学习计算机专业的尖子班“求是科学班”，我担任了他们这个班的班主任。他不仅品学兼优，而且作为班上的团支书，帮我这个不太称职的班主任做了很多班级工作。在我心目中，他依然是入学时的青涩模样，转眼间却已成为开源软件界的技术翘楚，作为老师，最欣慰的莫过于此了吧。是为序。
32 | 
33 | 
34 | 
35 | 教育部“长江学者”特聘教授
36 | 
37 | 浙江大学计算机学院院长   陈刚
38 | 
39 | 
40 | 
41 | ## 联合推荐
42 | 
43 | TensorFlow是一个深度学习的基础框架，自2015年底开源以来，它被不断应用到各个领域当中，也逐渐孕育出了一个活跃的开源社区，《深入解析TensorFlow架构设计与实现原理》的作者就是这个社区的贡献者，他们对TensorFlow有深层次的理解。他们写的这本书从独特的角度剖析了TensorFlow和分布式TensorFlow的运行机理。并以TensorFlow 1.2为基础，用简单易懂的语言讲解了TensorFlow的从安装、模型编写、可视化一直到生产环境部署的方方面面，同时穿插介绍了深度学习的基本概念。该书是作者的呕心沥血之作，是目前最佳的一本TensorFlow中文书。
44 | 
45 | -- 周玥枫 Google Brain 资深工程师
46 | 
47 | 本书由浅入深，详细介绍了TensorFlow的编程方法与工作原理。本书在介绍深度学习与TensorFlow基本概念和用法的同时，深入分析了TensorFlow的系统架构与实现原理，是TensorFlow开源系统贡献者的重要参考资料。
48 | 
49 | -- 郑泽宇 才云科技顾问
50 | 
51 | 在深度学习技术进入商业化实用阶段之际，这本书通过技术概念和实践案例讲解，为广大AI技术爱好者深度了解和应用TensorFlow技术的本质内涵、技术框架和应用体系提供了重要参考，值得一读。
52 | 
53 | -- 夏命榛 华为人工智能领域主任工程师
54 | 
55 | 本书以TensorFlow为线索介绍深度学习的算法和系统，既包含算法的背景知识，又囊括系统的实现原理，并给出TensorFlow中的代码示例，是综合算法理论和系统原理并支持动手实践的佳作。
56 | 
57 | -- 邹永强 云账户联合创始人兼CTO
58 | 
59 | 当前人工智能的发展高度依赖数据、算法和计算能力三要素，在计算能力越来越强的今天，数据和算法成为人工智能发展的两个关键要素，而作为人工智能的核心算法深度学习对于技术工程师来说依然迷雾重重。TensorFlow是深度学习领域最重要的开源框架，基于TensorFlow的应用越来越广泛地应用到安防、电商、金融、医疗等领域，也正在逐步渗透到工业领域。本书从底层技术入手，深入浅出地讲解TensorFlow的原理、架构、核心算法和应用场景，并且展示了其强大的生态配套体系，是不可多得的TensorFlow学习教材。未来已来，让我们积极拥抱人工智能的未来。
60 | 
61 | -- 周公爽 博拉科技创始人兼CEO
62 | 
63 | 这是一本来自工业界技术专家的书，作者对深度学习框架和机器学习算法有多年的深入研究经验，对TensorFlow在业界的实战应用有丰富的经验和独到的见解。本书系统详尽地介绍了TensorFlow的主要模块及使用方法，同时介绍了CNN、GAN和RNN等深度学习算法模型和TensorFlow的内部核心模块。本书一气呵成，深入浅出，每章均配有总览流程图和详细的案例代码，特别适合作为工程师和研究者入门TensorFlow的第一本书。
64 | 
65 | -- 王锦鹏 微软亚洲研究院 助理研究员
66 | 
67 | 一本科技类读物通常难以在理论、源码、实践之间取得平衡：理论过多很难让读者学以致用并付诸实践，源码过多容易让读者望而却步且只见冰山一角，而过于聚焦实践案例则难以帮助读者举一反三，参透更高层的设计理念与哲学。这本读物却少有的由浅入深，既完整介绍了深度学习和TensorFlow的技术演化、生态全貌、设计理念，又及时地在一段理论陈述和数学原理之后通过源码层面的案例分享帮助读者将理论落地。为了帮助读者举一反三，知其然还知其所以然，本书不光传授了TensorFlow技术本身，还系统地介绍了为了理解和掌握TensorFlow所需的周边知识，用“自包含”的方式为读者提供了“一站式”的从入门到精通的指引。
68 | 
69 | -- 张鑫 才云科技创始人兼CEO
70 | 
71 | #
72 | 
73 | **Next：**[前言](preface.md)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | **Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
  4 | 
  5 | - [深入理解 TensorFlow 架构设计与实现原理](#%E6%B7%B1%E5%85%A5%E7%90%86%E8%A7%A3-tensorflow-%E6%9E%B6%E6%9E%84%E8%AE%BE%E8%AE%A1%E4%B8%8E%E5%AE%9E%E7%8E%B0%E5%8E%9F%E7%90%86)
  6 |   - [推荐语(节选)](#%E6%8E%A8%E8%8D%90%E8%AF%AD%E8%8A%82%E9%80%89)
  7 |   - [样例代码（已开源）](#%E6%A0%B7%E4%BE%8B%E4%BB%A3%E7%A0%81%E5%B7%B2%E5%BC%80%E6%BA%90)
  8 |   - [本书目录](#%E6%9C%AC%E4%B9%A6%E7%9B%AE%E5%BD%95)
  9 |     - [前言（开源）](#%E5%89%8D%E8%A8%80%E5%BC%80%E6%BA%90)
 10 |     - [第一部分 基础篇](#%E7%AC%AC%E4%B8%80%E9%83%A8%E5%88%86-%E5%9F%BA%E7%A1%80%E7%AF%87)
 11 |       - [第1章 TensorFlow系统概述（开源）](#%E7%AC%AC1%E7%AB%A0-tensorflow%E7%B3%BB%E7%BB%9F%E6%A6%82%E8%BF%B0%E5%BC%80%E6%BA%90)
 12 |       - [第2章 TensorFlow环境准备](#%E7%AC%AC2%E7%AB%A0-tensorflow%E7%8E%AF%E5%A2%83%E5%87%86%E5%A4%87)
 13 |       - [第3章 TensorFlow基础概念](#%E7%AC%AC3%E7%AB%A0-tensorflow%E5%9F%BA%E7%A1%80%E6%A6%82%E5%BF%B5)
 14 |     - [第二部分 关键模块篇](#%E7%AC%AC%E4%BA%8C%E9%83%A8%E5%88%86-%E5%85%B3%E9%94%AE%E6%A8%A1%E5%9D%97%E7%AF%87)
 15 |       - [第4章 TensorFlow数据处理方法](#%E7%AC%AC4%E7%AB%A0-tensorflow%E6%95%B0%E6%8D%AE%E5%A4%84%E7%90%86%E6%96%B9%E6%B3%95)
 16 |       - [第5章 TensorFlow编程框架](#%E7%AC%AC5%E7%AB%A0-tensorflow%E7%BC%96%E7%A8%8B%E6%A1%86%E6%9E%B6)
 17 |       - [第6章 TensorBoard可视化工具](#%E7%AC%AC6%E7%AB%A0-tensorboard%E5%8F%AF%E8%A7%86%E5%8C%96%E5%B7%A5%E5%85%B7)
 18 |       - [第7章 TensorFlow模型托管工具](#%E7%AC%AC7%E7%AB%A0-tensorflow%E6%A8%A1%E5%9E%8B%E6%89%98%E7%AE%A1%E5%B7%A5%E5%85%B7)
 19 |     - [第三部分 算法模型篇](#%E7%AC%AC%E4%B8%89%E9%83%A8%E5%88%86-%E7%AE%97%E6%B3%95%E6%A8%A1%E5%9E%8B%E7%AF%87)
 20 |       - [第8章 深度学习概述](#%E7%AC%AC8%E7%AB%A0-%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E6%A6%82%E8%BF%B0)
 21 |       - [第9章 卷积神经网络](#%E7%AC%AC9%E7%AB%A0-%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C)
 22 |       - [第10章 生成对抗网络](#%E7%AC%AC10%E7%AB%A0-%E7%94%9F%E6%88%90%E5%AF%B9%E6%8A%97%E7%BD%91%E7%BB%9C)
 23 |       - [第11章 循环神经网络](#%E7%AC%AC11%E7%AB%A0-%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C)
 24 |     - [第四部分 核心揭秘篇](#%E7%AC%AC%E5%9B%9B%E9%83%A8%E5%88%86-%E6%A0%B8%E5%BF%83%E6%8F%AD%E7%A7%98%E7%AF%87)
 25 |       - [第12章 TensorFlow运行时核心设计与实现](#%E7%AC%AC12%E7%AB%A0-tensorflow%E8%BF%90%E8%A1%8C%E6%97%B6%E6%A0%B8%E5%BF%83%E8%AE%BE%E8%AE%A1%E4%B8%8E%E5%AE%9E%E7%8E%B0)
 26 |       - [第13章 通信原理与实现](#%E7%AC%AC13%E7%AB%A0-%E9%80%9A%E4%BF%A1%E5%8E%9F%E7%90%86%E4%B8%8E%E5%AE%9E%E7%8E%B0)
 27 |       - [第14章 数据流图计算原理与实现](#%E7%AC%AC14%E7%AB%A0-%E6%95%B0%E6%8D%AE%E6%B5%81%E5%9B%BE%E8%AE%A1%E7%AE%97%E5%8E%9F%E7%90%86%E4%B8%8E%E5%AE%9E%E7%8E%B0)
 28 |     - [第五部分 生态发展篇](#%E7%AC%AC%E4%BA%94%E9%83%A8%E5%88%86-%E7%94%9F%E6%80%81%E5%8F%91%E5%B1%95%E7%AF%87)
 29 |       - [第15章 TensorFlow生态环境](#%E7%AC%AC15%E7%AB%A0-tensorflow%E7%94%9F%E6%80%81%E7%8E%AF%E5%A2%83)
 30 |   - [参考链接](#%E5%8F%82%E8%80%83%E9%93%BE%E6%8E%A5)
 31 | 
 32 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 33 | 
 34 | # 深入理解 TensorFlow 架构设计与实现原理
 35 | 
 36 | 此项目托管了《深入理解 TensorFlow 架构设计与实现原理》一书的样章与部分示例代码。
 37 | 
 38 | ## [推荐语(节选)](recommendations.md)
 39 | 
 40 | *"很高兴能够在这个时候看到一本讲授如何使用TensorFlow的专业书籍。作者是深谙计算机系统之道的一线工程师，带给读者的是产生自实战经验基础上的理解。非常难得的是，本书除了讲解如何使用TensorFlow还加入了对系统设计原理方面的剖析，有助于读者做针对性的应用和系统优化。相信本书对从事深度学习方面研究和开发的读者定会有所裨益。"*
 41 | 
 42 | —— 查礼 中国科学院计算技术研究所 副研究员 中国大数据技术大会（BDTC） 发起人
 43 | 
 44 | *“TensorFlow还是一个较新的技术，但是发展极为迅猛，在这时候出现一本深入浅出讲解TensorFlow理论与应用的书籍，对于广大希望学习和应用大数据深度学习技术的读者而言，诚“如大旱之望云霓”。本书理论与实践并重，理论上讲清楚了一些本质的东西，并加入了作者对系统设计原理方面的深刻理解，并通过实际案例，引导读者掌握针对性的系统优化的技能。*
 45 | 
 46 | *本书第一作者是我的学生，12年入学时进入了浙大学习计算机专业的尖子班“求是科学班”，我担任了他们这个班的班主任。他不仅品学兼优，而且作为班上的团支书，帮我这个不太称职的班主任做了很多班级工作。在我心目中，他依然是入学时的青涩模样，转眼间却已成为开源软件界的技术翘楚，作为老师，最欣慰的莫过于此了吧。是为序。”*
 47 | 
 48 | —— 陈刚 教育部“长江学者”特聘教授 浙江大学计算机学院院长   
 49 | 
 50 | ## [样例代码（已开源）](code/)
 51 | 
 52 | **说明：代码文件格式为 "章节_代码名称"，如3.6节线性回归最佳实践代码文件名为 "3.6\_best\_practice.py"。**
 53 | 
 54 | ## [本书目录](contents.md)
 55 | 
 56 | ### [前言（开源）](preface.md)
 57 | 
 58 | ### 第一部分 基础篇
 59 | 
 60 | #### [第1章 TensorFlow系统概述（开源）](text/1_overview/1.0_overview.md)
 61 | 
 62 | #### 第2章 TensorFlow环境准备
 63 | 
 64 | #### 第3章 TensorFlow基础概念
 65 | 
 66 | ### 第二部分 关键模块篇
 67 | 
 68 | #### 第4章 TensorFlow数据处理方法
 69 | 
 70 | #### 第5章 TensorFlow编程框架
 71 | 
 72 | #### 第6章 TensorBoard可视化工具
 73 | 
 74 | #### 第7章 TensorFlow模型托管工具
 75 | 
 76 | ### 第三部分 算法模型篇
 77 | 
 78 | #### 第8章 深度学习概述
 79 | 
 80 | #### 第9章 卷积神经网络
 81 | 
 82 | #### 第10章 生成对抗网络
 83 | 
 84 | #### 第11章 循环神经网络
 85 | 
 86 | ### 第四部分 核心揭秘篇
 87 | 
 88 | #### 第12章 TensorFlow运行时核心设计与实现
 89 | 
 90 | #### 第13章 通信原理与实现
 91 | 
 92 | #### 第14章 数据流图计算原理与实现
 93 | 
 94 | ### 第五部分 生态发展篇
 95 | 
 96 | #### 第15章 TensorFlow生态环境
 97 | 
 98 | 
 99 | ## 参考链接
100 | 
101 | - [人民邮电出版社官方介绍](http://www.ptpress.com.cn/shopping/buy?bookId=d87d343a-66f0-4430-b48d-4d03273f8258)
102 | - [京东商城链接](http://item.jd.com/12349620.html)
103 | 
104 | 


--------------------------------------------------------------------------------
/code/11_rnn_models/11.2_reader.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | 
 17 | """Utilities for parsing PTB text files."""
 18 | from __future__ import absolute_import
 19 | from __future__ import division
 20 | from __future__ import print_function
 21 | 
 22 | import collections
 23 | import os
 24 | import sys
 25 | 
 26 | import tensorflow as tf
 27 | 
 28 | Py3 = sys.version_info[0] == 3
 29 | 
 30 | def _read_words(filename):
 31 |   with tf.gfile.GFile(filename, "r") as f:
 32 |     if Py3:
 33 |       return f.read().replace("\n", "<eos>").split()
 34 |     else:
 35 |       return f.read().decode("utf-8").replace("\n", "<eos>").split()
 36 | 
 37 | 
 38 | def _build_vocab(filename):
 39 |   data = _read_words(filename)
 40 | 
 41 |   counter = collections.Counter(data)
 42 |   count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
 43 | 
 44 |   words, _ = list(zip(*count_pairs))
 45 |   word_to_id = dict(zip(words, range(len(words))))
 46 | 
 47 |   return word_to_id
 48 | 
 49 | 
 50 | def _file_to_word_ids(filename, word_to_id):
 51 |   data = _read_words(filename)
 52 |   return [word_to_id[word] for word in data if word in word_to_id]
 53 | 
 54 | 
 55 | def ptb_raw_data(data_path=None):
 56 |   """Load PTB raw data from data directory "data_path".
 57 |   Reads PTB text files, converts strings to integer ids,
 58 |   and performs mini-batching of the inputs.
 59 |   The PTB dataset comes from Tomas Mikolov's webpage:
 60 |   http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 61 |   Args:
 62 |     data_path: string path to the directory where simple-examples.tgz has
 63 |       been extracted.
 64 |   Returns:
 65 |     tuple (train_data, valid_data, test_data, vocabulary)
 66 |     where each of the data objects can be passed to PTBIterator.
 67 |   """
 68 | 
 69 |   train_path = os.path.join(data_path, "ptb.train.txt") # 训练数据集
 70 |   valid_path = os.path.join(data_path, "ptb.valid.txt") # 验证数据集
 71 |   test_path = os.path.join(data_path, "ptb.test.txt") # 测试数据集
 72 | 
 73 |   word_to_id = _build_vocab(train_path)
 74 |   train_data = _file_to_word_ids(train_path, word_to_id)
 75 |   valid_data = _file_to_word_ids(valid_path, word_to_id)
 76 |   test_data = _file_to_word_ids(test_path, word_to_id)
 77 |   vocabulary = len(word_to_id)
 78 |   return train_data, valid_data, test_data, vocabulary
 79 | 
 80 | 
 81 | def ptb_producer(raw_data, batch_size, num_steps, name=None):
 82 |   """Iterate on the raw PTB data.
 83 |   This chunks up raw_data into batches of examples and returns Tensors that
 84 |   are drawn from these batches.
 85 |   Args:
 86 |     raw_data: one of the raw data outputs from ptb_raw_data.
 87 |     batch_size: int, the batch size.
 88 |     num_steps: int, the number of unrolls.
 89 |     name: the name of this operation (optional).
 90 |   Returns:
 91 |     A pair of Tensors, each shaped [batch_size, num_steps]. The second element
 92 |     of the tuple is the same data time-shifted to the right by one.
 93 |   Raises:
 94 |     tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
 95 |   """
 96 |   with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
 97 |     # Convert the word ids of one data set produced by ptb_raw_data (train_data,
 98 |     # valid_data or test_data) into a tf.int32 tensor, rebinding raw_data to it.
 99 |     raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)
100 | 
101 |     data_len = tf.size(raw_data)
102 |     batch_len = data_len // batch_size
103 |     # Reshape the leading batch_size * batch_len ids into the 2-D tensor `data` of
104 |     # shape [batch_size, batch_len]; the trailing remainder of raw_data is dropped.
105 |     data = tf.reshape(raw_data[0 : batch_size * batch_len],
106 |                       [batch_size, batch_len])
107 |     # epoch_size is the number of [batch_size, num_steps] slices available; one
108 |     # column is held back (batch_len - 1) because y is x shifted by one position.
109 |     epoch_size = (batch_len - 1) // num_steps
110 |     assertion = tf.assert_positive(
111 |         epoch_size,
112 |         message="epoch_size == 0, decrease batch_size or num_steps")
113 |     with tf.control_dependencies([assertion]):
114 |       epoch_size = tf.identity(epoch_size, name="epoch_size")
115 |     # Dequeue the slice index i, produced in order (no shuffling) from 0 to epoch_size - 1
116 |     i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
117 |     # x and y both get static shape [batch_size, num_steps]; y is x shifted one column right
118 |     x = tf.strided_slice(data, [0, i * num_steps],
119 |                          [batch_size, (i + 1) * num_steps])
120 |     x.set_shape([batch_size, num_steps])
121 |     y = tf.strided_slice(data, [0, i * num_steps + 1],
122 |                          [batch_size, (i + 1) * num_steps + 1])
123 |     y.set_shape([batch_size, num_steps])
124 |     return x, y


--------------------------------------------------------------------------------
/text/1_overview/1.2_objectives.md:
--------------------------------------------------------------------------------
 1 | ## 1.2 TensorFlow的设计目标
 2 | 
 3 | TensorFlow作为Google公司上一代深度学习平台DistBelief的继任者，其首要设计目标是满足公司内部的图像、语音、语义等感知和预测类应用的需求。这些应用的数据规模和模型复杂度日益增长，对提供计算能力的软硬件平台的功能和性能不断提出更高的要求。与此同时，新一波人工智能浪潮的兴起也促使在开源软件商业生态建设方面经验丰厚的Google公司敏锐嗅探信息化、智能化产业发展的新动向，并广泛吸收工业界、学术界和开源社区的前沿理念与最佳实践，将这些思想纳入TensorFlow的设计。种种迹象表明，TensorFlow的设计目标并非局限于一套深度学习库，Google寄希望于其成为一套面向多种应用场景和编程范式、支持异构计算平台、具备优异性能与可伸缩性的通用人工智能引擎。本节从中选取几个侧面，对TensorFlow的设计目标进行介绍和分析。
 4 | 
 5 | ### 1.2.1 灵活通用的深度学习库
 6 | 
 7 | 近几年来，随着海量数据的涌现、硬件计算能力的提升以及神经网络模型和机器学习算法的改进，深度学习技术得到了快速的发展，已经成为学术界和工业界共同关注的热点。一方面，深度学习模型和算法的理论研究还未完全成熟，其发展空间巨大，吸引了大量科研人员参与其中。这几年深度学习席卷了各大顶级学术会议，模型设计、方法优化和应用创新的迭代更新速度极快。另一方面，深度学习在某些领域已经可以落地，比如人脸识别模型成功应用于安防系统，语音识别模型成功用于智能终端。除此之外，工业界正在积极探索深度学习更多潜在的商业应用场景。在人工智能的热潮中，很多开源的深度学习库应运而生，这对加速相关研究和工程化效率起到了非常重要的作用。为了应对上述研究与应用领域的诸多使用场景，深度学习库必须注重设计的灵活性与功能的通用性，才能在风起云涌的人工智能生态系统中得以立足。TensorFlow作为当前主流的深度学习库之一，其设计具有很高的灵活性和通用性，主要体现在以下几个方面。
 8 | 
 9 | 在算子定义方面，相比于其他深度学习库，TensorFlow提供的算子粒度更细、数量更多，能够支撑上层灵活多变的模型和算法。用户可以使用这些算子自由、灵活地开发各种深度学习模型。此外，很多传统的机器学习模型也可以基于TensorFlow实现，如支持向量机、决策树和随机森林等。TensorFlow亦支持深度学习和传统机器学习混合的模型，从而使得数据流水线式的应用创新成为可能。TensorFlow对新算子的支持也足够灵活，允许用户通过组合已有的细粒度算子来构造新的算子，以快速实现算法原型、验证一些新的想法。用户也可以使用C++语言和CUDA等底层函数库实现新的算子并在运行时动态加载使用，以便满足专用算法需求并保证计算性能。
10 | 
11 | 在编程范式方面，TensorFlow支持声明式编程，将模型的定义和执行解耦。模型以数据流图结构表示，经过编译和优化之后才会真正执行。以数据流图抽象为核心的设计在保证模型执行效率的同时，使得用户编程更加灵活。例如，在模型定义阶段，用户可以通过添加控制依赖边来指定算子的执行顺序，可以通过添加自定义变量自如地管理数据流图的输入输出，还可以通过队列控制多设备之间的数据传输和子图执行时序。在数据流图的运行态，用户可以指定数据流图中待执行的子图，从而避免不必要的算子计算开销。除了模型之外，数据读取、数据预处理等其他操作都可以被添加到数据流图中，用户可以通过编辑数据流图实现对具体应用的端到端灵活控制。
12 | 
13 | 在运行时框架方面，TensorFlow在具备隐式并行计算能力的同时，也提供了细粒度的显式控制接口，允许灵活地控制模型在多节点异构设备上的分布式执行方式。用户在编写深度学习模型时，可以自由地将模型中的每个算子绑定在任意的计算设备上。TensorFlow运行时框架负责将模型对应的数据流图按照设备进行切分，并自动插入必要的通信操作，对用户屏蔽了底层复杂的数据传输与时序同步机制。用户可以结合具体模型和应用的特点，通过手工指定或者以强化学习等方式找出模型在多节点异构硬件上的最优布局与执行方式（如数据并行、模型并行等），这使得模型的训练和推理都有更多的优化空间。
14 | 
15 | 在多语言支持方面，TensorFlow提供Python、C、C++、Java、Go等主流语言的编程接口。虽然Python是当前深度学习和人工智能领域使用最为广泛的编程语言，但是其他语言也有各自的语法优势、适用场景以及拥趸用户，它们能够满足科研、商用等不同应用领域及服务器、终端等不同目标设备的开发需求。另外，社区开源贡献者在TensorFlow C API基础上扩展开发了Node.js、Julia、R等其他语言的编程接口，这在体现TensorFlow内核设计灵活性的同时，也进一步扩大了其作为通用深度学习库的场景覆盖范围。
16 | 
17 | 总而言之，TensorFlow通过丰富的算子、灵活的编程范式、自由的运行时框架以及多语言API支持，对用户展现了高度的灵活性和通用性。TensorFlow在其设计之初就被定义为灵活通用的深度学习平台。Google公司将它开源的目的正是要以其作为基石，构筑深度学习乃至整个人工智能的生态圈。
18 | 
19 | ### 1.2.2 端云结合的人工智能引擎
20 | 
21 | “端云结合”是当今信息化、智能化技术发展的普遍趋势。一方面，随着信息技术在生产、生活中各个应用领域愈发深入的集成，对海量数据的高效处理成为IT服务商与决策部门的迫切需求。在传统数据中心基础上发展起来的云计算和大数据技术以集约化的资源管理、动态弹性的资源供给为持续膨胀的应用提供了高水平、可伸缩的计算能力，同时降低了服务提供者的准入门槛。这就要求传统服务器端软件必须适应云化部署场景，以水平扩展（scale-out）、无状态、微服务等方式构建高内聚、低耦合的系统架构。另一方面，随着以智能手机为代表的移动终端技术的高速普及，以及物联网、机器人等智能化技术在传统行业的不断渗透，用户对于数据私密性、安全性的重视程度逐渐增强，应用场景对服务实时性与可用性的要求也更加严格。在这一背景下，计算能力的边缘化成为与云化并驾齐驱的演进方向。这就要求提供服务的软件能够适应体系结构多样、计算资源有限、功耗受到制约的终端硬件环境，并具备一定的自治与协同工作能力。
22 | 
23 | 在人工智能领域野心勃勃的Google公司自然准确地把握并积极地引领着这一趋势。在云侧，Google既是一家公有云提供商，需要通过具有核心竞争力的PaaS层产品吸引企业级用户和二次开发者；又是一家业务高度依赖于智能算法与海量数据的创新型公司，需要开发高效、灵活的基础平台软件满足自身业务快速发展的需要。在端侧，Google公司不但需要为日新月异的应用开发提供强有力的智能化支撑能力，从而使其Android生态系统得以抗衡强大的竞争对手；而且需要借助实时且可靠的智能计算引擎进军物联网、可穿戴设备、增强现实、无人驾驶等新兴领域，以寻求“后移动互联网时代”的新增长点。TensorFlow等平台层软件的设计因此也兼顾了云侧与端侧的需求。
24 | 
25 | TensorFlow对云计算场景的支持是其竞争力的基础，主要体现在以下方面：（1）提供多种标准化的安装包、构建脚本及容器化封装，支持在不同Linux发行版以及Windows Server等其他服务器操作系统上部署。既允许以二进制包方式快速安装，也允许针对特定环境定制化编译高效的目标代码，从而极大地增强了软件的适用范围和适配能力。（2）支持对接多种常见的公有云和私有云服务，如Google Cloud Storage、Amazon S3、HDFS，并为对接其他类似服务预留可扩展设计，从而能够与既有的互联网、大数据生态系统无缝交互，实现资源复用与服务组合。（3）兼容若干种常见的高性能计算与通信硬件，能够有效利用云环境的既有投资并提升应用软件对高端硬件资源的利用率。例如，支持NVIDIA和OpenCL GPU，能够充分挖掘众核设备的并行计算能力；支持RDMA网络协议，能够充分发挥InfiniBand等高速网络设备的带宽潜力。（4）灵活的运行时框架设计，既提供标准且易用的PS-worker分布式模式，也允许用户自由开发针对特定环境需求的分布式框架。即使脱离Google公司的Borg等基础设施，以TensorFlowOnSpark、MaTEx-TensorFlow为代表的第三方工具也能够利用既有的分布式平台提升TensorFlow在数据中心和超算集群中的可伸缩性。
26 | 
27 | TensorFlow在端侧场景方面也毫不逊色，其主要设计体现在：（1）推理（预测）态代码能够运行于多种主流的终端平台，包括Android、iOS，以及部署Linux操作系统的多类ARM与MIPS设备（如Raspberry Pi），从而为形态多样的终端设备集成AI认知与决策能力提供支撑。（2）通过XLA AOT（ahead-of-time）编译技术及其他软硬件解耦设计，显著地简化底层异构计算设备的对接方式，实现对神经网络芯片等新型专用端侧硬件的快速支持能力。（3）提供量化参数和低精度代数等算法层机制，适配算力、存储和功耗受限的终端，从而实现低端边缘设备的智能化。（4）提供模型与框架一体化的精简版运行时平台，具备完全的离线工作能力，有助于实现端侧计算的私密性与实时性。
28 | 
29 | 可以看出，TensorFlow作为一套人工智能引擎，不但致力于增强应用系统的“大脑”，同时也在帮助其完善“末梢神经”。可以预见，未来TensorFlow等人工智能引擎会像Linux操作系统内核一样，成为在端云两侧广泛支撑各类应用、助力实现智能化社会的幕后英雄。
30 | 
31 | ### 1.2.3 高性能的基础平台软件
32 | 
33 | 虽然如今的互联网、大数据计算平台软件层出不穷，核心技术变幻莫测，吸睛特性轮番登场，但是性能始终都能够超脱于名目繁多的噱头，成为几乎所有用户一致认可的硬指标。在半导体器件物理极限将至的情况下，摩尔定律的有效性已经存疑，软件轻松分享硬件发展红利的时代走向末路。硬件设计者正在广泛采纳新型器件、三维电路、应用定制、众核并行等多元化思路满足应用不断增长的算力需求，这为软件开发者提供机会的同时也带来了不小的挑战。如何将软件架构和算法有效适配到硬件体系结构、充分利用硬件资源发挥其设计性能，成为所有软件开发者，特别是基础平台层软件开发者面临的重要问题。
34 | 
35 | 随着深度学习技术发展而兴起的一系列开源计算库长期处于激烈竞争的态势。在竞争过程中，一快遮百丑。这里的“快”字有两层含义：深度学习库的开发者不仅需要快速响应上层需求和下层技术的变化，及时发布新版本与新特性，而且需要通过苦修内功提升深度学习库本身的性能，加快算法模型的训练与推理速度。在第一个“快”字上，TensorFlow借助Google公司强大的号召力和坚决的执行力，长期保持领先地位。在第二个“快”字上，TensorFlow曾因权衡灵活性等原因一度落后于同类软件，但如今已经迎头赶上，在主流应用场景中取得了优异的测试成绩。这归功于众多核心研究者和开源贡献者在性能方面的深耕。
36 | 
37 | TensorFlow的高性能设计首先体现在它对高端和专用硬件的深入支持。同其他主流的深度学习库一样，TensorFlow将NVIDIA GPU作为训练态的硬件加速器，同时兼顾OpenCL GPU设备。不同于简单使用CUDA Runtime API的其他平台，Google的工程师基于CUDA Driver API实现了控制粒度更细、并行性能更优的StreamExecutor异构计算库，并对cuBLAS、cuDNN等库的函数变种进行了精确的适配。在推理态，尽管Google没有开源或销售TPU，然而TensorFlow开放性的设计已经促使多家芯片厂家实现了对接，这为定制化设备上的计算性能提升提供了保障。针对高性能计算环境中常用的InfiniBand、RoCE等高速网络设备，以及NVLink等片间高速互联技术，TensorFlow引入了RDMA、NCCL等协议，较好地解决了通信延迟问题，推进了分布式计算作业的加速比提升。
38 | 
39 | 其次，系统层的优化技术是TensorFlow性能提升的重要杀手锏。相比来自于学术界的算法研究团队，Google科研与工程团队深厚的系统研发背景是TensorFlow构建性能竞争力的坚实后盾。XLA这种融合了编译器设计理论的优化框架就是一例。它引入的JIT（just-in-time）编译机制能够在数据流图运行过程中实时创建二进制代码，将其中大量细粒度的操作融合为少量粗粒度的专用核函数，从而减少图中操作执行时的内存分配和上下文切换开销，极大地提升计算速度。TensorFlow诸多模块设计中也存在着细节性能优化。例如，通信模块中具有若干种旁路（bypass）设计，可以避免不必要的网络访问和内存复制开销；数据流图构建时会执行常量折叠、公共子表达式消除、内联函数展开等多种语法树优化，能够消除无意义的计算开销。这些设计体现出开发者良好的软件工程素养与精益求精的追求。
40 | 
41 | 最后，算法层的优化设计也是TensorFlow实现优异性能不可或缺的组成部分。为了实现高性能的目标，TensorFlow的设计采纳了自顶向下、全栈优化的思路，而算子恰恰是贯穿上下层的核心要素。在深度学习的算法模型中，每种算子的逻辑都可以采用多种算法实现。为此，TensorFlow内置了多种优化后的基础算子和模型组件。以卷积算子为例，cuDNN提供了winograd等8种算法。针对不同的输入数据大小、卷积计算超参以及内存等资源限制，TensorFlow会自动为每个卷积操作选择最快的实现算法。另外，针对递归神经网络等模型，TensorFlow也支持Fold解决方案，使得动态批处理成为可能，极大加速了这些模型的计算速度。
42 | 
43 | 综上所述，性能是TensorFlow研发者重点关注的设计目标。虽然TensorFlow开源版本的性能优化起步稍晚，但是在Google团队和开源社区的共同努力下进步迅速。在这个“天下武功唯快不破”的时代，TensorFlow的高性能优势必将为其插上腾飞的翅膀，使之引领人工智能研究与应用的高速发展。
44 | 
45 | #
46 | 
47 | **Prev：**[1.1 TensorFlow简介](1.1_introduction.md)
48 | 
49 | **Next：**[1.3 TensorFlow的基本架构](1.3_architecture.md)


--------------------------------------------------------------------------------
/code/7_tf_serving/7.4_mnist_saved_model.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | #!/usr/bin/env python2.7
 17 | r"""Train and export a simple Softmax Regression TensorFlow model.
 18 | The model is from the TensorFlow "MNIST For ML Beginner" tutorial. This program
 19 | simply follows all its training instructions, and uses TensorFlow SavedModel to
 20 | export the trained model with proper signatures that can be loaded by standard
 21 | tensorflow_model_server.
 22 | Usage: mnist_saved_model.py [--training_iteration=x] [--model_version=y] \
 23 |     export_dir
 24 | """
 25 | 
 26 | import os
 27 | import sys
 28 | 
 29 | # This is a placeholder for a Google-internal import.
 30 | 
 31 | import tensorflow as tf
 32 | 
 33 | import mnist_input_data
 34 | 
 35 | tf.app.flags.DEFINE_integer('training_iteration', 1000,
 36 |                             'number of training iterations.')
 37 | tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the model.')
 38 | tf.app.flags.DEFINE_string('work_dir', '/tmp/mnist/', 'Working directory.')  # passed to mnist_input_data.read_data_sets in main()
 39 | FLAGS = tf.app.flags.FLAGS  # flag values read by main()
 40 | 
 41 | 
 42 | def main(_):  # Entry point: train a softmax regression model on MNIST, then export it as a SavedModel.
 43 |   if len(sys.argv) < 2 or sys.argv[-1].startswith('-'):  # export_dir must be the last command-line argument
 44 |     print('Usage: mnist_export.py [--training_iteration=x] '  # NOTE(review): the module docstring calls the script mnist_saved_model.py
 45 |           '[--model_version=y] export_dir')
 46 |     sys.exit(-1)
 47 |   if FLAGS.training_iteration <= 0:
 48 |     print 'Please specify a positive value for training iteration.'
 49 |     sys.exit(-1)
 50 |   if FLAGS.model_version <= 0:
 51 |     print 'Please specify a positive value for version number.'
 52 |     sys.exit(-1)
 53 | 
 54 |   # Train model. NOTE(review): the print statements below are Python 2 syntax.
 55 |   print 'Training model...'
 56 |   mnist = mnist_input_data.read_data_sets(FLAGS.work_dir, one_hot=True)
 57 |   print 'Create session'
 58 |   sess = tf.InteractiveSession()  # installs itself as the default session, which train_step.run() below relies on
 59 |   serialized_tf_example = tf.placeholder(tf.string, name='tf_example')  # serving input: serialized tf.Example protos
 60 |   feature_configs = {'x': tf.FixedLenFeature(shape=[784], dtype=tf.float32),}  # each example carries 784 floats under key 'x'
 61 |   tf_example = tf.parse_example(serialized_tf_example, feature_configs)
 62 |   x = tf.identity(tf_example['x'], name='x')  # use tf.identity() to assign name
 63 |   y_ = tf.placeholder('float', shape=[None, 10])  # labels; read_data_sets above is called with one_hot=True
 64 |   w = tf.Variable(tf.zeros([784, 10]))
 65 |   b = tf.Variable(tf.zeros([10]))
 66 |   sess.run(tf.global_variables_initializer())  # initializes w and b, the variables created so far
 67 |   y = tf.nn.softmax(tf.matmul(x, w) + b, name='y')
 68 |   cross_entropy = -tf.reduce_sum(y_ * tf.log(y))  # summed over every element of the batch, not averaged
 69 |   train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
 70 |   values, indices = tf.nn.top_k(y, 10)  # k equals the number of classes, so this ranks all classes by score
 71 |   table = tf.contrib.lookup.index_to_string_table_from_tensor(
 72 |       tf.constant([str(i) for i in xrange(10)]))  # maps class index i to the string str(i)
 73 |   prediction_classes = table.lookup(tf.to_int64(indices))
 74 |   for _ in range(FLAGS.training_iteration):
 75 |     batch = mnist.train.next_batch(50)
 76 |     train_step.run(feed_dict={x: batch[0], y_: batch[1]})  # feeds x directly, so the tf.Example parsing op is bypassed
 77 |   correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
 78 |   accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
 79 |   print 'training accuracy %g' % sess.run(  # NOTE(review): labelled "training" but evaluated on mnist.test
 80 |       accuracy, feed_dict={x: mnist.test.images,
 81 |                            y_: mnist.test.labels})
 82 |   print 'Done training!'
 83 | 
 84 |   # Export model
 85 |   # WARNING(break-tutorial-inline-code): The following code snippet is
 86 |   # in-lined in tutorials, please update tutorial documents accordingly
 87 |   # whenever code changes.
 88 |   export_path_base = sys.argv[-1]  # the positional export_dir argument
 89 |   export_path = os.path.join(  # <export_dir>/<model_version>
 90 |       tf.compat.as_bytes(export_path_base),
 91 |       tf.compat.as_bytes(str(FLAGS.model_version)))
 92 |   print 'Exporting trained model to', export_path
 93 |   builder = tf.saved_model.builder.SavedModelBuilder(export_path)
 94 | 
 95 |   # Build the signature_def_map: one classification and one prediction signature.
 96 |   classification_inputs = tf.saved_model.utils.build_tensor_info(
 97 |       serialized_tf_example)
 98 |   classification_outputs_classes = tf.saved_model.utils.build_tensor_info(
 99 |       prediction_classes)
100 |   classification_outputs_scores = tf.saved_model.utils.build_tensor_info(values)
101 | 
102 |   classification_signature = (  # serialized tf.Example in; ranked class strings and their scores out
103 |       tf.saved_model.signature_def_utils.build_signature_def(
104 |           inputs={
105 |               tf.saved_model.signature_constants.CLASSIFY_INPUTS:
106 |                   classification_inputs
107 |           },
108 |           outputs={
109 |               tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
110 |                   classification_outputs_classes,
111 |               tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
112 |                   classification_outputs_scores
113 |           },
114 |           method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))
115 | 
116 |   tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
117 |   tensor_info_y = tf.saved_model.utils.build_tensor_info(y)
118 | 
119 |   prediction_signature = (  # parsed image tensor x in; softmax output y out
120 |       tf.saved_model.signature_def_utils.build_signature_def(
121 |           inputs={'images': tensor_info_x},
122 |           outputs={'scores': tensor_info_y},
123 |           method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
124 | 
125 |   legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')  # presumably run at load time to initialize the lookup table; confirm
126 |   builder.add_meta_graph_and_variables(
127 |       sess, [tf.saved_model.tag_constants.SERVING],
128 |       signature_def_map={
129 |           'predict_images':
130 |               prediction_signature,
131 |           tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
132 |               classification_signature,
133 |       },
134 |       legacy_init_op=legacy_init_op)
135 | 
136 |   builder.save()
137 | 
138 |   print 'Done exporting!'
139 | 
140 | 
141 | if __name__ == '__main__':
142 |   tf.app.run()  # tf.app.run parses the flags defined above and then calls main


--------------------------------------------------------------------------------
/code/9_cnn_models/9.2_nets_factory.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Contains a factory for building various models."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | import functools
 21 | 
 22 | import tensorflow as tf
 23 | 
 24 | from nets import alexnet
 25 | from nets import cifarnet
 26 | from nets import inception
 27 | from nets import lenet
 28 | from nets import mobilenet_v1
 29 | from nets import overfeat
 30 | from nets import resnet_v1
 31 | from nets import resnet_v2
 32 | from nets import vgg
 33 | from nets.mobilenet import mobilenet_v2
 34 | from nets.nasnet import nasnet
 35 | from nets.nasnet import pnasnet
 36 | 
 37 | slim = tf.contrib.slim  # short alias for the TF-Slim API
 38 | 
 39 | networks_map = {'alexnet_v2': alexnet.alexnet_v2,  # network name -> model-building function
 40 |                 'cifarnet': cifarnet.cifarnet,
 41 |                 'overfeat': overfeat.overfeat,
 42 |                 'vgg_a': vgg.vgg_a,
 43 |                 'vgg_16': vgg.vgg_16,
 44 |                 'vgg_19': vgg.vgg_19,
 45 |                 'inception_v1': inception.inception_v1,
 46 |                 'inception_v2': inception.inception_v2,
 47 |                 'inception_v3': inception.inception_v3,
 48 |                 'inception_v4': inception.inception_v4,
 49 |                 'inception_resnet_v2': inception.inception_resnet_v2,
 50 |                 'lenet': lenet.lenet,
 51 |                 'resnet_v1_50': resnet_v1.resnet_v1_50,
 52 |                 'resnet_v1_101': resnet_v1.resnet_v1_101,
 53 |                 'resnet_v1_152': resnet_v1.resnet_v1_152,
 54 |                 'resnet_v1_200': resnet_v1.resnet_v1_200,
 55 |                 'resnet_v2_50': resnet_v2.resnet_v2_50,
 56 |                 'resnet_v2_101': resnet_v2.resnet_v2_101,
 57 |                 'resnet_v2_152': resnet_v2.resnet_v2_152,
 58 |                 'resnet_v2_200': resnet_v2.resnet_v2_200,
 59 |                 'mobilenet_v1': mobilenet_v1.mobilenet_v1,
 60 |                 'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_075,
 61 |                 'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_050,
 62 |                 'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_025,
 63 |                 'mobilenet_v2': mobilenet_v2.mobilenet,
 64 |                 'nasnet_cifar': nasnet.build_nasnet_cifar,
 65 |                 'nasnet_mobile': nasnet.build_nasnet_mobile,
 66 |                 'nasnet_large': nasnet.build_nasnet_large,
 67 |                 'pnasnet_large': pnasnet.build_pnasnet_large,
 68 |                }
 69 | 
 70 | arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope,  # network name -> function building that model's arg_scope
 71 |                   'cifarnet': cifarnet.cifarnet_arg_scope,
 72 |                   'overfeat': overfeat.overfeat_arg_scope,
 73 |                   'vgg_a': vgg.vgg_arg_scope,
 74 |                   'vgg_16': vgg.vgg_arg_scope,
 75 |                   'vgg_19': vgg.vgg_arg_scope,
 76 |                   'inception_v1': inception.inception_v3_arg_scope,  # NOTE(review): v1 and v2 reuse the v3 arg_scope; presumably intentional, confirm
 77 |                   'inception_v2': inception.inception_v3_arg_scope,
 78 |                   'inception_v3': inception.inception_v3_arg_scope,
 79 |                   'inception_v4': inception.inception_v4_arg_scope,
 80 |                   'inception_resnet_v2':
 81 |                   inception.inception_resnet_v2_arg_scope,
 82 |                   'lenet': lenet.lenet_arg_scope,
 83 |                   'resnet_v1_50': resnet_v1.resnet_arg_scope,
 84 |                   'resnet_v1_101': resnet_v1.resnet_arg_scope,
 85 |                   'resnet_v1_152': resnet_v1.resnet_arg_scope,
 86 |                   'resnet_v1_200': resnet_v1.resnet_arg_scope,
 87 |                   'resnet_v2_50': resnet_v2.resnet_arg_scope,
 88 |                   'resnet_v2_101': resnet_v2.resnet_arg_scope,
 89 |                   'resnet_v2_152': resnet_v2.resnet_arg_scope,
 90 |                   'resnet_v2_200': resnet_v2.resnet_arg_scope,
 91 |                   'mobilenet_v1': mobilenet_v1.mobilenet_v1_arg_scope,
 92 |                   'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_arg_scope,
 93 |                   'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_arg_scope,
 94 |                   'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_arg_scope,
 95 |                   'mobilenet_v2': mobilenet_v2.training_scope,
 96 |                   'nasnet_cifar': nasnet.nasnet_cifar_arg_scope,
 97 |                   'nasnet_mobile': nasnet.nasnet_mobile_arg_scope,
 98 |                   'nasnet_large': nasnet.nasnet_large_arg_scope,
 99 |                   'pnasnet_large': pnasnet.pnasnet_large_arg_scope,
100 |                  }
101 | 
102 | 
103 | def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False):
104 |   """Returns a network_fn such as `logits, end_points = network_fn(images)`.
105 |   Args:
106 |     name: The name of the network.
107 |     num_classes: The number of classes to use for classification. If 0 or None,
108 |       the logits layer is omitted and its input features are returned instead.
109 |     weight_decay: The l2 coefficient for the model weights.
110 |     is_training: `True` if the model is being used for training and `False`
111 |       otherwise.
112 |   Returns:
113 |     network_fn: A function that applies the model to a batch of images. It has
114 |       the following signature:
115 |           net, end_points = network_fn(images)
116 |       The `images` input is a tensor of shape [batch_size, height, width, 3]
117 |       with height = width = network_fn.default_image_size. (The permissibility
118 |       and treatment of other sizes depends on the network_fn.)
119 |       The returned `end_points` are a dictionary of intermediate activations.
120 |       The returned `net` is the topmost layer, depending on `num_classes`:
121 |       If `num_classes` was a non-zero integer, `net` is a logits tensor
122 |       of shape [batch_size, num_classes].
123 |       If `num_classes` was 0 or `None`, `net` is a tensor with the input
124 |       to the logits layer of shape [batch_size, 1, 1, num_features] or
125 |       [batch_size, num_features]. Dropout has not been applied to this
126 |       (even if the network's original classification does); it remains for
127 |       the caller to do this or not.
128 |   Raises:
129 |     ValueError: If network `name` is not recognized.
130 |   """
131 |   if name not in networks_map:
132 |     raise ValueError('Name of network unknown %s' % name)
133 |   # Look up the model-building function (e.g. alexnet.alexnet_v2) registered
134 |   # in networks_map under the given model name (e.g. 'alexnet_v2').
135 |   func = networks_map[name]
136 |   # functools.wraps copies func's metadata onto the wrapper below. The wrapper runs
137 |   # func inside the model's arg_scope, so shared layer arguments need not be repeated.
138 |   @functools.wraps(func)
139 |   def network_fn(images, **kwargs):
140 |     arg_scope = arg_scopes_map[name](weight_decay=weight_decay)
141 |     with slim.arg_scope(arg_scope):
142 |       return func(images, num_classes, is_training=is_training, **kwargs)
143 |   if hasattr(func, 'default_image_size'):
144 |     # Model functions may publish a default input resolution; forward it when present.
145 |     network_fn.default_image_size = func.default_image_size
146 | 
147 |   return network_fn
148 | 


--------------------------------------------------------------------------------
/code/9_cnn_models/9.2_alexnet.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Contains a model definition for AlexNet.
 16 | This work was first described in:
 17 |   ImageNet Classification with Deep Convolutional Neural Networks
 18 |   Alex Krizhevsky, Ilya Sutskever and Geoffrey E. Hinton
 19 | and later refined in:
 20 |   One weird trick for parallelizing convolutional neural networks
 21 |   Alex Krizhevsky, 2014
 22 | Here we provide the implementation proposed in "One weird trick" and not
 23 | "ImageNet Classification", as per the paper, the LRN layers have been removed.
 24 | Usage:
 25 |   with slim.arg_scope(alexnet.alexnet_v2_arg_scope()):
 26 |     outputs, end_points = alexnet.alexnet_v2(inputs)
 27 | @@alexnet_v2
 28 | """
 29 | 
 30 | from __future__ import absolute_import
 31 | from __future__ import division
 32 | from __future__ import print_function
 33 | 
 34 | import tensorflow as tf
 35 | 
 36 | slim = tf.contrib.slim  # short alias for the TF-Slim API
 37 | trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)  # initializer factory: mean 0.0, caller-chosen stddev
 38 | 
 39 | 
 40 | def alexnet_v2_arg_scope(weight_decay=0.0005):
 41 |   # Builds the default arg_scope for AlexNet v2. Inside it, conv2d and fully_connected
 42 |   # default to tf.nn.relu activation, biases initialized to the constant 0.1, and an L2
 43 |   # weights regularizer (to counter overfitting) scaled by weight_decay; callers may override these.
 44 |   with slim.arg_scope([slim.conv2d, slim.fully_connected],
 45 |                       activation_fn=tf.nn.relu,
 46 |                       biases_initializer=tf.constant_initializer(0.1),
 47 |                       weights_regularizer=slim.l2_regularizer(weight_decay)):
 48 |     # Inside this scope conv2d pads with 'SAME' by default; callers may override it.
 49 |     with slim.arg_scope([slim.conv2d], padding='SAME'):
 50 |       # Inside this scope max_pool2d pads with 'VALID' by default; callers may override it.
 51 |       with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc:
 52 |         return arg_sc
 53 | 
 54 | 
 55 | def alexnet_v2(inputs,
 56 |                num_classes=1000,
 57 |                is_training=True,
 58 |                dropout_keep_prob=0.5,
 59 |                spatial_squeeze=True,
 60 |                scope='alexnet_v2',
 61 |                global_pool=False):
 62 |   """AlexNet version 2.
 63 |   Described in: http://arxiv.org/pdf/1404.5997v2.pdf
 64 |   Parameters from:
 65 |   github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
 66 |   layers-imagenet-1gpu.cfg
 67 |   Note: All the fully_connected layers have been transformed to conv2d layers.
 68 |         To use in classification mode, resize input to 224x224 or set
 69 |         global_pool=True. To use in fully convolutional mode, set
 70 |         spatial_squeeze to false.
 71 |         The LRN layers have been removed and change the initializers from
 72 |         random_normal_initializer to xavier_initializer.
 73 |   Args:
 74 |     inputs: a tensor of size [batch_size, height, width, channels].
 75 |     num_classes: the number of predicted classes. If 0 or None, the logits layer
 76 |     is omitted and the input features to the logits layer are returned instead.
 77 |     is_training: whether or not the model is being trained.
 78 |     dropout_keep_prob: the probability that activations are kept in the dropout
 79 |       layers during training.
 80 |     spatial_squeeze: whether or not should squeeze the spatial dimensions of the
 81 |       logits. Useful to remove unnecessary dimensions for classification.
 82 |     scope: Optional scope for the variables.
 83 |     global_pool: Optional boolean flag. If True, the input to the classification
 84 |       layer is avgpooled to size 1x1, for any input size. (This is not part
 85 |       of the original AlexNet.)
 86 |   Returns:
 87 |     net: the output of the logits layer (if num_classes is a non-zero integer),
 88 |       or the non-dropped-out input to the logits layer (if num_classes is 0
 89 |       or None).
 90 |     end_points: a dict of tensors with intermediate activations.
 91 |   """
 92 |   with tf.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
 93 |     end_points_collection = sc.original_name_scope + '_end_points'
 94 |     # slim.arg_scope(list_ops_or_scope, **kwargs), i.e.
 95 |     # tensorflow.contrib.framework.python.ops.arg_scope(list_ops_or_scope, **kwargs),
 96 |     # works as follows:
 97 |     # when list_ops_or_scope is a list of ops,
 98 |     # every key/value pair in kwargs is supplied as an argument to each op in the list.
 99 |     # Here slim.arg_scope makes conv2d, fully_connected and max_pool2d add their
100 |     # output tensors to end_points_collection.
101 |     # Collect outputs for conv2d, fully_connected and max_pool2d.
102 |     with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
103 |                         outputs_collections=[end_points_collection]):
104 |       net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID',
105 |                         scope='conv1')
106 |       net = slim.max_pool2d(net, [3, 3], 2, scope='pool1')
107 |       net = slim.conv2d(net, 192, [5, 5], scope='conv2')
108 |       net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')
109 |       net = slim.conv2d(net, 384, [3, 3], scope='conv3')
110 |       net = slim.conv2d(net, 384, [3, 3], scope='conv4')
111 |       net = slim.conv2d(net, 256, [3, 3], scope='conv5')
112 |       net = slim.max_pool2d(net, [3, 3], 2, scope='pool5')
113 |       # Implement the fully connected layers with conv2d.
114 |       with slim.arg_scope([slim.conv2d],
115 |                           weights_initializer=trunc_normal(0.005),
116 |                           biases_initializer=tf.constant_initializer(0.1)):
117 |         net = slim.conv2d(net, 4096, [5, 5], padding='VALID',
118 |                           scope='fc6')
119 |         net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
120 |                            scope='dropout6')
121 |         # fc6 already acts as a fully connected layer (1x1 feature map for 224x224 input), so fc7 uses a 1x1 kernel.
122 |         net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
123 |         # Convert end_points_collection into the end_points dict. Its keys name the layers
124 |         # of the model and its values are the corresponding layer output tensors.
125 |         end_points = slim.utils.convert_collection_to_dict(
126 |             end_points_collection)
127 |         if global_pool:
128 |           net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
129 |           end_points['global_pool'] = net
130 |         if num_classes:
131 |           net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
132 |                              scope='dropout7')
133 |           net = slim.conv2d(net, num_classes, [1, 1],
134 |                             activation_fn=None,
135 |                             normalizer_fn=None,
136 |                             biases_initializer=tf.zeros_initializer(),
137 |                             scope='fc8')
138 |           if spatial_squeeze:
139 |             # When spatial_squeeze is True, reshape the output of the last fully connected
140 |             # layer to drop dimensions that are not needed. The tensor has shape [N, H, W, C]:
141 |             # N is the batch size, H and W are the feature-map height and width, and C is the
142 |             # channel count (the number of classes). As a fully connected output, H and W are
143 |             # expected to be 1; tf.squeeze removes those two axes, leaving shape [N, C].
144 |             net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
145 |           end_points[sc.name + '/fc8'] = net
146 |       return net, end_points
147 | alexnet_v2.default_image_size = 224  # default input resolution, exposed as a function attribute


--------------------------------------------------------------------------------
/text/1_overview/1.1_introduction.md:
--------------------------------------------------------------------------------
  1 | ## 1.1 TensorFlow简介
  2 | 
  3 | 当今人工智能领域最受欢迎的深度学习和机器学习框架非Google开源的TensorFlow莫属。本节将依次介绍TensorFlow的产生背景、独特价值和版本变迁，并横向对比目前主流机器学习和深度学习框架各自的特点和优劣。
  4 | 
  5 | ### 1.1.1 TensorFlow产生背景
  6 | 
  7 | 随着近年来深度学习模型在图像、视觉和语音领域取得不断突破，相关研究热潮持续高涨，开源深度学习框架出现百花齐放之势。其中具有代表性的框架包括XGBoost、Theano、Torch、Caffe和MXNet等。它们有的计算速度快，有的可移植性好，有的内存占用少，有的易于上手。在Google推出TensorFlow之前，大家仍处于“你方唱罢我登场”的百家争鸣状态。然而，Google推出深度学习框架TensorFlow之后，江湖巨变。
  8 | 
  9 | 2015年10月，Google旗下的DeepMind公司研发的AlphaGo击败樊麾，成为第一个无需让子即可在19路棋盘上击败围棋职业选手的电脑程序。这一壮举不仅打破了人工智能无法在围棋领域战胜人类顶尖棋手的诅咒，而且吸引了全球各界对人工智能研究的高度关注。2016年1月，AlphaGo的研究成果发表在知名学术期刊《自然》上，这一事件也将人工智能热潮推向了新的高度。2017年5月，强化后的AlphaGo与当世第一棋手柯洁对弈，获得3:0全胜战绩。这再一次笃定了人们对于Google在人工智能领域遥遥领先的信念。
 10 | 
 11 | AlphaGo的后期版本使用了基于TensorFlow编写的算法模型。但事实上，AlphaGo只是让TensorFlow走进公众视野的一个契机，TensorFlow的原始动机则是Google在高速发展的信息化应用背景下，发展感知、预测等人工智能技术的需求。移动互联网、物联网、共享经济、增强现实……这些热词的背后无一不需要海量数据与智能处理能力的支撑。Google作为行业的引领者，公司内部很早便有自研的机器学习平台。TensorFlow是既有平台的多年技术积累在新的时代背景下蜕变升华的成果。
 12 | 
 13 | TensorFlow推出后短短一个月，就成为了机器学习和深度学习项目中最受欢迎的开源框架。究其原因，离不开Google在人工智能与数据处理领域的深厚积淀及其在业界的强大号召力。同时，Google已经成功领导多个开源项目，典型的有移动操作系统Android、容器编排引擎Kubernetes、编程语言Go等，它们充分体现了Google的工程水准与协作精神。因此，在内因与外因的合力之下，TensorFlow的横空出世也就不难理解了。
 14 | 
 15 | ### 1.1.2 TensorFlow独特价值
 16 | 
 17 | TensorFlow能够在众多开源框架中杀出重围，除了Google的背书以外，一定有其自身的独特价值。下面重点介绍TensorFlow相比其他开源框架的亮点和优势。
 18 | 
 19 | 运算性能强劲：在构建和部署机器学习系统时，性能是至关重要的。TensorFlow 1.0加入的线性代数编译器XLA全方位地提升了计算性能。XLA可以帮助TensorFlow在CPU、GPU、TPU、嵌入式设备等平台上更快速地运行机器学习模型的训练与推理任务。同时，TensorFlow提供了大量针对不同软硬件环境的优化配置参数。用户可以根据自身的需求和应用的特点，进一步提升计算性能。
 20 | 
 21 | 框架设计通用：TensorFlow不是一个严格的神经网络库。TensorFlow最初由Google Brain小组（隶属于Google机器智能研究机构）的研究员和工程师们开发出来，用于机器学习和深度神经网络方面的研究，但其灵活的设计也可广泛用于其他计算领域。同时，TensorFlow既提供高层封装API（如Slim、Keras、TF Layers等），能够帮助用户快速实现算法原型；又提供底层原生API，可以实现更灵活且高效的分布式并行模式。
 22 | 
 23 | 支持生产环境部署：TensorFlow支持使用同一套API实现探索环境和生产环境的部署。曾经，科研人员想要将算法原型推广到生产环境中使用是一个非常痛苦的过程，因为这涉及到大量的模型重写和脚本适配工作。现在，使用TensorFlow的算法研发人员可以快速地将想法和原型运用到生产环境的产品中，也可以在学术圈更方便地分享自己的研究成果。
 24 | 
 25 | 语言接口丰富：TensorFlow核心层由C++实现，应用层使用SWIG等技术封装，提供了多语言API的支持。目前官方支持的语言有Python、C、C++、Java、Go等。除此之外，TensorFlow的社区贡献者们也提供了非官方的应用层API，如NodeJS（<https://github.com/node-tensorflow/node-tensorflow>）、Julia（<https://github.com/malmaud/TensorFlow.jl>）、R（<https://github.com/rstudio/tensorflow>）。
 26 | 
 27 | 端云协同计算：TensorFlow同时支持在云侧（服务器端）和端侧（移动设备等终端）运行，有效结合了云侧和端侧的各自优势。在云侧方面，TensorFlow提供多种并行模式和编译优化等技术，尽可能提升算法模型的运算性能；在端侧方面，TensorFlow提供轻量级部署和8比特压缩等技术，尽可能提升计算和存储资源利用效率。
 28 | 
 29 | 除以上列举的优势外，TensorFlow丰富的算子库和教学资料也是其独有的竞争优势。同时，TensorFlow社区的活跃度遥遥领先其他竞争者，每个月都会有上万行的代码合入主分支。这就使得TensorFlow的新特性能够被快速实现，bug也能被快速修复。我们相信在业界众多人工智能开发者和Google工程师的共同努力下，TensorFlow能够计算得越来越快、发展得越来越好。
 30 | 
 31 | ### 1.1.3 TensorFlow版本变迁
 32 | 
 33 | TensorFlow自2015年11月开源以来，已经发布了30多个版本。本小节从TensorFlow的发展历程入手，考察其关键特性的发布和对应版本的变迁。图1-1展示了这一变化过程。
 34 | 
 35 | ![releases](images/releases.png)
 36 | 
 37 | 图1-1  TensorFlow关键特性发布和对应版本变迁
 38 | 
 39 | 在TensorFlow开源后第9天，Google带来了第一个正式发布版——TensorFlow 0.5.0，不过该版本仅支持在Linux系统上运行单机模型。随着TensorFlow贡献者和用户们对分布式的呼声越来越高，2016年4月发布的TensorFlow 0.8.0开始初步支持分布式计算。两个月后，TensorFlow 0.9.0增加了对多平台的支持。自此，用户可以将TensorFlow部署在Android、iOS和树莓派（Raspberry Pi）。同时，该版本还支持在macOS上使用GPU运行算法模型。
 40 | 
 41 | 2016年9月，0.10.0版本的发布解决了TensorFlow学习成本高和上手难的问题。尤其对于非计算机背景的算法研究人员，使用该版本提供的高层API——TF-Slim能够快速实现图像和视觉领域的算法模型。TF-Slim的发布有效扩大了TensorFlow用户群体，使得高校和科研院所的研究者们也能够享受TensorFlow带来的便利。
 42 | 
 43 | 随着TensorFlow开源一周年而到来的0.11.0版本新增了对HDFS和cuDNN 5的支持。HDFS作为Hadoop生态中的分布式文件系统，广泛地应用于大数据系统，这一特性标志着大数据生态和TensorFlow开始相互合作和共赢发展；cuDNN是NVIDIA公司开发的深度神经网络库，cuDNN 5能够进一步提升神经网络任务在GPU硬件上的计算速度。紧接着发布的TensorFlow 0.12.0开始为Windows平台提供支持，同时提供了实验性的Go语言应用层API。
 44 | 
 45 | 2017年2月，Google在山景城（Mountain View）召开了TensorFlow Dev Summit 2017大会。大会全面地介绍了TensorFlow的进展和取得的成就，并于隔天发布了TensorFlow 1.0.0。这也成为了TensorFlow发展的一个重大里程碑事件，标志着TensorFlow已经初步成熟并能够支持生产环境部署。事实上，当时京东、小米、Uber等国内外公司也确实在生产环境中使用了TensorFlow。作为TensorFlow的第一个正式版，TensorFlow 1.0.0带来了诸多提升性能和易用性的关键特性，比如：线性代数编译器XLA，部分解决了内存消耗大和计算速度慢的问题；命令行调测工具TensorFlow Debugger，初步解决了算法模型调测困难的问题。同时，该版本还新增了对Android的友好支持，使得用户能够更快速地将TensorFlow编写的模型部署到移动设备上运行。
 46 | 
 47 | 2017年4月，TensorFlow 1.1.0将Keras 2合并到了项目主分支的`tf.contrib.keras`目录中。从此以后，用户再也不需要独立安装Keras软件包，TensorFlow将自带Keras API。Keras是一套类似于TF-Slim的高层API，它良好的封装性和对模型的高度抽象使之收获了一大批算法开发者。但是，Keras并不等于TensorFlow，诸如分布式运行和更灵活的计算模式还得使用TensorFlow原生API实现。同时，该版本还支持用户在Windows上使用Java语言的应用层API。
 48 | 
 49 | 2017年7月，TensorFlow 1.2.0正式合入了Yahoo!提供的基于InfiniBand等高性能网络的RDMA通信方案。早在TensorFlow白皮书中，Google就表示TensorFlow支持RDMA。但可能出于商业考虑或其他原因，一直没有将RDMA方案发布到TensorFlow开源版本中。直到TensorFlow 1.2.0发布，用户终于可以在高性能网络设备上享受RDMA带来的效率提升。这有效解决了分布式训练大模型时的通信瓶颈问题。经测试，在VGG等大模型的分布式训练场景下，RDMA相比TCP/IP能够减少一半左右的网络通信开销。
 50 | 
 51 | 2017年8月，TensorFlow 1.3.0发布。它允许用户使用新增的Estimator库，以开箱即用方式快速实现深度神经网络分类器（DNNClassifier）、深度神经网络回归器（DNNRegressor）、线性分类器（LinearClassifier），以及深度神经网络和线性混合分类器（DNNLinearCombinedClassifier）。同时，从TensorFlow 1.3.0开始的所有二进制发布包都默认使用cuDNN 6，这将进一步提升TensorFlow在GPU上的运算性能。
 52 | 
 53 | 2017年11月，TensorFlow 1.4.0发布。该版本新增了tf.data模块，为数据读入和处理提供了便捷高效的解决方案。该版本还增强了Estimator的能力，使其能够支持简单的分布式模型训练和评估。同时，Google还开源了GANEstimator库，以回应越来越多的用户对生成对抗网络（GAN）的需求。
 54 | 
 55 | 回望TensorFlow的发展历程，我们不难发现Google在TensorFlow项目的推进上投入了不小的资源。在TensorFlow项目组成员和贡献者的共同努力下，TensorFlow正一步步走向成熟。
 56 | 
 57 | ### 1.1.4 TensorFlow与其他主流深度学习框架的对比
 58 | 
 59 | 放眼全球，诸如Google、Facebook、Amazon和Microsoft等国际巨头均在深度学习领域着手布局。一时间，江湖风云四起，各大门派争相斗法。Google坐拥TensorFlow，捍卫江湖地位；Facebook携手Caffe2和PyTorch，以图三分天下；Amazon拥抱MXNet，不甘落于人后；Microsoft坚守Cognitive Toolkit（CNTK），寻求单点突破。除此之外，还有Caffe、Torch7、Theano等老一辈深度学习框架参与竞争。可谓是乱花渐欲迷人眼，用户不知如何选。本小节将为用户客观分析各大主流深度学习框架的特点与优劣。
 60 | 
 61 | 下面我们从社区活跃度、多语言支持、教学资源、运算性能等多个维度全方位对比主流的深度学习框架。考虑到Keras的广大用户基础，我们也将其单独列出来进行比较。表1-1对比了2017年12月各个深度学习框架在GitHub上的统计数据。不难发现，TensorFlow在各项指标中均遥遥领先。排名第二的Keras由于接口简单易用而受到广泛关注，目前Keras官方已经支持使用TensorFlow、Theano和CNTK作为后端计算引擎。紧随其后的是老牌深度学习框架Caffe，其创始人是加州大学伯克利分校的贾扬清博士，他同时也参与了TensorFlow项目的早期设计和实现。
 62 | 
 63 | 表1-1 主流深度学习框架在GitHub上的统计数据
 64 | 
 65 | | 框架名称       | 所属机构       | 多语言支持                | Star数量 | Fork数量 | 贡献者数量 |
 66 | | ---------- | ---------- | -------------------- | ------ | ------ | ----- |
 67 | | TensorFlow | Google     | Python/C/C++/Java/Go | 84132  | 41072  | 1226  |
 68 | | Keras      | keras-team | Python               | 23530  | 8582   | 590   |
 69 | | Caffe      | BVLC       | C++/Python           | 22024  | 13509  | 253   |
 70 | | CNTK       | 微软         | C++                  | 13488  | 3522   | 158   |
 71 | | MXNet      | 亚马逊        | Python/C++/R         | 12594  | 4641   | 465   |
 72 | | PyTorch    | Facebook   | Python               | 10737  | 2225   | 375   |
 73 | | Torch7     | Facebook   | Lua                  | 7575   | 2222   | 133   |
 74 | | Theano     | 蒙特利尔大学     | Python               | 7515   | 2370   | 326   |
 75 | | Caffe2     | Facebook   | C++/Python           | 6665   | 1510   | 140   |
 76 | 
 77 | 从目前局势来看，TensorFlow是最受欢迎的深度学习框架。那么，抛开Google在行业的巨大影响力，TensorFlow自身的硬实力如何呢？我们参考了多种公开基准测评，以及我们在图像和视觉领域实际测试得到的数据，给出了表1-2这组相对客观的横向对比。表中各评价指标均为5分制。因为部分框架未能找到测试数据，所以表中仅列出了指标相对确定的深度学习框架。
 78 | 
 79 | 表1-2  主流深度学习框架在各个维度的横向对比
 80 | 
 81 | | 框架名称       | 教学资源 | 多语言接口 | 模型设计 | 运算性能 | 易用性  |
 82 | | ---------- | ---- | ----- | ---- | ---- | ---- |
 83 | | TensorFlow | 5    | 5     | 5    | 4    | 5    |
 84 | | MXNet      | 3    | 5     | 4    | 5    | 4    |
 85 | | Caffe      | 2    | 5     | 3    | 5    | 3    |
 86 | | Theano     | 3    | 2     | 5    | 3    | 2    |
 87 | | Torch7     | 2    | 2     | 5    | 3    | 3    |
 88 | | CNTK       | 2    | 2     | 2    | 5    | 3    |
 89 | 
 90 | 对于深度学习框架的初学者来说，教学资源是一个非常重要的参考指标。借助Google的强大影响力和执行力，TensorFlow在这方面具有显著优势。不论是基础的指导手册，亦或是花样百出的最佳实践，初学者都有大量资料可以查询。同时，人工智能相关的会议和期刊论文中发布的新模型和新算法，几乎都会有人第一时间使用TensorFlow实现，并在GitHub上开源出来。
 91 | 
 92 | 从多语言接口的角度来看，TensorFlow和MXNet共同占据领先地位。几乎所有框架都支持深度学习领域的“英语”——Python。TensorFlow对更多不同编程范式语言的支持使之对于不同背景的用户都具有一定的友好性，同时也扩展了框架潜在的应用领域。
 93 | 
 94 | 从模型设计维度来看，TensorFlow采用了当前主流的基于数据流图的模型设计方式。其算子种类丰富，粒度较细，为用户提供的自由度高。相比于Caffe的配置式模型设计，TensorFlow显得更加灵活，能够适应更多的应用场景。同样使用数据流图定义模型的还有MXNet。不过，MXNet的分布式模型的约束较多，灵活性不足。
 95 | 
 96 | 自TensorFlow发布以来，运算性能似乎一直是其弱项。GitHub和Stack Overflow上的讨论帖中也时常能够看到有人对TensorFlow的内存消耗和计算速度表示遗憾。但是，随着XLA和RDMA等特性的发布，TensorFlow的性能在绝大多数情况下都不输于其他深度学习框架。如果用户能够深入了解TensorFlow的API，那么就会发现它提供了大量提升性能的配置项。在启用这些性能优化选项后，TensorFlow的运算性能甚至能够超过MXNet和Caffe。
 97 | 
 98 | TensorFlow的灵活性是它的一大优势，但同时也因为API过于丰富而带来了学习成本高的问题。尤其是对于仅研究算法和模型的开发人员，在没有时间全面了解TensorFlow运行机制和编程接口的前提下，往往觉得无从下手。针对这类用户，社区的开发者们也提出了不少解决方案，那就是以Keras、TF Layers和TF Learn等为代表的高层API封装库。这些库隐藏了TensorFlow的大量细粒度接口，以简单易懂的接口取而代之，使得读者能够快速上手编程，并实现一些单机运行的算法模型。
 99 | 
100 | 综合对比当前主流的深度学习框架，TensorFlow在各个维度都具有比较明显的优势。同时，TensorFlow社区的活跃度也远超其他社区。这会使得越来越多的深度学习从业者参与贡献TensorFlow项目，最终形成越用越好用的良性循环。
101 | 
102 | #
103 | 
104 | **Prev：**[第1章 TensorFlow系统概述](1.0_overview.md)
105 | 
106 | **Next：**[1.2 TensorFlow的设计目标](1.2_objectives.md)


--------------------------------------------------------------------------------
/contents.md:
--------------------------------------------------------------------------------
  1 | # 《深入解析TensorFlow架构设计与实现原理》
  2 | 
  3 | # 第一部分 基础篇
  4 | 
  5 | ## 第1章 TensorFlow系统概述
  6 | 
  7 | ### 1.1 TensorFlow简介
  8 | 
  9 | #### 1.1.1 TensorFlow的产生背景
 10 | 
 11 | #### 1.1.2 TensorFlow的独特价值
 12 | 
 13 | #### 1.1.3 TensorFlow的版本变迁
 14 | 
 15 | #### 1.1.4 TensorFlow的目标用户
 16 | 
 17 | #### 1.1.5 TensorFlow与其他深度学习平台的对比
 18 | 
 19 | ### 1.2 TensorFlow的设计目标
 20 | 
 21 | #### 1.2.1 灵活通用的深度学习库
 22 | 
 23 | #### 1.2.2 端云结合的人工智能引擎
 24 | 
 25 | #### 1.2.3 高性能的基础平台软件
 26 | 
 27 | ### 1.3 TensorFlow的基本架构
 28 | 
 29 | #### 1.3.1 TensorFlow的工作形态
 30 | 
 31 | #### 1.3.2 TensorFlow的组件结构
 32 | 
 33 | ### 1.4 小结
 34 | 
 35 | ## 第2章 TensorFlow环境准备
 36 | 
 37 | ### 2.1 TensorFlow的安装
 38 | 
 39 | #### 2.1.1 TensorFlow安装概述
 40 | 
 41 | #### 2.1.2 使用Anaconda安装
 42 | 
 43 | #### 2.1.3 使用原生pip安装
 44 | 
 45 | #### 2.1.4 使用Virtualenv安装
 46 | 
 47 | #### 2.1.5 使用Docker安装
 48 | 
 49 | #### 2.1.6 使用源代码编译安装
 50 | 
 51 | #### 2.1.7 Hello TensorFlow
 52 | 
 53 | ### 2.2 TensorFlow的依赖项
 54 | 
 55 | #### 2.2.1 Bazel软件构建工具
 56 | 
 57 | #### 2.2.2 Protocol Buffers数据结构序列化工具
 58 | 
 59 | #### 2.2.3 Eigen线性代数计算库
 60 | 
 61 | #### 2.2.4 CUDA统一计算设备架构
 62 | 
 63 | ### 2.3 TensorFlow的源代码结构
 64 | 
 65 | #### 2.3.1 根目录
 66 | 
 67 | #### 2.3.2 tensorflow目录
 68 | 
 69 | #### 2.3.3 tensorflow/core目录
 70 | 
 71 | #### 2.3.4 tensorflow/python目录
 72 | 
 73 | #### 2.3.5 安装目录
 74 | 
 75 | ### 2.4 小结
 76 | 
 77 | ## 第3章 TensorFlow基础概念
 78 | 
 79 | ### 3.1 TensorFlow编程范式——数据流图
 80 | 
 81 | #### 3.1.1 声明式编程与命令式编程
 82 | 
 83 | #### 3.1.2 数据流图在深度学习应用上的优势
 84 | 
 85 | #### 3.1.3 TensorFlow数据流图的基本概念
 86 | 
 87 | ### 3.2 TensorFlow数据载体——张量
 88 | 
 89 | #### 3.2.1 张量——Tensor
 90 | 
 91 | #### 3.2.2 稀疏张量——SparseTensor
 92 | 
 93 | ### 3.3 TensorFlow模型载体——操作
 94 | 
 95 | #### 3.3.1 计算节点——Operation
 96 | 
 97 | #### 3.3.2 存储节点——Variable
 98 | 
 99 | #### 3.3.3 数据节点——Placeholder
100 | 
101 | ### 3.4 TensorFlow运行环境——会话
102 | 
103 | #### 3.4.1 普通会话——Session
104 | 
105 | #### 3.4.2 交互式会话——InteractiveSession
106 | 
107 | #### 3.4.3 扩展阅读：会话实现原理
108 | 
109 | ### 3.5 TensorFlow训练机制——优化器
110 | 
111 | #### 3.5.1 损失函数与优化算法
112 | 
113 | #### 3.5.2 优化器概述
114 | 
115 | #### 3.5.3 简单梯度计算方法
116 | 
117 | #### 3.5.4 扩展阅读：高级梯度计算方法
118 | 
119 | ### 3.6 一元线性回归模型的最佳实践
120 | 
121 | ### 3.7 小结
122 | 
123 | 
124 | # 第二部分 关键模块篇
125 | 
126 | ## 第4章 TensorFlow数据处理方法
127 | 
128 | ### 4.1 输入数据集
129 | 
130 | #### 4.1.1 使用输入流水线并行读取数据
131 | 
132 | #### 4.1.2 创建批样例数据的方法
133 | 
134 | #### 4.1.3 填充数据节点的方法
135 | 
136 | #### 4.1.4 处理CIFAR-10数据集的最佳实践
137 | 
138 | ### 4.2 模型参数
139 | 
140 | #### 4.2.1 模型参数的典型使用流程
141 | 
142 | #### 4.2.2 使用tf.Variable创建、初始化和更新模型参数
143 | 
144 | #### 4.2.3 使用tf.train.Saver保存和恢复模型参数
145 | 
146 | #### 4.2.4 使用变量作用域处理复杂模型
147 | 
148 | ### 4.3 命令行参数
149 | 
150 | #### 4.3.1 使用argparse解析命令行参数
151 | 
152 | #### 4.3.2 使用tf.app.flags解析命令行参数
153 | 
154 | ### 4.4 小结
155 | 
156 | ## 第5章 TensorFlow编程框架
157 | 
158 | ### 5.1 TensorFlow单机程序编程框架
159 | 
160 | #### 5.1.1 单机程序编程框架概述
161 | 
162 | #### 5.1.2 创建单机数据流图
163 | 
164 | #### 5.1.3 创建单机会话
165 | 
166 | ### 5.2 TensorFlow分布式程序编程框架
167 | 
168 | #### 5.2.1 PS-worker架构概述
169 | 
170 | #### 5.2.2 分布式程序编程框架概述
171 | 
172 | #### 5.2.3 创建TensorFlow集群
173 | 
174 | #### 5.2.4 将操作放置到目标设备
175 | 
176 | #### 5.2.5 TensorFlow数据并行模式
177 | 
178 | #### 5.2.6 TensorFlow同步训练机制
179 | 
180 | #### 5.2.7 TensorFlow异步训练机制
181 | 
182 | #### 5.2.8 使用Supervisor管理模型训练
183 | 
184 | #### 5.2.9 Supervisor使用方法进阶
185 | 
186 | #### 5.2.10 扩展阅读：分布式会话创建原理
187 | 
188 | #### 5.2.11 分布式同步训练的最佳实践
189 | 
190 | ### 5.3 小结
191 | 
192 | ## 第6章 TensorBoard可视化工具
193 | 
194 | ### 6.1 TensorBoard概述
195 | 
196 | #### 6.1.1 TensorBoard典型用例
197 | 
198 | #### 6.1.2 TensorBoard使用流程
199 | 
200 | ### 6.2 可视化数据流图
201 | 
202 | #### 6.2.1 名字作用域与抽象节点
203 | 
204 | #### 6.2.2 可视化数据流图的最佳实践
205 | 
206 | #### 6.2.3 扩展阅读：汇总数据和事件数据
207 | 
208 | #### 6.2.4 扩展阅读：揭秘tf.summary.FileWriter工作原理
209 | 
210 | ### 6.3 可视化学习过程
211 | 
212 | #### 6.3.1 汇总操作概述
213 | 
214 | #### 6.3.2 使用tf.summary.scalar生成折线图
215 | 
216 | #### 6.3.3 使用tf.summary.histogram生成数据分布图
217 | 
218 | #### 6.3.4 使用tf.summary.image生成图像
219 | 
220 | #### 6.3.5 使用tf.summary.audio生成音频
221 | 
222 | #### 6.3.6 可视化MNIST softmax模型学习过程的最佳实践
223 | 
224 | ### 6.4 可视化高维数据
225 | 
226 | #### 6.4.1 使用TensorBoard可视化高维数据
227 | 
228 | #### 6.4.2 可视化MNIST数据集的最佳实践
229 | 
230 | ### 6.5 小结
231 | 
232 | ## 第7章 TensorFlow模型托管工具
233 | 
234 | ### 7.1 TensorFlow Serving概述
235 | 
236 | ### 7.2 TensorFlow Serving系统架构
237 | 
238 | ### 7.3 TensorFlow Serving的安装
239 | 
240 | #### 7.3.1 使用APT安装ModelServer
241 | 
242 | #### 7.3.2 使用源码编译安装ModelServer
243 | 
244 | ### 7.4 TensorFlow Serving最佳实践
245 | 
246 | #### 7.4.1 导出模型
247 | 
248 | #### 7.4.2 发布模型服务
249 | 
250 | #### 7.4.3 更新线上模型服务
251 | 
252 | ### 7.5 小结
253 | 
254 | 
255 | # 第三部分 算法模型篇
256 | 
257 | ## 第8章 深度学习概述
258 | 
259 | ### 8.1 深度学习的发展历史
260 | 
261 | #### 8.1.1 感知机模型与神经网络
262 | 
263 | #### 8.1.2 神经网络的寒冬与复苏
264 | 
265 | #### 8.1.3 神经网络的发展与第二次寒冬
266 | 
267 | #### 8.1.4 深度学习时代的到来
268 | 
269 | ### 8.2 深度学习的主要应用
270 | 
271 | #### 8.2.1 计算机视觉
272 | 
273 | #### 8.2.2 自然语言处理
274 | 
275 | #### 8.2.3 深度强化学习
276 | 
277 | ### 8.3 深度学习与TensorFlow
278 | 
279 | ### 8.4 小结
280 | 
281 | ## 第9章 卷积神经网络
282 | 
283 | ### 9.1 CNN模型简介
284 | 
285 | #### 9.1.1 卷积层
286 | 
287 | #### 9.1.2 激活层
288 | 
289 | #### 9.1.3 池化层
290 | 
291 | #### 9.1.4 全连接层
292 | 
293 | #### 9.1.5 Dropout层
294 | 
295 | #### 9.1.6 Batch Normalization (BN)层
296 | 
297 | #### 9.1.7 常用的CNN图像分类模型
298 | 
299 | ### 9.2 TensorFlow-Slim
300 | 
301 | #### 9.2.1 Datasets包和Data包
302 | 
303 | #### 9.2.2 Preprocessing包
304 | 
305 | #### 9.2.3 Deployment包
306 | 
307 | #### 9.2.4 Nets包
308 | 
309 | #### 9.2.5 TensorFlow-Slim最佳实践
310 | 
311 | ### 9.3 CNN模型的应用
312 | 
313 | #### 9.3.1 物体检测
314 | 
315 | #### 9.3.2 图像分割
316 | 
317 | ### 9.4 小结
318 | 
319 | ## 第10章 生成对抗网络
320 | 
321 | ### 10.1 GAN的原理及应用
322 | 
323 | #### 10.1.1 GAN的原理
324 | 
325 | #### 10.1.2 GAN的主要应用
326 | 
327 | ### 10.2 几类经典的GAN模型
328 | 
329 | #### 10.2.1 DCGAN
330 | 
331 | #### 10.2.2 InfoGAN
332 | 
333 | #### 10.2.3 LPGAN
334 | 
335 | #### 10.2.4 WGAN
336 | 
337 | #### 10.2.5 LS-GAN
338 | 
339 | ### 10.3 GAN模型的发展趋势
340 | 
341 | ### 10.4 小结
342 | 
343 | ## 第11章 循环神经网络
344 | 
345 | ### 11.1 RNN单元及其变种
346 | 
347 | #### 11.1.1 RNN单元
348 | 
349 | #### 11.1.2 LSTM单元
350 | 
351 | #### 11.1.3 GRU单元
352 | 
353 | #### 11.1.4 双向RNN单元
354 | 
355 | #### 11.1.5 带有其他特性的RNN单元
356 | 
357 | ### 11.2 RNN模型
358 | 
359 | #### 11.2.1 PTB-LSTM语言模型
360 | 
361 | #### 11.2.2 Seq2Seq模型
362 | 
363 | ### 11.3 小结
364 | 
365 | 
366 | # 第四部分 核心揭秘篇
367 | 
368 | ## 第12章 TensorFlow运行时核心设计与实现
369 | 
370 | ### 12.1 运行时框架概述
371 | 
372 | ### 12.2 关键数据结构
373 | 
374 | #### 12.2.1 张量相关数据结构
375 | 
376 | #### 12.2.2 设备相关数据结构
377 | 
378 | #### 12.2.3 数据流图相关数据结构
379 | 
380 | ### 12.3 公共基础机制
381 | 
382 | #### 12.3.1 内存分配
383 | 
384 | #### 12.3.2 线程管理
385 | 
386 | #### 12.3.3 多语言接口
387 | 
388 | #### 12.3.4 XLA编译技术
389 | 
390 | #### 12.3.5 单元测试框架
391 | 
392 | ### 12.4 外部环境接口
393 | 
394 | #### 12.4.1 加速器硬件接口
395 | 
396 | #### 12.4.2 系统软件接口
397 | 
398 | ### 12.5 小结
399 | 
400 | ## 第13章 通信原理与实现
401 | 
402 | ### 13.1 概述
403 | 
404 | ### 13.2 进程内通信
405 | 
406 | #### 13.2.1 通信接口
407 | 
408 | #### 13.2.2 会合点机制
409 | 
410 | #### 13.2.3 异构设备内存访问
411 | 
412 | ### 13.3 进程间通信
413 | 
414 | #### 13.3.1 gRPC通信机制
415 | 
416 | #### 13.3.2 控制通信
417 | 
418 | #### 13.3.3 数据通信
419 | 
420 | ### 13.4 RDMA通信模块
421 | 
422 | #### 13.4.1 模块结构
423 | 
424 | #### 13.4.2 消息语义
425 | 
426 | #### 13.4.3 通信流程
427 | 
428 | ### 13.5 小结
429 | 
430 | ## 第14章 数据流图计算原理与实现
431 | 
432 | ### 14.1 概述
433 | 
434 | ### 14.2 数据流图创建
435 | 
436 | #### 14.2.1 流程与抽象
437 | 
438 | #### 14.2.2 全图构造
439 | 
440 | #### 14.2.3 子图提取
441 | 
442 | #### 14.2.4 图切分
443 | 
444 | #### 14.2.5 图优化
445 | 
446 | ### 14.3 单机会话运行
447 | 
448 | #### 14.3.1 流程与抽象
449 | 
450 | #### 14.3.2 执行器获取
451 | 
452 | #### 14.3.3 输入数据填充
453 | 
454 | #### 14.3.4 图运行
455 | 
456 | #### 14.3.5 输出数据获取
457 | 
458 | #### 14.3.6 张量保存
459 | 
460 | ### 14.4 分布式会话运行
461 | 
462 | #### 14.4.1 主-从模型
463 | 
464 | #### 14.4.2 主要抽象
465 | 
466 | #### 14.4.3 Client创建会话
467 | 
468 | #### 14.4.4 Client请求图运行
469 | 
470 | #### 14.4.5 Master驱动图运行
471 | 
472 | #### 14.4.6 Worker实施图运行
473 | 
474 | ### 14.5 操作节点执行
475 | 
476 | #### 14.5.1 核函数抽象
477 | 
478 | #### 14.5.2 CPU核函数执行
479 | 
480 | #### 14.5.3 GPU核函数执行
481 | 
482 | ### 14.6 小结
483 | 
484 | 
485 | # 第五部分 生态发展篇
486 | 
487 | ## 第15章 TensorFlow生态环境
488 | 
489 | ### 15.1 TensorFlow生态环境概况
490 | 
491 | #### 15.1.1 社区托管组件
492 | 
493 | #### 15.1.2 第三方项目
494 | 
495 | ### 15.2 Keras深度学习算法库
496 | 
497 | #### 15.2.1 Keras 项目概述
498 | 
499 | #### 15.2.2 Keras 模型概述
500 | 
501 | #### 15.2.3 Keras 顺序模型
502 | 
503 | #### 15.2.4 Keras 函数式模型
504 | 
505 | ### 15.3 TensorFlow与Kubernetes生态的结合
506 | 
507 | ### 15.4 TensorFlow与Spark生态的结合
508 | 
509 | ### 15.5 TensorFlow通信优化技术
510 | 
511 | ### 15.6 TPU及神经网络处理器
512 | 
513 | ### 15.7 NNVM模块化深度学习组件
514 | 
515 | ### 15.8 TensorFlow未来发展
516 | 
517 | ### 15.9 小结
518 | 
519 | ##### 附录 A：常见问题解决方案
520 | 


--------------------------------------------------------------------------------
/code/5_control_flow_analysis/5.2_best_practice.py:
--------------------------------------------------------------------------------
  1 | """5.2_best_practice.py"""
  2 | # -*- coding: utf-8 -*-
  3 | from __future__ import absolute_import
  4 | from __future__ import division
  5 | from __future__ import print_function
  6 | 
  7 | import math
  8 | import sys
  9 | import tempfile
 10 | import time
 11 | 
 12 | import tensorflow as tf
 13 | from tensorflow.examples.tutorials.mnist import input_data
 14 | 
 15 | 
# Command-line flags for the distributed MNIST example. Every value is read
# through FLAGS inside main().
flags = tf.app.flags
# Locations of the input data and of the checkpoint/summary output.
flags.DEFINE_string("data_dir", "/tmp/mnist-data",
                    "Directory for storing mnist data")
flags.DEFINE_string("train_dir", "/tmp/mnist-log",
                    "Directory for storing checkpoint and summary files")
# Index of this task within its job; main() treats task_index == 0 as the
# chief worker and rejects a missing value.
flags.DEFINE_integer("task_index", None,
                     "Worker task index, should be >= 0. task_index=0 is "
                     "the master worker task the performs the variable "
                     "initialization ")
# GPUs per machine; main() places ops on the CPU when this is 0.
flags.DEFINE_integer("num_gpus", 0, "Total number of gpus for each machine."
                     "If you don't use GPU, please set it to '0'")
# Only consulted when sync_replicas is set; main() falls back to the number
# of workers when this is left as None.
flags.DEFINE_integer("replicas_to_aggregate", None,
                     "Number of replicas to aggregate before parameter update "
                     "is applied (For sync_replicas mode only; default: "
                     "num_workers)")
# Model and training hyper-parameters.
flags.DEFINE_integer("hidden_units", 100,
                     "Number of units in the hidden layer of the NN")
flags.DEFINE_integer("train_steps", 200,
                     "Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 100, "Training batch size")
flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
# Selects synchronous (True) or asynchronous (False) training in main().
flags.DEFINE_boolean("sync_replicas", False,
                     "Use the sync_replicas (synchronized replicas) mode, "
                     "wherein the parameter updates from workers are aggregated "
                     "before applied to avoid stale gradients")
# Cluster layout: main() splits both host lists on "," to build the
# ClusterSpec.
flags.DEFINE_string("ps_hosts","localhost:2222",
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224",
                    "Comma-separated list of hostname:port pairs")
# Role of this process; main() rejects a missing or empty value.
flags.DEFINE_string("job_name", None, "job name: worker or ps")

FLAGS = flags.FLAGS


# Side length used for the input images; the model consumes flattened
# vectors of IMAGE_PIXELS * IMAGE_PIXELS values.
IMAGE_PIXELS = 28
 51 | 
 52 | 
 53 | def main(unused_argv):
 54 |   mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 55 | 
 56 |   if FLAGS.job_name is None or FLAGS.job_name == "":
 57 |     raise ValueError("Must specify an explicit `job_name`")
 58 |   if FLAGS.task_index is None or FLAGS.task_index == "":
 59 |     raise ValueError("Must specify an explicit `task_index`")
 60 | 
 61 |   # 解析ps和worker的主机名列表
 62 |   ps_spec = FLAGS.ps_hosts.split(",")
 63 |   worker_spec = FLAGS.worker_hosts.split(",")
 64 | 
 65 |   # 计算worker的数量
 66 |   num_workers = len(worker_spec)
 67 | 
 68 |   cluster = tf.train.ClusterSpec({
 69 |       "ps": ps_spec,
 70 |       "worker": worker_spec})
 71 |   
 72 |   # 如果是ps，直接启动服务，并开始监听worker发起的请求
 73 |   if FLAGS.job_name == "ps":
 74 |       server.join()
 75 | 
 76 |   # 判断当前是否为chief worker的任务进程
 77 |   is_chief = (FLAGS.task_index == 0)
 78 | 
 79 |   if FLAGS.num_gpus > 0:
 80 |     # 假设每台机器的 GPU 数量都相同时，为每台机器的每个 GPU 依次分配一个计算任务。
 81 |     gpu = (FLAGS.task_index % FLAGS.num_gpus)
 82 |     worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
 83 |   elif FLAGS.num_gpus == 0:
 84 |     # 如果没有 GPU，直接将计算任务分配到 CPU
 85 |     cpu = 0
 86 |     worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
 87 | 
 88 |   # 根据TensorFlow集群的定义和当前设备的信息，放置对应的模型参数和计算操作
 89 |   with tf.device(
 90 |       tf.train.replica_device_setter(
 91 |           worker_device=worker_device,
 92 |           ps_device="/job:ps/cpu:0",
 93 |           cluster=cluster)):
 94 |       global_step = tf.Variable(0, name="global_step", trainable=False)
 95 | 
 96 |       # 隐层模型参数
 97 |       hid_w = tf.Variable(
 98 |           tf.truncated_normal(
 99 |               [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
100 |               stddev=1.0 / IMAGE_PIXELS),
101 |           name="hid_w")
102 |       hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")
103 | 
104 |       # softmax层模型参数
105 |       sm_w = tf.Variable(
106 |           tf.truncated_normal(
107 |               [FLAGS.hidden_units, 10],
108 |               stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
109 |           name="sm_w")
110 |       sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
111 | 
112 |       # 根据任务编号放置对应的placeholder
113 |       x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
114 |       y_ = tf.placeholder(tf.float32, [None, 10])
115 |       # tf.nn.xw_plus_b即为matmul(x, w) + b
116 |       hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
117 |       # 使用relu作为激活函数，hid为隐层输出
118 |       hid = tf.nn.relu(hid_lin)
119 |       # 定义softmax层的输出y，即推理计算出的标签值
120 |       y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
121 |       # 使用交叉熵评估两个概率分布间的相似性。因为概率取值范围为[0, 1]，
122 |       # 同时避免出现无意义的log(0)，所以裁剪y值到区间[1e-10, 1.0]
123 |       cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
124 |       # 使用Adam做最优化求解
125 |       opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
126 |     
127 |     # 如果使用同步训练机制
128 |     if FLAGS.sync_replicas:
129 |       # 如果用户没有输入并行副本数，则令其等于worker任务数
130 |       if FLAGS.replicas_to_aggregate is None:
131 |         replicas_to_aggregate = num_workers
132 |       # 如果用户输入了并行副本数，则赋值为命令行解析的并行副本数
133 |       else:
134 |         replicas_to_aggregate = FLAGS.replicas_to_aggregate
135 |       # 创建同步优化器实例，负责计算梯度和更新模型参数
136 |       opt = tf.train.SyncReplicasOptimizer(
137 |           opt,
138 |           replicas_to_aggregate=replicas_to_aggregate,
139 |           total_num_replicas=num_workers,
140 |           name="mnist_sync_replicas")
141 |     # 单步训练操作，即利用同步优化器最优化交叉熵
142 |     train_op = opt.minimize(cross_entropy, global_step=global_step)
143 | 
144 |     # 使用同步训练机制
145 |     if FLAGS.sync_replicas:
146 |       # 其它worker：为local_step设置初始值
147 |       local_init_op = opt.local_step_init_op
148 |       # chief worker：为global_step设置初始值
149 |       if is_chief:
150 |         local_init_op = opt.chief_init_op
151 |       # 定义为未初始化的Variable设置初始值的操作
152 |       ready_for_local_init_op = opt.ready_for_local_init_op
153 | 
154 |       # 定义启动同步标记队列的QueueRunner实例
155 |       chief_queue_runner = opt.get_chief_queue_runner()
156 |       # 定义为同步标记队列入队初始值的操作
157 |       sync_init_op = opt.get_init_tokens_op()
158 |     # 定义为全局Variable设置初始值的操作
159 |     init_op = tf.global_variables_initializer()
160 | 
161 |     # 使用同步训练机制，传入本地初始化相关操作
162 |     if FLAGS.sync_replicas:
163 |       sv = tf.train.Supervisor(
164 |           is_chief=is_chief,
165 |           logdir=FLAGS.train_dir,
166 |           init_op=init_op,
167 |           local_init_op=local_init_op,
168 |           ready_for_local_init_op=ready_for_local_init_op,
169 |           recovery_wait_secs=1,
170 |           global_step=global_step)
171 |     # 使用异步更新机制，各worker独自训练，与单机模型一致
172 |     else:
173 |       sv = tf.train.Supervisor(
174 |           is_chief=is_chief,
175 |           logdir=FLAGS.train_dir,
176 |           init_op=init_op,
177 |           recovery_wait_secs=1,
178 |           global_step=global_step)
179 |     # 配置分布式会话：
180 |     #     在没有可用的GPU时，将操作放置到CPU
181 |     #     不打印设备放置信息
182 |     #     过滤未绑定在ps和worker上的操作
183 |     sess_config = tf.ConfigProto(
184 |         allow_soft_placement=True,  
185 |         log_device_placement=False, 
186 |         device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])
187 | 
188 |     # 如果是chief worker，则初始化所有worker的分布式会话
189 |     if is_chief:
190 |       print("Worker %d: Initializing session..." % FLAGS.task_index)
191 |     # 如果是其它worker，则等待chief worker返回的会话
192 |     else:
193 |       print("Worker %d: Waiting for session to be initialized..." %
194 |             FLAGS.task_index)
195 | 
196 |     sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
197 | 
198 |     print("Worker %d: Session initialization complete." % FLAGS.task_index)
199 |     # 如果是同步更新模式，并且当前进程为chief worker
200 |     if FLAGS.sync_replicas and is_chief:
201 |       # 初始化同步标记队列
202 |       sess.run(sync_init_op)
203 |       # 通过queue runner启动3个线程，并运行各自的标准服务
204 |       sv.start_queue_runners(sess, [chief_queue_runner])
205 | 
206 |     # 记录并打印训练开始前的时间
207 |     time_begin = time.time()
208 |     print("Training begins @ %f" % time_begin)
209 |     # 将local_step赋值为0
210 |     local_step = 0
211 |     while True:
212 |       # 填充训练数据
213 |       batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
214 |       train_feed = {x: batch_xs, y_: batch_ys}
215 |       # 执行单步训练操作
216 |       _, step = sess.run([train_op, global_step], feed_dict=train_feed)
217 |       local_step += 1
218 |       # 记录并打印完成当前单步训练所需的时间
219 |       now = time.time()
220 |       print("%f: Worker %d: training step %d done (global step: %d)" %
221 |             (now, FLAGS.task_index, local_step, step))
222 |       # 如果当前超过最大训练步数，退出训练循环
223 |       if step >= FLAGS.train_steps:
224 |         break
225 |     # 记录并打印训练结束的时间
226 |     time_end = time.time()
227 |     print("Training ends @ %f" % time_end)
228 |     # 总训练时间为两者的时间差
229 |     training_time = time_end - time_begin
230 |     print("Training elapsed time: %f s" % training_time)
231 | 
232 |     # 填充验证数据
233 |     val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
234 |     # 在验证数据集上计算模型的交叉熵
235 |     val_xent = sess.run(cross_entropy, feed_dict=val_feed)
236 |     print("After %d training step(s), validation cross entropy = %g" %
237 |           (FLAGS.train_steps, val_xent))
238 | 
239 | 
# Script entry point. NOTE(review): tf.app.run() is expected to parse the
# flags defined above and then call main() — confirm against the TensorFlow
# version in use.
if __name__ == "__main__":
  tf.app.run()


--------------------------------------------------------------------------------
/code/11_rnn_models/11.2_ptb_word_lm.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Example / benchmark for building a PTB LSTM model.
 17 | Trains the model described in:
 18 | (Zaremba, et. al.) Recurrent Neural Network Regularization
 19 | http://arxiv.org/abs/1409.2329
 20 | There are 3 supported model configurations:
 21 | ===========================================
 22 | | config | epochs | train | valid  | test
 23 | ===========================================
 24 | | small  | 13     | 37.99 | 121.39 | 115.91
 25 | | medium | 39     | 48.45 |  86.16 |  82.07
 26 | | large  | 55     | 37.87 |  82.62 |  78.29
 27 | The exact results may vary depending on the random initialization.
 28 | The hyperparameters used in the model:
 29 | - init_scale - the initial scale of the weights
 30 | - learning_rate - the initial value of the learning rate
 31 | - max_grad_norm - the maximum permissible norm of the gradient
 32 | - num_layers - the number of LSTM layers
 33 | - num_steps - the number of unrolled steps of LSTM
 34 | - hidden_size - the number of LSTM units
 35 | - max_epoch - the number of epochs trained with the initial learning rate
 36 | - max_max_epoch - the total number of epochs for training
 37 | - keep_prob - the probability of keeping weights in the dropout layer
 38 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch"
 39 | - batch_size - the batch size
 40 | - rnn_mode - the low level implementation of lstm cell: one of CUDNN,
 41 |              BASIC, or BLOCK, representing cudnn_lstm, basic_lstm, and
 42 |              lstm_block_cell classes.
 43 | The data required for this example is in the data/ dir of the
 44 | PTB dataset from Tomas Mikolov's webpage:
 45 | $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 46 | $ tar xvf simple-examples.tgz
 47 | To run:
 48 | $ python ptb_word_lm.py --data_path=simple-examples/data/
 49 | """
 50 | from __future__ import absolute_import
 51 | from __future__ import division
 52 | from __future__ import print_function
 53 | 
 54 | import time
 55 | 
 56 | import numpy as np
 57 | import tensorflow as tf
 58 | 
 59 | import reader
 60 | import util
 61 | 
 62 | from tensorflow.python.client import device_lib
 63 | 
# Module-level aliases for the TensorFlow flag and logging modules.
flags = tf.flags
logging = tf.logging

# Command-line flags of the PTB language-model example.
flags.DEFINE_string(
    "model", "small",
    "A type of model. Possible options are: small, medium, large.")
flags.DEFINE_string("data_path", None,
                    "Where the training/test data is stored.")
flags.DEFINE_string("save_path", None,
                    "Model output directory.")
# Read by data_type() to choose between float16 and float32.
flags.DEFINE_bool("use_fp16", False,
                  "Train using 16-bit floats instead of 32bit floats")
flags.DEFINE_integer("num_gpus", 1,
                     "If larger than 1, Grappler AutoParallel optimizer "
                     "will create multiple training replicas with each GPU "
                     "running one replica.")
flags.DEFINE_string("rnn_mode", None,
                    "The low level implementation of lstm cell: one of CUDNN, "
                    "BASIC, and BLOCK, representing cudnn_lstm, basic_lstm, "
                    "and lstm_block_cell classes.")
FLAGS = flags.FLAGS
# Values that config.rnn_mode is compared against when PTBModel selects the
# LSTM implementation.
BASIC = "basic"
CUDNN = "cudnn"
BLOCK = "block"
 88 | 
 89 | 
 90 | def data_type():
 91 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 92 | 
 93 | 
 94 | class PTBInput(object):
 95 |   """The input data."""
 96 | 
 97 |   def __init__(self, config, data, name=None):
 98 |     self.batch_size = batch_size = config.batch_size
 99 |     self.num_steps = num_steps = config.num_steps
100 |     self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
101 |     self.input_data, self.targets = reader.ptb_producer(
102 |         data, batch_size, num_steps, name=name)
103 | 
104 | 
105 | class PTBModel(object):
106 |   """The PTB model."""
107 | 
  def __init__(self, is_training, config, input_):
    """Build the model graph: embedding lookup, RNN, output layer and loss.

    When `is_training` is true the constructor additionally creates the
    learning-rate variable, the clipped-gradient training op and the ops
    used to update the learning rate.

    Args:
      is_training: Whether the model is built for training.
      config: Configuration object; this method reads hidden_size,
        vocab_size, keep_prob and max_grad_norm from it.
      input_: Input object providing batch_size, num_steps, input_data and
        targets (a PTBInput, according to the original comment).
    """
    # Whether this model is going to be trained.
    self._is_training = is_training
    # The object describing the input data of the model.
    self._input = input_
    # Model parameters; only set by the CUDNN graph builder.
    self._rnn_params = None
    # RNN cell the model is made of; only set by the CUDNN graph builder.
    self._cell = None
    # Batch size of the input data.
    self.batch_size = input_.batch_size
    # Length of the input data, counted in words.
    self.num_steps = input_.num_steps
    # Number of hidden units; also used as the length of each word embedding.
    size = config.hidden_size
    # Number of words in the vocabulary of the data set.
    vocab_size = config.vocab_size
    # Perform the embedding lookup on the CPU.
    with tf.device("/cpu:0"):
      # `embedding` is the word-embedding matrix with vocab_size rows and
      # `size` columns; each row is the embedding of one word.
      embedding = tf.get_variable(
          "embedding", [vocab_size, size], dtype=data_type())
      # Look up the embeddings of the word ids held in input_.input_data.
      inputs = tf.nn.embedding_lookup(embedding, input_.input_data)
    # During training, and when config.keep_prob is below 1, the embedded
    # inputs pass through a dropout layer before entering the model.
    # Dropout reduces the risk of over-fitting to some extent.
    if is_training and config.keep_prob < 1:
      inputs = tf.nn.dropout(inputs, config.keep_prob)

    output, state = self._build_rnn_graph(inputs, config, is_training)
    # Output layer: the RNN output is multiplied by softmax_w and softmax_b
    # is added, giving logits with one entry per vocabulary word.
    softmax_w = tf.get_variable(
        "softmax_w", [size, vocab_size], dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    # tf.contrib.seq2seq.sequence_loss expects logits shaped
    # [batch_size, num_steps, vocab_size], so reshape accordingly.
    logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

    # Compare logits with input_.targets using uniform weights; the loss is
    # averaged over the batch dimension but not over the time steps.
    loss = tf.contrib.seq2seq.sequence_loss(
        logits,
        input_.targets,
        tf.ones([self.batch_size, self.num_steps], dtype=data_type()),
        average_across_timesteps=False,
        average_across_batch=True)

    # Update the cost
    # Reduce all values of `loss` to a single scalar stored in self._cost.
    self._cost = tf.reduce_sum(loss)
    self._final_state = state

    # Everything below is only needed for training.
    if not is_training:
      return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    # Clip the gradients to the limit set by config.max_grad_norm in order
    # to prevent exploding gradients.
    grads, _ = tf.clip_by_global_norm(tf.gradients(self._cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.train.get_or_create_global_step())
    # Placeholder and assign op that let the learning rate be changed while
    # training is in progress (see assign_lr).
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
172 | 
173 |   def _build_rnn_graph(self, inputs, config, is_training):
174 |     if config.rnn_mode == CUDNN:
175 |       return self._build_rnn_graph_cudnn(inputs, config, is_training)
176 |     else:
177 |       return self._build_rnn_graph_lstm(inputs, config, is_training)
178 | 
  def _build_rnn_graph_cudnn(self, inputs, config, is_training):
    """Build the inference graph using CUDNN cell.

    Stores the cell in self._cell, its parameter variable in
    self._rnn_params and the zero initial state in self._initial_state.

    Args:
      inputs: Embedded input tensor. NOTE(review): presumably shaped
        [batch_size, num_steps, hidden_size] — confirm against the caller.
      config: Configuration object; num_layers, hidden_size, keep_prob and
        init_scale are read here.
      is_training: Whether the graph is built for training; dropout is only
        enabled in that case.

    Returns:
      A pair (outputs, state): `outputs` reshaped to
      [-1, config.hidden_size], and `state` as a one-element tuple holding
      an LSTMStateTuple of the final h and c.
    """
    # Swap the first two axes of the input before feeding the CUDNN cell.
    inputs = tf.transpose(inputs, [1, 0, 2])
    self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(
        num_layers=config.num_layers,
        num_units=config.hidden_size,
        input_size=config.hidden_size,
        dropout=1 - config.keep_prob if is_training else 0)
    params_size_t = self._cell.params_size()
    # All cell parameters live in one flat variable whose size is given by
    # the cell; shape validation is turned off for it.
    self._rnn_params = tf.get_variable(
        "lstm_params",
        initializer=tf.random_uniform(
            [params_size_t], -config.init_scale, config.init_scale),
        validate_shape=False)
    # Zero initial cell and hidden states, one slice per layer.
    c = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
                 tf.float32)
    h = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
                 tf.float32)
    self._initial_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),)
    outputs, h, c = self._cell(inputs, h, c, self._rnn_params, is_training)
    # Undo the axis swap, then flatten to rows of hidden_size values.
    outputs = tf.transpose(outputs, [1, 0, 2])
    outputs = tf.reshape(outputs, [-1, config.hidden_size])
    return outputs, (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),)
202 | 
203 |   def _get_lstm_cell(self, config, is_training):
204 |     if config.rnn_mode == BASIC:
205 |       return tf.contrib.rnn.BasicLSTMCell(
206 |           config.hidden_size, forget_bias=0.0, state_is_tuple=True,
207 |           reuse=not is_training)
208 |     if config.rnn_mode == BLOCK:
209 |       return tf.contrib.rnn.LSTMBlockCell(
210 |           config.hidden_size, forget_bias=0.0)
211 |     raise ValueError("rnn_mode %s not supported" % config.rnn_mode)
212 | 
  def _build_rnn_graph_lstm(self, inputs, config, is_training):
    """Build the inference graph using canonical LSTM cells.

    Stacks config.num_layers cells into a MultiRNNCell, stores its zero
    state in self._initial_state and unrolls it for self.num_steps steps.

    Args:
      inputs: Embedded input tensor, indexed below as
        inputs[:, time_step, :].
      config: Configuration object; num_layers, keep_prob, batch_size and
        hidden_size are read here.
      is_training: Whether the graph is built for training; output dropout
        is only added in that case and when config.keep_prob < 1.

    Returns:
      A pair (output, state): the per-step cell outputs concatenated and
      reshaped to [-1, config.hidden_size], and the state after the last
      step.
    """
    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    def make_cell():
      # One layer of the stack, optionally wrapped with output dropout.
      cell = self._get_lstm_cell(config, is_training)
      if is_training and config.keep_prob < 1:
        cell = tf.contrib.rnn.DropoutWrapper(
            cell, output_keep_prob=config.keep_prob)
      return cell

    cell = tf.contrib.rnn.MultiRNNCell(
        [make_cell() for _ in range(config.num_layers)], state_is_tuple=True)

    self._initial_state = cell.zero_state(config.batch_size, data_type())
    state = self._initial_state
    # Simplified version of tf.nn.static_rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use tf.nn.static_rnn() or tf.nn.static_state_saving_rnn().
    #
    # The alternative version of the code below is:
    #
    # inputs = tf.unstack(inputs, num=self.num_steps, axis=1)
    # outputs, state = tf.nn.static_rnn(cell, inputs,
    #                                   initial_state=self._initial_state)
    outputs = []
    with tf.variable_scope("RNN"):
      for time_step in range(self.num_steps):
        # Variables are created at the first step and reused afterwards.
        if time_step > 0: tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell(inputs[:, time_step, :], state)
        outputs.append(cell_output)
    output = tf.reshape(tf.concat(outputs, 1), [-1, config.hidden_size])
    return output, state
247 | 
248 |   def assign_lr(self, session, lr_value):
249 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
250 | 
251 |   def export_ops(self, name):
252 |     """Exports ops to collections."""
253 |     self._name = name
254 |     ops = {util.with_prefix(self._name, "cost"): self._cost}
255 |     if self._is_training:
256 |       ops.update(lr=self._lr, new_lr=self._new_lr, lr_update=self._lr_update)
257 |       if self._rnn_params:
258 |         ops.update(rnn_params=self._rnn_params)
259 |     for name, op in ops.items():
260 |       tf.add_to_collection(name, op)
261 |     self._initial_state_name = util.with_prefix(self._name, "initial")
262 |     self._final_state_name = util.with_prefix(self._name, "final")
263 |     util.export_state_tuples(self._initial_state, self._initial_state_name)
264 |     util.export_state_tuples(self._final_state, self._final_state_name)
265 | 
266 |   def import_ops(self):
267 |     """Imports ops from collections."""
268 |     if self._is_training:
269 |       self._train_op = tf.get_collection_ref("train_op")[0]
270 |       self._lr = tf.get_collection_ref("lr")[0]
271 |       self._new_lr = tf.get_collection_ref("new_lr")[0]
272 |       self._lr_update = tf.get_collection_ref("lr_update")[0]
273 |       rnn_params = tf.get_collection_ref("rnn_params")
274 |       if self._cell and rnn_params:
275 |         params_saveable = tf.contrib.cudnn_rnn.RNNParamsSaveable(
276 |             self._cell,
277 |             self._cell.params_to_canonical,
278 |             self._cell.canonical_to_params,
279 |             rnn_params,
280 |             base_variable_scope="Model/RNN")
281 |         tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
282 |     self._cost = tf.get_collection_ref(util.with_prefix(self._name, "cost"))[0]
283 |     num_replicas = FLAGS.num_gpus if self._name == "Train" else 1
284 |     self._initial_state = util.import_state_tuples(
285 |         self._initial_state, self._initial_state_name, num_replicas)
286 |     self._final_state = util.import_state_tuples(
287 |         self._final_state, self._final_state_name, num_replicas)
288 | 
  @property
  def input(self):
    """Return the input object stored in `self._input`."""
    return self._input
292 | 
  @property
  def initial_state(self):
    """Return the state stored in `self._initial_state`."""
    return self._initial_state
296 | 
  @property
  def cost(self):
    """Return the cost op stored in `self._cost`."""
    return self._cost
300 | 
  @property
  def final_state(self):
    """Return the state stored in `self._final_state`."""
    return self._final_state
304 | 
  @property
  def lr(self):
    """Return the learning-rate object stored in `self._lr`."""
    return self._lr
308 | 
  @property
  def train_op(self):
    """Return the training op stored in `self._train_op`."""
    return self._train_op
312 | 
  @property
  def initial_state_name(self):
    """Return the collection name assigned to the initial state by export_ops()."""
    return self._initial_state_name
316 | 
  @property
  def final_state_name(self):
    """Return the collection name assigned to the final state by export_ops()."""
    return self._final_state_name
320 | 
321 | 
class SmallConfig(object):
  """Hyperparameters returned by get_config() when FLAGS.model is "small"."""

  # Network shape.
  num_layers = 2      # number of stacked LSTM cells
  hidden_size = 200   # size passed to each LSTM cell
  vocab_size = 10000
  rnn_mode = BLOCK    # default cell implementation; get_config() may override

  # Batching.
  batch_size = 20
  num_steps = 20

  # Initialization and regularization.
  init_scale = 0.1    # weights are drawn uniformly from [-init_scale, init_scale]
  keep_prob = 1.0     # dropout is applied only in training and when this is < 1
  max_grad_norm = 5   # presumably the gradient-clipping norm -- TODO confirm

  # Learning-rate schedule used by main().
  learning_rate = 1.0
  max_epoch = 4       # epochs run at the full learning rate before decay starts
  max_max_epoch = 13  # total number of training epochs
  lr_decay = 0.5      # per-epoch decay factor once max_epoch is exceeded
337 | 
338 | 
class MediumConfig(object):
  """Hyperparameters returned by get_config() when FLAGS.model is "medium"."""

  # Network shape.
  num_layers = 2      # number of stacked LSTM cells
  hidden_size = 650   # size passed to each LSTM cell
  vocab_size = 10000
  rnn_mode = BLOCK    # default cell implementation; get_config() may override

  # Batching.
  batch_size = 20
  num_steps = 35

  # Initialization and regularization.
  init_scale = 0.05   # weights are drawn uniformly from [-init_scale, init_scale]
  keep_prob = 0.5     # dropout is applied only in training and when this is < 1
  max_grad_norm = 5   # presumably the gradient-clipping norm -- TODO confirm

  # Learning-rate schedule used by main().
  learning_rate = 1.0
  max_epoch = 6       # epochs run at the full learning rate before decay starts
  max_max_epoch = 39  # total number of training epochs
  lr_decay = 0.8      # per-epoch decay factor once max_epoch is exceeded
354 | 
355 | 
class LargeConfig(object):
  """Hyperparameters returned by get_config() when FLAGS.model is "large"."""

  # Network shape.
  num_layers = 2       # number of stacked LSTM cells
  hidden_size = 1500   # size passed to each LSTM cell
  vocab_size = 10000
  rnn_mode = BLOCK     # default cell implementation; get_config() may override

  # Batching.
  batch_size = 20
  num_steps = 35

  # Initialization and regularization.
  init_scale = 0.04    # weights are drawn uniformly from [-init_scale, init_scale]
  keep_prob = 0.35     # dropout is applied only in training and when this is < 1
  max_grad_norm = 10   # presumably the gradient-clipping norm -- TODO confirm

  # Learning-rate schedule used by main().
  learning_rate = 1.0
  max_epoch = 14       # epochs run at the full learning rate before decay starts
  max_max_epoch = 55   # total number of training epochs
  lr_decay = 1 / 1.15  # per-epoch decay factor once max_epoch is exceeded
371 | 
372 | 
class TestConfig(object):
  """Tiny hyperparameters for testing; used when FLAGS.model is "test"."""

  # Network shape.
  num_layers = 1      # number of stacked LSTM cells
  hidden_size = 2     # size passed to each LSTM cell
  vocab_size = 10000
  rnn_mode = BLOCK    # default cell implementation; get_config() may override

  # Batching.
  batch_size = 20
  num_steps = 2

  # Initialization and regularization.
  init_scale = 0.1    # weights are drawn uniformly from [-init_scale, init_scale]
  keep_prob = 1.0     # dropout is applied only in training and when this is < 1
  max_grad_norm = 1   # presumably the gradient-clipping norm -- TODO confirm

  # Learning-rate schedule used by main().
  learning_rate = 1.0
  max_epoch = 1       # epochs run at the full learning rate before decay starts
  max_max_epoch = 1   # total number of training epochs
  lr_decay = 0.5      # per-epoch decay factor once max_epoch is exceeded
388 | 
389 | 
def run_epoch(session, model, eval_op=None, verbose=False):
  """Run `model` over one epoch of its input and return the perplexity.

  Args:
    session: session used to run the model's ops.
    model: object exposing `initial_state`, `final_state`, `cost` and
      `input` (the latter with `epoch_size`, `num_steps`, `batch_size`).
    eval_op: optional extra op fetched on every step; main() passes the
      training op here so that the epoch also updates the weights.
    verbose: when True, print progress, running perplexity and speed
      roughly ten times per epoch.

  Returns:
    `exp(total_cost / total_time_steps)` accumulated over the epoch.
  """
  start_time = time.time()
  costs = 0.0
  iters = 0
  state = session.run(model.initial_state)

  fetches = {
      "cost": model.cost,
      "final_state": model.final_state,
  }
  if eval_op is not None:
    fetches["eval_op"] = eval_op

  epoch_size = model.input.epoch_size
  # Report about ten times per epoch. The interval is clamped to 1 so that
  # epochs shorter than 10 steps do not divide by zero, and the offset is
  # reduced modulo the interval so that it stays reachable for short epochs
  # (for epoch_size >= 110 this is exactly the original "== 10" schedule).
  log_interval = max(1, epoch_size // 10)
  log_offset = 10 % log_interval

  for step in range(epoch_size):
    # Carry the recurrent state over from the previous step.
    feed_dict = {}
    for i, (c, h) in enumerate(model.initial_state):
      feed_dict[c] = state[i].c
      feed_dict[h] = state[i].h

    vals = session.run(fetches, feed_dict)
    cost = vals["cost"]
    state = vals["final_state"]

    costs += cost
    iters += model.input.num_steps

    if verbose and step % log_interval == log_offset:
      # Guard against a zero elapsed time on coarse clocks.
      elapsed = max(time.time() - start_time, 1e-12)
      print("%.3f perplexity: %.3f speed: %.0f wps" %
            (step * 1.0 / epoch_size, np.exp(costs / iters),
             iters * model.input.batch_size * max(1, FLAGS.num_gpus) /
             elapsed))

  return np.exp(costs / iters)
424 | 
425 | 
def get_config():
  """Build the model configuration selected by the command-line flags.

  `FLAGS.model` picks one of the config classes. `FLAGS.rnn_mode`, when set,
  overrides the class default. The mode is then forced to BASIC when more
  (or fewer) than one GPU is requested or when TensorFlow is older than 1.3.

  Returns:
    A fresh instance of the selected config class.

  Raises:
    ValueError: if `FLAGS.model` is not "small", "medium", "large" or "test".
  """
  config_classes = {
      "small": SmallConfig,
      "medium": MediumConfig,
      "large": LargeConfig,
      "test": TestConfig,
  }
  if FLAGS.model not in config_classes:
    # Format the message; passing FLAGS.model as a second argument would
    # leave a literal "%s" in the error text.
    raise ValueError("Invalid model: %s" % FLAGS.model)
  config = config_classes[FLAGS.model]()

  if FLAGS.rnn_mode:
    config.rnn_mode = FLAGS.rnn_mode

  # Compare versions numerically: as strings, "1.10.0" < "1.3.0" is True.
  version_parts = tf.__version__.split(".")
  tf_major_minor = (int(version_parts[0]), int(version_parts[1]))
  if FLAGS.num_gpus != 1 or tf_major_minor < (1, 3):
    config.rnn_mode = BASIC
  return config
444 | 
445 | 
def main(_):
  """Build the PTB models, train them, and report perplexities.

  Steps, in order:
    1. Validate FLAGS.data_path and the requested number of GPUs.
    2. Build the train/valid/test models in one graph (sharing variables
       under the "Model" scope), export their ops and serialize the graph.
    3. Import the serialized graph into a fresh graph and train for
       `config.max_max_epoch` epochs inside a Supervisor-managed session,
       printing train and validation perplexity after each epoch and the
       test perplexity at the end.
    4. Save the model when FLAGS.save_path is set.

  Raises:
    ValueError: if the data path is missing, more GPUs are requested than
      are available, or multiple GPUs are requested on TensorFlow < 1.1.0.
  """
  if not FLAGS.data_path:
    raise ValueError("Must set --data_path to PTB data directory")

  local_gpus = [
      device.name
      for device in device_lib.list_local_devices()
      if device.device_type == "GPU"
  ]
  available_gpus = len(local_gpus)
  if FLAGS.num_gpus > available_gpus:
    raise ValueError(
        "Your machine has only %d gpus "
        "which is less than the requested --num_gpus=%d."
        % (available_gpus, FLAGS.num_gpus))

  train_data, valid_data, test_data, _ = reader.ptb_raw_data(FLAGS.data_path)

  config = get_config()
  # The test model is evaluated one token at a time.
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(
        -config.init_scale, config.init_scale)

    with tf.name_scope("Train"):
      train_input = PTBInput(config=config, data=train_data, name="TrainInput")
      with tf.variable_scope("Model", reuse=None, initializer=initializer):
        train_model = PTBModel(
            is_training=True, config=config, input_=train_input)
      tf.summary.scalar("Training Loss", train_model.cost)
      tf.summary.scalar("Learning Rate", train_model.lr)

    with tf.name_scope("Valid"):
      valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        valid_model = PTBModel(
            is_training=False, config=config, input_=valid_input)
      tf.summary.scalar("Validation Loss", valid_model.cost)

    with tf.name_scope("Test"):
      test_input = PTBInput(
          config=eval_config, data=test_data, name="TestInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        test_model = PTBModel(
            is_training=False, config=eval_config, input_=test_input)

    models = {"Train": train_model, "Valid": valid_model, "Test": test_model}
    for model_name, model in models.items():
      model.export_ops(model_name)
    metagraph = tf.train.export_meta_graph()

    if tf.__version__ < "1.1.0" and FLAGS.num_gpus > 1:
      raise ValueError("num_gpus > 1 is not supported for TensorFlow versions "
                       "below 1.1.0")

    # Soft placement is only enabled for the multi-GPU case.
    soft_placement = FLAGS.num_gpus > 1
    if soft_placement:
      util.auto_parallel(metagraph, train_model)

  with tf.Graph().as_default():
    tf.train.import_meta_graph(metagraph)
    for model in models.values():
      model.import_ops()

    supervisor = tf.train.Supervisor(logdir=FLAGS.save_path)
    config_proto = tf.ConfigProto(allow_soft_placement=soft_placement)
    with supervisor.managed_session(config=config_proto) as session:
      for epoch in range(config.max_max_epoch):
        epoch_number = epoch + 1
        # Full learning rate for the first max_epoch epochs, then
        # exponential decay.
        decay_exponent = max(epoch_number - config.max_epoch, 0.0)
        lr_decay = config.lr_decay ** decay_exponent
        train_model.assign_lr(session, config.learning_rate * lr_decay)

        print("Epoch: %d Learning rate: %.3f" %
              (epoch_number, session.run(train_model.lr)))
        train_perplexity = run_epoch(
            session, train_model, eval_op=train_model.train_op, verbose=True)
        print("Epoch: %d Train Perplexity: %.3f" %
              (epoch_number, train_perplexity))
        valid_perplexity = run_epoch(session, valid_model)
        print("Epoch: %d Valid Perplexity: %.3f" %
              (epoch_number, valid_perplexity))

      test_perplexity = run_epoch(session, test_model)
      print("Test Perplexity: %.3f" % test_perplexity)

      if FLAGS.save_path:
        print("Saving model to %s." % FLAGS.save_path)
        supervisor.saver.save(
            session, FLAGS.save_path, global_step=supervisor.global_step)
526 | 
527 | 
# Script entry point: when executed directly, hand control to tf.app.run().
if __name__ == "__main__":
  tf.app.run()
530 | 


--------------------------------------------------------------------------------
/code/10_gan_models/10.3_model.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import os
  3 | import time
  4 | import math
  5 | from glob import glob
  6 | import tensorflow as tf
  7 | import numpy as np
  8 | from six.moves import xrange
  9 | 
 10 | from ops import *
 11 | from utils import *
 12 | 
 13 | def conv_out_size_same(size, stride):
 14 |   return int(math.ceil(float(size) / float(stride)))
 15 | 
 16 | class DCGAN(object):
 17 |   def __init__(self, sess, input_height=108, input_width=108, crop=True,
 18 |          batch_size=64, sample_num = 64, output_height=64, output_width=64,
 19 |          y_dim=None, z_dim=100, gf_dim=64, df_dim=64,
 20 |          gfc_dim=1024, dfc_dim=1024, c_dim=3, dataset_name='default',
 21 |          input_fname_pattern='*.jpg', checkpoint_dir=None, sample_dir=None, data_dir='./data'):
 22 |     """
 23 |     Args:
 24 |       sess: TensorFlow session
 25 |       batch_size: The size of batch. Should be specified before training.
 26 |       y_dim: (optional) Dimension of dim for y. [None]
 27 |       z_dim: (optional) Dimension of dim for Z. [100]
 28 |       gf_dim: (optional) Dimension of gen filters in first conv layer. [64]
 29 |       df_dim: (optional) Dimension of discrim filters in first conv layer. [64]
 30 |       gfc_dim: (optional) Dimension of gen units for for fully connected layer. [1024]
 31 |       dfc_dim: (optional) Dimension of discrim units for fully connected layer. [1024]
 32 |       c_dim: (optional) Dimension of image color. For grayscale input, set to 1. [3]
 33 |     """
 34 |     self.sess = sess # 当前会话
 35 |     self.crop = crop # 是否需要裁剪判别器的输入图像,即改变原图像高和宽
 36 | 
 37 |     self.batch_size = batch_size # 训练所用的批大小
 38 |     self.sample_num = sample_num # 每次迭代中生成器采样的样本数
 39 | 
 40 |     self.input_height = input_height # 训练集中输入图像的宽度
 41 |     self.input_width = input_width # 训练集中输入图像的高度
 42 |     self.output_height = output_height # 生成器生成的图像高度
 43 |     self.output_width = output_width # 生成器生成的图像宽度
 44 | 
 45 |     self.y_dim = y_dim # 生成器或判别器的输入条件向量维度
 46 |     self.z_dim = z_dim # 生成器的输入噪声向量维度
 47 | 
 48 |     self.gf_dim = gf_dim # 生成器的第一个卷积层的输出通道数
 49 |     self.df_dim = df_dim # 判别器的第一个卷积层的输出通道数
 50 | 
 51 |     self.gfc_dim = gfc_dim # 生成器的全连接层的输出神经元个数
 52 |     self.dfc_dim = dfc_dim # 判别器的全连接层的输出神经元个数
 53 |     # 定义生成器和判别器的 BN 层
 54 |     # batch normalization : deals with poor initialization helps gradient flow
 55 |     self.d_bn1 = batch_norm(name='d_bn1')
 56 |     self.d_bn2 = batch_norm(name='d_bn2')
 57 | 
 58 |     if not self.y_dim:
 59 |       self.d_bn3 = batch_norm(name='d_bn3')
 60 | 
 61 |     self.g_bn0 = batch_norm(name='g_bn0')
 62 |     self.g_bn1 = batch_norm(name='g_bn1')
 63 |     self.g_bn2 = batch_norm(name='g_bn2')
 64 | 
 65 |     if not self.y_dim:
 66 |       self.g_bn3 = batch_norm(name='g_bn3')
 67 | 
 68 |     self.dataset_name = dataset_name
 69 |     self.input_fname_pattern = input_fname_pattern
 70 |     self.checkpoint_dir = checkpoint_dir
 71 |     self.data_dir = data_dir
 72 | 
 73 |     if self.dataset_name == 'mnist':
 74 |       self.data_X, self.data_y = self.load_mnist()
 75 |       self.c_dim = self.data_X[0].shape[-1]
 76 |     else:
 77 |       self.data = glob(os.path.join(self.data_dir, self.dataset_name, self.input_fname_pattern))
 78 |       np.random.shuffle(self.data)
 79 |       imreadImg = imread(self.data[0])
 80 |       # 定义输入图像的格式和通道数 c.dim(灰度图像的通道数为 1,彩色图像的通道数为 3)
 81 |       if len(imreadImg.shape) >= 3: #check if image is a non-grayscale image by checking channel number
 82 |         self.c_dim = imread(self.data[0]).shape[-1]
 83 |       else:
 84 |         self.c_dim = 1
 85 | 
 86 |     self.grayscale = (self.c_dim == 1)
 87 | 
 88 |     self.build_model() # 建立模型
 89 | 
 90 |   def build_model(self):
 91 |     if self.y_dim:
 92 |       # 定义生成器或者判别器的输入条件向量
 93 |       self.y = tf.placeholder(tf.float32, [self.batch_size, self.y_dim], name='y')
 94 |     else:
 95 |       self.y = None
 96 | 
 97 |     if self.crop:
 98 |       # 如果 self.crop 为 True,则在输入图像的中央裁剪出高度和宽度分别为
 99 |       # self.output_height 和 self.output_width 大小的图像
100 |       image_dims = [self.output_height, self.output_width, self.c_dim]
101 |     else:
102 |       image_dims = [self.input_height, self.input_width, self.c_dim]
103 |     # 定义判别器的输入数据,即真实的图像
104 |     self.inputs = tf.placeholder(
105 |       tf.float32, [self.batch_size] + image_dims, name='real_images')
106 | 
107 |     inputs = self.inputs
108 | 
109 |     self.z = tf.placeholder(
110 |       tf.float32, [None, self.z_dim], name='z')
111 |     self.z_sum = histogram_summary("z", self.z)
112 |     # 分别创建生成器、判别器和采样器,并得到相应的输出张量。在训练过程中,
113 |     # 每隔一定的迭代步数,采样器被调用一次,其本质是用生成器做一次推理并得到生成的图像,
114 |     # 通过对生成的图像做质量评价,我们可以知道当前生成器的好坏
115 |     self.G                  = self.generator(self.z, self.y)
116 |     self.D, self.D_logits   = self.discriminator(inputs, self.y, reuse=False)
117 |     self.sampler            = self.sampler(self.z, self.y)
118 |     self.D_, self.D_logits_ = self.discriminator(self.G, self.y, reuse=True)
119 |     
120 |     self.d_sum = histogram_summary("d", self.D)
121 |     self.d__sum = histogram_summary("d_", self.D_)
122 |     self.G_sum = image_summary("G", self.G)
123 | 
124 |     def sigmoid_cross_entropy_with_logits(x, y):
125 |       try:
126 |         return tf.nn.sigmoid_cross_entropy_with_logits(logits=x, labels=y)
127 |       except:
128 |         return tf.nn.sigmoid_cross_entropy_with_logits(logits=x, targets=y)
129 |     # 定义判别器在真实数据分布上的损失函数
130 |     self.d_loss_real = tf.reduce_mean(
131 |       sigmoid_cross_entropy_with_logits(self.D_logits, tf.ones_like(self.D)))
132 |     # 定义判别器在生成数据分布上的损失函数
133 |     self.d_loss_fake = tf.reduce_mean(
134 |       sigmoid_cross_entropy_with_logits(self.D_logits_, tf.zeros_like(self.D_)))
135 |     # 定义生成器在生成数据分布上的损失函数
136 |     self.g_loss = tf.reduce_mean(
137 |       sigmoid_cross_entropy_with_logits(self.D_logits_, tf.ones_like(self.D_)))
138 | 
139 |     self.d_loss_real_sum = scalar_summary("d_loss_real", self.d_loss_real)
140 |     self.d_loss_fake_sum = scalar_summary("d_loss_fake", self.d_loss_fake)
141 |                           
142 |     self.d_loss = self.d_loss_real + self.d_loss_fake # 判别器总的损失函数
143 | 
144 |     self.g_loss_sum = scalar_summary("g_loss", self.g_loss)
145 |     self.d_loss_sum = scalar_summary("d_loss", self.d_loss)
146 | 
147 |     t_vars = tf.trainable_variables() # 所有可被训练的参数
148 | 
149 |     self.d_vars = [var for var in t_vars if 'd_' in var.name] # 判别器中可被训练的参数
150 |     self.g_vars = [var for var in t_vars if 'g_' in var.name] # 生成器中可被训练的参数
151 | 
152 |     self.saver = tf.train.Saver()
153 | 
154 |   def train(self, config):
155 |     # 调用 Adam 优化器的 minimize 方法得到生成器和判别器的优化操作
156 |     d_optim = tf.train.AdamOptimizer(config.learning_rate, beta1=config.beta1) \
157 |               .minimize(self.d_loss, var_list=self.d_vars)
158 |     g_optim = tf.train.AdamOptimizer(config.learning_rate, beta1=config.beta1) \
159 |               .minimize(self.g_loss, var_list=self.g_vars)
160 |     # 初始化当前会话中所有的变量
161 |     try:
162 |       tf.global_variables_initializer().run()
163 |     except:
164 |       tf.initialize_all_variables().run()
165 | 
166 |     self.g_sum = merge_summary([self.z_sum, self.d__sum,
167 |       self.G_sum, self.d_loss_fake_sum, self.g_loss_sum])
168 |     self.d_sum = merge_summary(
169 |         [self.z_sum, self.d_sum, self.d_loss_real_sum, self.d_loss_sum])
170 |     self.writer = SummaryWriter("./logs", self.sess.graph)
171 | 
172 |     sample_z = np.random.uniform(-1, 1, size=(self.sample_num , self.z_dim))
173 |     
174 |     if config.dataset == 'mnist':
175 |       sample_inputs = self.data_X[0:self.sample_num]
176 |       sample_labels = self.data_y[0:self.sample_num]
177 |     else:
178 |       sample_files = self.data[0:self.sample_num]
179 |       sample = [
180 |           get_image(sample_file,
181 |                     input_height=self.input_height,
182 |                     input_width=self.input_width,
183 |                     resize_height=self.output_height,
184 |                     resize_width=self.output_width,
185 |                     crop=self.crop,
186 |                     grayscale=self.grayscale) for sample_file in sample_files]
187 |       if (self.grayscale):
188 |         sample_inputs = np.array(sample).astype(np.float32)[:, :, :, None]
189 |       else:
190 |         sample_inputs = np.array(sample).astype(np.float32)
191 |   
192 |     counter = 1 # 迭代步数的计数器
193 |     start_time = time.time()
194 |     could_load, checkpoint_counter = self.load(self.checkpoint_dir)
195 |     if could_load:
196 |       counter = checkpoint_counter
197 |       print(" [*] Load SUCCESS")
198 |     else:
199 |       print(" [!] Load failed...")
200 |     # 根据 config 中指定的 epoch 总数(默认为 25)开始循环执行训练
201 |     for epoch in xrange(config.epoch):
202 |       if config.dataset == 'mnist':
203 |         batch_idxs = min(len(self.data_X), config.train_size) // config.batch_size
204 |       else:      
205 |         self.data = glob(os.path.join(
206 |           config.data_dir, config.dataset, self.input_fname_pattern))
207 |         np.random.shuffle(self.data)
208 |         batch_idxs = min(len(self.data), config.train_size) // config.batch_size
209 | 
210 |       for idx in xrange(0, batch_idxs):
211 |         # 读取批数据用于单步训练
212 |         if config.dataset == 'mnist':
213 |           batch_images = self.data_X[idx*config.batch_size:(idx+1)*config.batch_size]
214 |           batch_labels = self.data_y[idx*config.batch_size:(idx+1)*config.batch_size]
215 |         else:
216 |           batch_files = self.data[idx*config.batch_size:(idx+1)*config.batch_size]
217 |           # 调用 utils.py 提供的 get_image 方法调整图像大小,图像的高度由 self.input_height 
218 |           # 变为 self.output_height,宽度由 self.input_width 变为 self.output_width
219 |           batch = [
220 |               get_image(batch_file,
221 |                         input_height=self.input_height,
222 |                         input_width=self.input_width,
223 |                         resize_height=self.output_height,
224 |                         resize_width=self.output_width,
225 |                         crop=self.crop,
226 |                         grayscale=self.grayscale) for batch_file in batch_files]
227 |           if self.grayscale:
228 |             batch_images = np.array(batch).astype(np.float32)[:, :, :, None]
229 |           else:
230 |             batch_images = np.array(batch).astype(np.float32)
231 | 
232 |         batch_z = np.random.uniform(-1, 1, [config.batch_size, self.z_dim]) \
233 |               .astype(np.float32)
234 |         # 对于 MNIST 数据集,执行特殊的处理逻辑
235 |         if config.dataset == 'mnist':
236 |           # Update D network
237 |           # 执行 d_optim 操作,更新判别器的模型参数
238 |           _, summary_str = self.sess.run([d_optim, self.d_sum],
239 |             feed_dict={ 
240 |               self.inputs: batch_images,
241 |               self.z: batch_z,
242 |               self.y:batch_labels,
243 |             })
244 |           self.writer.add_summary(summary_str, counter)
245 | 
246 |           # Update G network
247 |           # 执行 g_optim 操作,更新生成器的模型参数
248 |           _, summary_str = self.sess.run([g_optim, self.g_sum],
249 |             feed_dict={
250 |               self.z: batch_z, 
251 |               self.y:batch_labels,
252 |             })
253 |           self.writer.add_summary(summary_str, counter)
254 | 
255 |           # Run g_optim twice to make sure that d_loss does not go to zero (different from paper)
256 |           _, summary_str = self.sess.run([g_optim, self.g_sum],
257 |             feed_dict={ self.z: batch_z, self.y:batch_labels })
258 |           self.writer.add_summary(summary_str, counter)
259 |           
260 |           errD_fake = self.d_loss_fake.eval({
261 |               self.z: batch_z, 
262 |               self.y:batch_labels
263 |           })
264 |           errD_real = self.d_loss_real.eval({
265 |               self.inputs: batch_images,
266 |               self.y:batch_labels
267 |           })
268 |           errG = self.g_loss.eval({
269 |               self.z: batch_z,
270 |               self.y: batch_labels
271 |           })
272 |         else:
273 |           # 对于其他数据集,同样依次执行 d_optim 和 g_optim 操作,
274 |           # 该分支与 mnist 分支代码的区别在于此处的判别器和生成器没有条件约束
275 |           # Update D network
276 |           _, summary_str = self.sess.run([d_optim, self.d_sum],
277 |             feed_dict={ self.inputs: batch_images, self.z: batch_z })
278 |           self.writer.add_summary(summary_str, counter)
279 | 
280 |           # Update G network
281 |           _, summary_str = self.sess.run([g_optim, self.g_sum],
282 |             feed_dict={ self.z: batch_z })
283 |           self.writer.add_summary(summary_str, counter)
284 | 
285 |           # Run g_optim twice to make sure that d_loss does not go to zero (different from paper)
286 |           _, summary_str = self.sess.run([g_optim, self.g_sum],
287 |             feed_dict={ self.z: batch_z })
288 |           self.writer.add_summary(summary_str, counter)
289 |           
290 |           errD_fake = self.d_loss_fake.eval({ self.z: batch_z })
291 |           errD_real = self.d_loss_real.eval({ self.inputs: batch_images })
292 |           errG = self.g_loss.eval({self.z: batch_z})
293 | 
294 |         counter += 1
295 |         print("Epoch: [%2d/%2d] [%4d/%4d] time: %4.4f, d_loss: %.8f, g_loss: %.8f" \
296 |           % (epoch, config.epoch, idx, batch_idxs,
297 |             time.time() - start_time, errD_fake+errD_real, errG))
298 | 
299 |         if np.mod(counter, 100) == 1:
300 |           if config.dataset == 'mnist':
301 |             samples, d_loss, g_loss = self.sess.run(
302 |               [self.sampler, self.d_loss, self.g_loss],
303 |               feed_dict={
304 |                   self.z: sample_z,
305 |                   self.inputs: sample_inputs,
306 |                   self.y:sample_labels,
307 |               }
308 |             )
309 |             save_images(samples, image_manifold_size(samples.shape[0]),
310 |                   './{}/train_{:02d}_{:04d}.png'.format(config.sample_dir, epoch, idx))
311 |             print("[Sample] d_loss: %.8f, g_loss: %.8f" % (d_loss, g_loss)) 
312 |           else:
313 |             try:
314 |               samples, d_loss, g_loss = self.sess.run(
315 |                 [self.sampler, self.d_loss, self.g_loss],
316 |                 feed_dict={
317 |                     self.z: sample_z,
318 |                     self.inputs: sample_inputs,
319 |                 },
320 |               )
321 |               save_images(samples, image_manifold_size(samples.shape[0]),
322 |                     './{}/train_{:02d}_{:04d}.png'.format(config.sample_dir, epoch, idx))
323 |               print("[Sample] d_loss: %.8f, g_loss: %.8f" % (d_loss, g_loss)) 
324 |             except:
325 |               print("one pic error!...")
326 | 
327 |         if np.mod(counter, 500) == 2:
328 |           self.save(config.checkpoint_dir, counter)
329 | 
330 |   def discriminator(self, image, y=None, reuse=False):
331 |     with tf.variable_scope("discriminator") as scope:
332 |       if reuse:
333 |         # 复用已有变量
334 |         scope.reuse_variables()
335 |       # 如果没有条件向量 y,则建立一个带有四层卷积操作的卷积模型。为了避免梯度消失现象,
336 |       # 每层卷积之后都加 BN 层。除了最后一层激活函数为 sigmoid 之外,其他层激活函数为 leaky ReLU
337 |       if not self.y_dim:
338 |         h0 = lrelu(conv2d(image, self.df_dim, name='d_h0_conv'))
339 |         h1 = lrelu(self.d_bn1(conv2d(h0, self.df_dim*2, name='d_h1_conv')))
340 |         h2 = lrelu(self.d_bn2(conv2d(h1, self.df_dim*4, name='d_h2_conv')))
341 |         h3 = lrelu(self.d_bn3(conv2d(h2, self.df_dim*8, name='d_h3_conv')))
342 |         h4 = linear(tf.reshape(h3, [self.batch_size, -1]), 1, 'd_h4_lin')
343 | 
344 |         return tf.nn.sigmoid(h4), h4
345 |       else:
346 |         # 与 not self.y_dim 分支中建模的特点相同,除最后一层外的每一层都包含 
347 |         # BN、leaky ReLU 这两个算子,区别在于后两层不是卷积层,而是全连接层
348 |         yb = tf.reshape(y, [self.batch_size, 1, 1, self.y_dim])
349 |         x = conv_cond_concat(image, yb)
350 | 
351 |         h0 = lrelu(conv2d(x, self.c_dim + self.y_dim, name='d_h0_conv'))
352 |         h0 = conv_cond_concat(h0, yb)
353 | 
354 |         h1 = lrelu(self.d_bn1(conv2d(h0, self.df_dim + self.y_dim, name='d_h1_conv')))
355 |         h1 = tf.reshape(h1, [self.batch_size, -1])      
356 |         h1 = concat([h1, y], 1)
357 |         
358 |         h2 = lrelu(self.d_bn2(linear(h1, self.dfc_dim, 'd_h2_lin')))
359 |         h2 = concat([h2, y], 1)
360 | 
361 |         h3 = linear(h2, 1, 'd_h3_lin')
362 |         
363 |         return tf.nn.sigmoid(h3), h3
364 | 
365 |   def generator(self, z, y=None):
366 |     with tf.variable_scope("generator") as scope:
367 |       if not self.y_dim:
368 |         # 将输出图像的高度和宽度分别设定为 self.output_height 和 self.output_width。
369 |         # 因为每层反卷积操作的 stride 都设为 2,所以每层反卷积之后图像的高度和宽度都会加倍
370 |         s_h, s_w = self.output_height, self.output_width
371 |         s_h2, s_w2 = conv_out_size_same(s_h, 2), conv_out_size_same(s_w, 2)
372 |         s_h4, s_w4 = conv_out_size_same(s_h2, 2), conv_out_size_same(s_w2, 2)
373 |         s_h8, s_w8 = conv_out_size_same(s_h4, 2), conv_out_size_same(s_w4, 2)
374 |         s_h16, s_w16 = conv_out_size_same(s_h8, 2), conv_out_size_same(s_w8, 2)
375 | 
376 |         # project `z` and reshape
377 |         # 将生成器的输入噪声向量通过线性变换映射到 self.gf_dim*8*s_h16*s_w16 维度的向量
378 |         self.z_, self.h0_w, self.h0_b = linear(
379 |             z, self.gf_dim*8*s_h16*s_w16, 'g_h0_lin', with_w=True)
380 | 
381 |         self.h0 = tf.reshape(
382 |             self.z_, [-1, s_h16, s_w16, self.gf_dim * 8])
383 |         h0 = tf.nn.relu(self.g_bn0(self.h0))
384 |         # 按照调参经验,除了最后一层激活函数采用 tanh 之外,其他层激活函数都采用 ReLU
385 |         self.h1, self.h1_w, self.h1_b = deconv2d(
386 |             h0, [self.batch_size, s_h8, s_w8, self.gf_dim*4], name='g_h1', with_w=True)
387 |         h1 = tf.nn.relu(self.g_bn1(self.h1))
388 |         # 类似于 h1 的求取过程,通过连续调用反卷积操作(deconv2d)、
389 |         # BN操作(self.g_bn1、self.g_bn2、self.g_bn3)和激活操作(tf.nn.relu),得到h2和h3
390 |         h2, self.h2_w, self.h2_b = deconv2d(
391 |             h1, [self.batch_size, s_h4, s_w4, self.gf_dim*2], name='g_h2', with_w=True)
392 |         h2 = tf.nn.relu(self.g_bn2(h2))
393 | 
394 |         h3, self.h3_w, self.h3_b = deconv2d(
395 |             h2, [self.batch_size, s_h2, s_w2, self.gf_dim*1], name='g_h3', with_w=True)
396 |         h3 = tf.nn.relu(self.g_bn3(h3))
397 | 
398 |         h4, self.h4_w, self.h4_b = deconv2d(
399 |             h3, [self.batch_size, s_h, s_w, self.c_dim], name='g_h4', with_w=True)
400 | 
401 |         return tf.nn.tanh(h4)
402 |       else:
403 |         # 当有条件向量 y 时,构造一个主要由两层卷积和两层全连接组成的相对简单的模型
404 |         # 因为有条件向量 y,所以将每个激活层的输出向量与条件向量 y 连接起来,以形成条件约束
405 |         s_h, s_w = self.output_height, self.output_width
406 |         s_h2, s_h4 = int(s_h/2), int(s_h/4)
407 |         s_w2, s_w4 = int(s_w/2), int(s_w/4)
408 | 
409 |         # yb = tf.expand_dims(tf.expand_dims(y, 1),2)
410 |         yb = tf.reshape(y, [self.batch_size, 1, 1, self.y_dim])
411 |         z = concat([z, y], 1)
412 | 
413 |         h0 = tf.nn.relu(
414 |             self.g_bn0(linear(z, self.gfc_dim, 'g_h0_lin')))
415 |         h0 = concat([h0, y], 1)
416 | 
417 |         h1 = tf.nn.relu(self.g_bn1(
418 |             linear(h0, self.gf_dim*2*s_h4*s_w4, 'g_h1_lin')))
419 |         h1 = tf.reshape(h1, [self.batch_size, s_h4, s_w4, self.gf_dim * 2])
420 | 
421 |         h1 = conv_cond_concat(h1, yb)
422 | 
423 |         h2 = tf.nn.relu(self.g_bn2(deconv2d(h1,
424 |             [self.batch_size, s_h2, s_w2, self.gf_dim * 2], name='g_h2')))
425 |         h2 = conv_cond_concat(h2, yb)
426 | 
427 |         return tf.nn.sigmoid(
428 |             deconv2d(h2, [self.batch_size, s_h, s_w, self.c_dim], name='g_h3'))
429 | 
430 |   def sampler(self, z, y=None):
431 |     with tf.variable_scope("generator") as scope:
432 |       scope.reuse_variables()
433 | 
434 |       if not self.y_dim:
435 |         s_h, s_w = self.output_height, self.output_width
436 |         s_h2, s_w2 = conv_out_size_same(s_h, 2), conv_out_size_same(s_w, 2)
437 |         s_h4, s_w4 = conv_out_size_same(s_h2, 2), conv_out_size_same(s_w2, 2)
438 |         s_h8, s_w8 = conv_out_size_same(s_h4, 2), conv_out_size_same(s_w4, 2)
439 |         s_h16, s_w16 = conv_out_size_same(s_h8, 2), conv_out_size_same(s_w8, 2)
440 | 
441 |         # project `z` and reshape
442 |         h0 = tf.reshape(
443 |             linear(z, self.gf_dim*8*s_h16*s_w16, 'g_h0_lin'),
444 |             [-1, s_h16, s_w16, self.gf_dim * 8])
445 |         h0 = tf.nn.relu(self.g_bn0(h0, train=False))
446 | 
447 |         h1 = deconv2d(h0, [self.batch_size, s_h8, s_w8, self.gf_dim*4], name='g_h1')
448 |         h1 = tf.nn.relu(self.g_bn1(h1, train=False))
449 | 
450 |         h2 = deconv2d(h1, [self.batch_size, s_h4, s_w4, self.gf_dim*2], name='g_h2')
451 |         h2 = tf.nn.relu(self.g_bn2(h2, train=False))
452 | 
453 |         h3 = deconv2d(h2, [self.batch_size, s_h2, s_w2, self.gf_dim*1], name='g_h3')
454 |         h3 = tf.nn.relu(self.g_bn3(h3, train=False))
455 | 
456 |         h4 = deconv2d(h3, [self.batch_size, s_h, s_w, self.c_dim], name='g_h4')
457 | 
458 |         return tf.nn.tanh(h4)
459 |       else:
460 |         s_h, s_w = self.output_height, self.output_width
461 |         s_h2, s_h4 = int(s_h/2), int(s_h/4)
462 |         s_w2, s_w4 = int(s_w/2), int(s_w/4)
463 | 
464 |         # yb = tf.reshape(y, [-1, 1, 1, self.y_dim])
465 |         yb = tf.reshape(y, [self.batch_size, 1, 1, self.y_dim])
466 |         z = concat([z, y], 1)
467 | 
468 |         h0 = tf.nn.relu(self.g_bn0(linear(z, self.gfc_dim, 'g_h0_lin'), train=False))
469 |         h0 = concat([h0, y], 1)
470 | 
471 |         h1 = tf.nn.relu(self.g_bn1(
472 |             linear(h0, self.gf_dim*2*s_h4*s_w4, 'g_h1_lin'), train=False))
473 |         h1 = tf.reshape(h1, [self.batch_size, s_h4, s_w4, self.gf_dim * 2])
474 |         h1 = conv_cond_concat(h1, yb)
475 | 
476 |         h2 = tf.nn.relu(self.g_bn2(
477 |             deconv2d(h1, [self.batch_size, s_h2, s_w2, self.gf_dim * 2], name='g_h2'), train=False))
478 |         h2 = conv_cond_concat(h2, yb)
479 | 
480 |         return tf.nn.sigmoid(deconv2d(h2, [self.batch_size, s_h, s_w, self.c_dim], name='g_h3'))
481 | 
482 |   def load_mnist(self):
483 |     data_dir = os.path.join(self.data_dir, self.dataset_name)
484 |     
485 |     fd = open(os.path.join(data_dir,'train-images-idx3-ubyte'))
486 |     loaded = np.fromfile(file=fd,dtype=np.uint8)
487 |     trX = loaded[16:].reshape((60000,28,28,1)).astype(np.float)
488 | 
489 |     fd = open(os.path.join(data_dir,'train-labels-idx1-ubyte'))
490 |     loaded = np.fromfile(file=fd,dtype=np.uint8)
491 |     trY = loaded[8:].reshape((60000)).astype(np.float)
492 | 
493 |     fd = open(os.path.join(data_dir,'t10k-images-idx3-ubyte'))
494 |     loaded = np.fromfile(file=fd,dtype=np.uint8)
495 |     teX = loaded[16:].reshape((10000,28,28,1)).astype(np.float)
496 | 
497 |     fd = open(os.path.join(data_dir,'t10k-labels-idx1-ubyte'))
498 |     loaded = np.fromfile(file=fd,dtype=np.uint8)
499 |     teY = loaded[8:].reshape((10000)).astype(np.float)
500 | 
501 |     trY = np.asarray(trY)
502 |     teY = np.asarray(teY)
503 |     
504 |     X = np.concatenate((trX, teX), axis=0)
505 |     y = np.concatenate((trY, teY), axis=0).astype(np.int)
506 |     
507 |     seed = 547
508 |     np.random.seed(seed)
509 |     np.random.shuffle(X)
510 |     np.random.seed(seed)
511 |     np.random.shuffle(y)
512 |     
513 |     y_vec = np.zeros((len(y), self.y_dim), dtype=np.float)
514 |     for i, label in enumerate(y):
515 |       y_vec[i,y[i]] = 1.0
516 |     
517 |     return X/255.,y_vec
518 | 
519 |   @property
520 |   def model_dir(self):
521 |     return "{}_{}_{}_{}".format(
522 |         self.dataset_name, self.batch_size,
523 |         self.output_height, self.output_width)
524 |       
525 |   def save(self, checkpoint_dir, step):
526 |     model_name = "DCGAN.model"
527 |     checkpoint_dir = os.path.join(checkpoint_dir, self.model_dir)
528 | 
529 |     if not os.path.exists(checkpoint_dir):
530 |       os.makedirs(checkpoint_dir)
531 | 
532 |     self.saver.save(self.sess,
533 |             os.path.join(checkpoint_dir, model_name),
534 |             global_step=step)
535 | 
536 |   def load(self, checkpoint_dir):
537 |     import re
538 |     print(" [*] Reading checkpoints...")
539 |     checkpoint_dir = os.path.join(checkpoint_dir, self.model_dir)
540 | 
541 |     ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
542 |     if ckpt and ckpt.model_checkpoint_path:
543 |       ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
544 |       self.saver.restore(self.sess, os.path.join(checkpoint_dir, ckpt_name))
545 |       counter = int(next(re.finditer("(\d+)(?!.*\d)",ckpt_name)).group(0))
546 |       print(" [*] Success to read {}".format(ckpt_name))
547 |       return True, counter
548 |     else:
549 |       print(" [*] Failed to find a checkpoint")
550 |       return False, 0
551 | 


--------------------------------------------------------------------------------
/code/9_cnn_models/9.2_train_image_classifier.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Generic training script that trains a model using a given dataset."""
 16 | 
 17 | from __future__ import absolute_import
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | 
 21 | import tensorflow as tf
 22 | 
 23 | from datasets import dataset_factory
 24 | from deployment import model_deploy
 25 | from nets import nets_factory
 26 | from preprocessing import preprocessing_factory
 27 | 
 28 | slim = tf.contrib.slim
 29 | 
 30 | tf.app.flags.DEFINE_string(
 31 |     'master', '', 'The address of the TensorFlow master to use.')
 32 | 
 33 | tf.app.flags.DEFINE_string(
 34 |     'train_dir', '/tmp/tfmodel/',
 35 |     'Directory where checkpoints and event logs are written to.')
 36 | 
 37 | tf.app.flags.DEFINE_integer('num_clones', 1,
 38 |                             'Number of model clones to deploy.')
 39 | 
 40 | tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
 41 |                             'Use CPUs to deploy clones.')
 42 | 
 43 | tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')
 44 | 
 45 | tf.app.flags.DEFINE_integer(
 46 |     'num_ps_tasks', 0,
 47 |     'The number of parameter servers. If the value is 0, then the parameters '
 48 |     'are handled locally by the worker.')
 49 | 
 50 | tf.app.flags.DEFINE_integer(
 51 |     'num_readers', 4,
 52 |     'The number of parallel readers that read data from the dataset.')
 53 | 
 54 | tf.app.flags.DEFINE_integer(
 55 |     'num_preprocessing_threads', 4,
 56 |     'The number of threads used to create the batches.')
 57 | 
 58 | tf.app.flags.DEFINE_integer(
 59 |     'log_every_n_steps', 10,
 60 |     'The frequency with which logs are print.')
 61 | 
 62 | tf.app.flags.DEFINE_integer(
 63 |     'save_summaries_secs', 600,
 64 |     'The frequency with which summaries are saved, in seconds.')
 65 | 
 66 | tf.app.flags.DEFINE_integer(
 67 |     'save_interval_secs', 600,
 68 |     'The frequency with which the model is saved, in seconds.')
 69 | 
 70 | tf.app.flags.DEFINE_integer(
 71 |     'task', 0, 'Task id of the replica running the training.')
 72 | 
 73 | ######################
 74 | # Optimization Flags #
 75 | ######################
 76 | 
 77 | tf.app.flags.DEFINE_float(
 78 |     'weight_decay', 0.00004, 'The weight decay on the model weights.')
 79 | 
 80 | tf.app.flags.DEFINE_string(
 81 |     'optimizer', 'rmsprop',
 82 |     'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
 83 |     '"ftrl", "momentum", "sgd" or "rmsprop".')
 84 | 
 85 | tf.app.flags.DEFINE_float(
 86 |     'adadelta_rho', 0.95,
 87 |     'The decay rate for adadelta.')
 88 | 
 89 | tf.app.flags.DEFINE_float(
 90 |     'adagrad_initial_accumulator_value', 0.1,
 91 |     'Starting value for the AdaGrad accumulators.')
 92 | 
 93 | tf.app.flags.DEFINE_float(
 94 |     'adam_beta1', 0.9,
 95 |     'The exponential decay rate for the 1st moment estimates.')
 96 | 
 97 | tf.app.flags.DEFINE_float(
 98 |     'adam_beta2', 0.999,
 99 |     'The exponential decay rate for the 2nd moment estimates.')
100 | 
101 | tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')
102 | 
103 | tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
104 |                           'The learning rate power.')
105 | 
106 | tf.app.flags.DEFINE_float(
107 |     'ftrl_initial_accumulator_value', 0.1,
108 |     'Starting value for the FTRL accumulators.')
109 | 
110 | tf.app.flags.DEFINE_float(
111 |     'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')
112 | 
113 | tf.app.flags.DEFINE_float(
114 |     'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')
115 | 
116 | tf.app.flags.DEFINE_float(
117 |     'momentum', 0.9,
118 |     'The momentum for the MomentumOptimizer and RMSPropOptimizer.')
119 | 
120 | tf.app.flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum.')
121 | 
122 | tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
123 | 
#######################
# Learning Rate Flags #
#######################

tf.app.flags.DEFINE_string(
    'learning_rate_decay_type',
    'exponential',
    'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
    ' or "polynomial"')

tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

tf.app.flags.DEFINE_float(
    'end_learning_rate', 0.0001,
    'The minimal end learning rate used by a polynomial decay learning rate.')

tf.app.flags.DEFINE_float(
    'label_smoothing', 0.0, 'The amount of label smoothing.')

tf.app.flags.DEFINE_float(
    'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

tf.app.flags.DEFINE_float(
    'num_epochs_per_decay', 2.0,
    'Number of epochs after which learning rate decays.')

tf.app.flags.DEFINE_bool(
    'sync_replicas', False,
    'Whether or not to synchronize the replicas during training.')

tf.app.flags.DEFINE_integer(
    'replicas_to_aggregate', 1,
    'The Number of gradients to collect before updating params.')

# The two help fragments are concatenated, so the first one needs a trailing
# space to keep the sentences apart.
tf.app.flags.DEFINE_float(
    'moving_average_decay', None,
    'The decay to use for the moving average. '
    'If left as None, then moving averages are not used.')
162 | 
#################
# Dataset Flags #
#################

tf.app.flags.DEFINE_string(
    'dataset_name', 'imagenet', 'The name of the dataset to load.')

tf.app.flags.DEFINE_string(
    'dataset_split_name', 'train', 'The name of the train/test split.')

# No usable default: main() raises ValueError when this flag is not supplied.
tf.app.flags.DEFINE_string(
    'dataset_dir', None, 'The directory where the dataset files are stored.')

# Subtracted from both the label tensor and the number of classes in main().
tf.app.flags.DEFINE_integer(
    'labels_offset', 0,
    'An offset for the labels in the dataset. This flag is primarily used to '
    'evaluate the VGG and ResNet architectures which do not use a background '
    'class for the ImageNet dataset.')

tf.app.flags.DEFINE_string(
    'model_name', 'inception_v3', 'The name of the architecture to train.')

tf.app.flags.DEFINE_string(
    'preprocessing_name', None, 'The name of the preprocessing to use. If left '
    'as `None`, then the model_name flag is used.')

tf.app.flags.DEFINE_integer(
    'batch_size', 32, 'The number of samples in each batch.')

# When unset, main() falls back to network_fn.default_image_size.
tf.app.flags.DEFINE_integer(
    'train_image_size', None, 'Train image size')

tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                            'The maximum number of training steps.')
197 | 
#####################
# Fine-Tuning Flags #
#####################

tf.app.flags.DEFINE_string(
    'checkpoint_path', None,
    'The path to a checkpoint from which to fine-tune.')

tf.app.flags.DEFINE_string(
    'checkpoint_exclude_scopes', None,
    'Comma-separated list of scopes of variables to exclude when restoring '
    'from a checkpoint.')

# The two help fragments are concatenated, so the first one needs a trailing
# space to keep the sentences apart.
tf.app.flags.DEFINE_string(
    'trainable_scopes', None,
    'Comma-separated list of scopes to filter the set of variables to train. '
    'By default, None would train all the variables.')

tf.app.flags.DEFINE_boolean(
    'ignore_missing_vars', False,
    'When restoring a checkpoint would ignore missing variables.')

# Shared accessor for every flag defined above.
FLAGS = tf.app.flags.FLAGS
221 | 
222 | 
def _configure_learning_rate(num_samples_per_epoch, global_step):
  """Configures the learning rate.

  The decay interval is `num_samples_per_epoch / batch_size *
  num_epochs_per_decay` steps; when --sync_replicas is set it is further
  divided by --replicas_to_aggregate.

  Args:
    num_samples_per_epoch: The number of samples in each epoch of training.
    global_step: The global_step tensor.
  Returns:
    A `Tensor` representing the learning rate.
  Raises:
    ValueError: if FLAGS.learning_rate_decay_type is not one of
      "exponential", "fixed" or "polynomial".
  """
  decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                    FLAGS.num_epochs_per_decay)
  if FLAGS.sync_replicas:
    # True division (this file imports `division` from __future__), so
    # decay_steps becomes a float here.
    decay_steps /= FLAGS.replicas_to_aggregate

  if FLAGS.learning_rate_decay_type == 'exponential':
    return tf.train.exponential_decay(FLAGS.learning_rate,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True,
                                      name='exponential_decay_learning_rate')
  elif FLAGS.learning_rate_decay_type == 'fixed':
    return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
  elif FLAGS.learning_rate_decay_type == 'polynomial':
    return tf.train.polynomial_decay(FLAGS.learning_rate,
                                     global_step,
                                     decay_steps,
                                     FLAGS.end_learning_rate,
                                     power=1.0,
                                     cycle=False,
                                     name='polynomial_decay_learning_rate')
  else:
    # Format the message explicitly: exceptions do not interpolate extra
    # positional arguments the way logging calls do.
    raise ValueError('learning_rate_decay_type [%s] was not recognized' %
                     FLAGS.learning_rate_decay_type)
258 | 
259 | 
def _configure_optimizer(learning_rate):
  """Configures the optimizer used for training.

  The optimizer class is selected by FLAGS.optimizer and its hyper-parameters
  are read from the matching optimizer-specific flags.

  Args:
    learning_rate: A scalar or `Tensor` learning rate.
  Returns:
    An instance of an optimizer.
  Raises:
    ValueError: if FLAGS.optimizer is not recognized.
  """
  if FLAGS.optimizer == 'adadelta':
    optimizer = tf.train.AdadeltaOptimizer(
        learning_rate,
        rho=FLAGS.adadelta_rho,
        epsilon=FLAGS.opt_epsilon)
  elif FLAGS.optimizer == 'adagrad':
    optimizer = tf.train.AdagradOptimizer(
        learning_rate,
        initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
  elif FLAGS.optimizer == 'adam':
    optimizer = tf.train.AdamOptimizer(
        learning_rate,
        beta1=FLAGS.adam_beta1,
        beta2=FLAGS.adam_beta2,
        epsilon=FLAGS.opt_epsilon)
  elif FLAGS.optimizer == 'ftrl':
    optimizer = tf.train.FtrlOptimizer(
        learning_rate,
        learning_rate_power=FLAGS.ftrl_learning_rate_power,
        initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
        l1_regularization_strength=FLAGS.ftrl_l1,
        l2_regularization_strength=FLAGS.ftrl_l2)
  elif FLAGS.optimizer == 'momentum':
    optimizer = tf.train.MomentumOptimizer(
        learning_rate,
        momentum=FLAGS.momentum,
        name='Momentum')
  elif FLAGS.optimizer == 'rmsprop':
    # RMSProp reads its own --rmsprop_momentum flag, not --momentum.
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate,
        decay=FLAGS.rmsprop_decay,
        momentum=FLAGS.rmsprop_momentum,
        epsilon=FLAGS.opt_epsilon)
  elif FLAGS.optimizer == 'sgd':
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  else:
    # Format the message explicitly: exceptions do not interpolate extra
    # positional arguments the way logging calls do.
    raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
  return optimizer
307 | 
308 | 
def _get_init_fn():
  """Builds the warm-start function handed to the training loop as `init_fn`.

  Returns:
    None when --checkpoint_path is unset or when --train_dir already holds a
    checkpoint; otherwise the function produced by
    `slim.assign_from_checkpoint_fn` for the model variables whose op name
    does not start with any of the --checkpoint_exclude_scopes entries.
  """
  if FLAGS.checkpoint_path is None:
    return None

  # A checkpoint in train_dir takes precedence, so --checkpoint_path is
  # ignored in that case; tell the user.
  if tf.train.latest_checkpoint(FLAGS.train_dir):
    tf.logging.info(
        'Ignoring --checkpoint_path because a checkpoint already exists in %s'
        % FLAGS.train_dir)
    return None

  excluded_prefixes = []
  if FLAGS.checkpoint_exclude_scopes:
    excluded_prefixes = [
        prefix.strip()
        for prefix in FLAGS.checkpoint_exclude_scopes.split(',')]

  # TODO(sguada) variables.filter_variables()
  restorable = [
      var for var in slim.get_model_variables()
      if not any(var.op.name.startswith(prefix)
                 for prefix in excluded_prefixes)]

  # A directory means "use the newest checkpoint inside it".
  source = (tf.train.latest_checkpoint(FLAGS.checkpoint_path)
            if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
            else FLAGS.checkpoint_path)

  tf.logging.info('Fine-tuning from %s' % source)

  return slim.assign_from_checkpoint_fn(
      source,
      restorable,
      ignore_missing_vars=FLAGS.ignore_missing_vars)
352 | 
353 | 
def _get_variables_to_train():
  """Selects the variables the optimizer is allowed to update.

  Returns:
    All trainable variables when --trainable_scopes is unset; otherwise the
    trainable variables collected for each comma-separated scope, in the
    order the scopes are listed.
  """
  if FLAGS.trainable_scopes is None:
    return tf.trainable_variables()

  selected = []
  for scope_name in FLAGS.trainable_scopes.split(','):
    selected.extend(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                          scope_name.strip()))
  return selected
369 | 
370 | 
def main(_):
  """Builds the training graph described by FLAGS and runs slim.learning.train.

  Raises:
    ValueError: if --dataset_dir is not supplied.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones, # number of Clone objects
        clone_on_cpu=FLAGS.clone_on_cpu, # boolean: whether to deploy the clones on CPU
        replica_id=FLAGS.task, # ID of the worker or PS process
        num_replicas=FLAGS.worker_replicas, # number of worker tasks (see Section 5.2)
        num_ps_tasks=FLAGS.num_ps_tasks) # number of PS tasks

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    # Get the dataset object from dataset_factory using the dataset name given
    # by FLAGS (dataset_name, e.g. imagenet), the name of the split
    # (dataset_split_name, e.g. train) and the absolute path of the dataset
    # directory (dataset_dir).
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    # Get the model function object network_fn from nets_factory using the
    # model name given by FLAGS (model_name, e.g. alexnet_v2), the number of
    # classes num_classes and the weight decay weight_decay (i.e. the
    # coefficient in front of the L2 regularization term).
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    #####################################
    # Select the preprocessing function #
    #####################################
    # Choose the name of the preprocessing function.
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    # Get the image preprocessing function object image_preprocessing_fn from
    # preprocessing_factory by that name.
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=True)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      # FLAGS.num_readers sets the number of threads that read the dataset in
      # parallel (4 by default); the data read by the different threads is
      # enqueued into common_queue. Here the maximum capacity of common_queue
      # is set to 20 times the training batch size (batch_size), and
      # common_queue_min, the minimum amount of data kept in common_queue, is
      # set to 10 times the training batch size.
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      # As described in Section 9.2.1, the training data and its label tensor
      # are fetched from the provider by the keys 'image' and 'label'.
      [image, label] = provider.get(['image', 'label'])
      # VGG and ResNet models do not treat the background as one of the
      # classes of the dataset, so labels_offset has to be set to 1 when
      # training these two kinds of models.
      label -= FLAGS.labels_offset
      # Resolution of the input images during training.
      train_image_size = FLAGS.train_image_size or network_fn.default_image_size
      # Run the training data through the image preprocessing function.
      image = image_preprocessing_fn(image, train_image_size, train_image_size)
      # Read in parallel with the number of threads given by
      # FLAGS.num_preprocessing_threads to obtain the images and labels
      # tensors used by the current iteration.
      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      # Calling prefetch_queue starts a QueueRunner that holds data prepared
      # ahead of time and about to be trained on. The prepared data is kept
      # in the buffer queue batch_queue.
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      # Dequeue the training data needed by this iteration (images and
      # labels) from batch_queue.
      images, labels = batch_queue.dequeue()
      # Call network_fn to get logits, the output tensor of the last layer of
      # the CNN model, and end_points, the collection of the output tensors of
      # every layer of the CNN model.
      logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      # In some CNN models (e.g. Inception V3), the output of one or more
      # intermediate layers is used for auxiliary classification in order to
      # reduce vanishing gradients. The output tensor of those layers is
      # AuxLogits.
      if 'AuxLogits' in end_points:
        # Also count the loss of the auxiliary classification layer in the
        # overall loss of the model. The weights argument is the discount
        # factor the auxiliary loss is multiplied by when it is added to the
        # total loss.
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weights=0.4,
            scope='aux_loss')
      # Compute the loss of the final classification layer.
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weights=1.0)
      # Return the collection of the output tensors of every layer.
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      # When the decay rate FLAGS.moving_average_decay is given,
      # moving_average_variables holds the model variables that are averaged
      # and variable_averages is the corresponding moving-average object.
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      # Three learning-rate schedules are currently supported: exponential,
      # fixed and polynomial.
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      # Create the optimizer of the type given by FLAGS. Seven optimizers are
      # currently supported: adadelta, adagrad, adam, ftrl, momentum, rmsprop
      # and sgd.
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      # As described in Section 5.2, distributed computation requires a
      # synchronous optimizer. The distributed support of the open-source
      # train_image_classifier.py is still incomplete: this code has to be
      # combined with interfaces such as tf.train.ClusterSpec and
      # tf.train.Server to actually train in a distributed setting.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)
    elif FLAGS.moving_average_decay:
      # When the decay rate FLAGS.moving_average_decay is given, apply the
      # moving average to the model parameter updates.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    # This code is similar to part of the deploy method described in
    # Section 9.2.3 (the case where optimizer is not None).
    variables_to_train = _get_variables_to_train()

    # Compute the total loss and the gradients of all clones.
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    # Evaluating train_tensor forces every update op to run first.
    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor, # training op for a single iteration
        logdir=FLAGS.train_dir, # directory for logs, model checkpoint files, etc. written during training
        master=FLAGS.master, # address of the master; unused for single-machine training
        is_chief=(FLAGS.task == 0), # whether this worker is the chief worker (used in distributed training)
        init_fn=_get_init_fn(), # model initialization function
        summary_op=summary_op, # summary op
        number_of_steps=FLAGS.max_number_of_steps, # maximum number of training steps
        log_every_n_steps=FLAGS.log_every_n_steps, # logging interval (in steps)
        save_summaries_secs=FLAGS.save_summaries_secs, # interval between summary writes (in seconds)
        save_interval_secs=FLAGS.save_interval_secs, # interval between model checkpoint saves (in seconds)
        sync_optimizer=optimizer if FLAGS.sync_replicas else None) # synchronous optimizer (None for single-machine training)
597 | 
598 | 
# Script entry point: hand control to tf.app.run(), which is expected to parse
# the flags defined above and then call main() -- confirm against the TF docs.
if __name__ == '__main__':
  tf.app.run()
601 | 


--------------------------------------------------------------------------------
/code/9_cnn_models/9.2_model_deploy.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """Deploy Slim models across multiple clones and replicas.
 16 | # TODO(sguada) docstring paragraph by (a) motivating the need for the file and
 17 | # (b) defining clones.
 18 | # TODO(sguada) describe the high-level components of model deployment.
 19 | # E.g. "each model deployment is composed of several parts: a DeploymentConfig,
 20 | # which captures A, B and C, an input_fn which loads data.. etc
 21 | To easily train a model on multiple GPUs or across multiple machines this
 22 | module provides a set of helper functions: `create_clones`,
 23 | `optimize_clones` and `deploy`.
 24 | Usage:
 25 |   g = tf.Graph()
 26 |   # Set up DeploymentConfig
 27 |   config = model_deploy.DeploymentConfig(num_clones=2, clone_on_cpu=True)
 28 |   # Create the global step on the device storing the variables.
 29 |   with tf.device(config.variables_device()):
 30 |     global_step = slim.create_global_step()
 31 |   # Define the inputs
 32 |   with tf.device(config.inputs_device()):
 33 |     images, labels = LoadData(...)
 34 |     inputs_queue = slim.data.prefetch_queue((images, labels))
 35 |   # Define the optimizer.
 36 |   with tf.device(config.optimizer_device()):
 37 |     optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum)
 38 |   # Define the model including the loss.
 39 |   def model_fn(inputs_queue):
 40 |     images, labels = inputs_queue.dequeue()
 41 |     predictions = CreateNetwork(images)
 42 |     slim.losses.log_loss(predictions, labels)
 43 |   model_dp = model_deploy.deploy(config, model_fn, [inputs_queue],
 44 |                                  optimizer=optimizer)
 45 |   # Run training.
 46 |   slim.learning.train(model_dp.train_op, my_log_dir,
 47 |                       summary_op=model_dp.summary_op)
 48 | The Clone namedtuple holds together the values associated with each call to
 49 | model_fn:
 50 |   * outputs: The return values of the calls to `model_fn()`.
 51 |   * scope: The scope used to create the clone.
 52 |   * device: The device used to create the clone.
 53 | DeployedModel namedtuple, holds together the values needed to train multiple
 54 | clones:
 55 |   * train_op: An operation that run the optimizer training op and include
 56 |     all the update ops created by `model_fn`. Present only if an optimizer
 57 |     was specified.
 58 |   * summary_op: An operation that run the summaries created by `model_fn`
 59 |     and process_gradients.
 60 |   * total_loss: A `Tensor` that contains the sum of all losses created by
 61 |     `model_fn` plus the regularization losses.
 62 |   * clones: List of `Clone` tuples returned by `create_clones()`.
 63 | DeploymentConfig parameters:
 64 |   * num_clones: Number of model clones to deploy in each replica.
 65 |   * clone_on_cpu: True if clones should be placed on CPU.
 66 |   * replica_id: Integer.  Index of the replica for which the model is
 67 |       deployed.  Usually 0 for the chief replica.
 68 |   * num_replicas: Number of replicas to use.
 69 |   * num_ps_tasks: Number of tasks for the `ps` job. 0 to not use replicas.
 70 |   * worker_job_name: A name for the worker job.
 71 |   * ps_job_name: A name for the parameter server job.
 72 | TODO(sguada):
 73 |   - describe side effect to the graph.
 74 |   - what happens to summaries and update_ops.
 75 |   - which graph collections are altered.
 76 |   - write a tutorial on how to use this.
 77 |   - analyze the possibility of calling deploy more than once.
 78 | """
 79 | 
 80 | from __future__ import absolute_import
 81 | from __future__ import division
 82 | from __future__ import print_function
 83 | 
 84 | import collections
 85 | 
 86 | import tensorflow as tf
 87 | 
 88 | slim = tf.contrib.slim
 89 | 
 90 | 
# Public names of this module: three helper functions, the two result
# namedtuples and the configuration class. Names starting with an underscore
# further down are deliberately left out.
__all__ = ['create_clones',
           'deploy',
           'optimize_clones',
           'DeployedModel',
           'DeploymentConfig',
           'Clone',
          ]
 98 | 
 99 | 
# Record describing one model clone produced during deployment.
#   outputs: whatever `model_fn` returned for this clone (the model's output
#            tensors).
#   scope:   the name scope the clone was built in.
#   device:  the device the clone was built on.
Clone = collections.namedtuple('Clone', 'outputs scope device')
106 | 
# Record returned by deploy(); bundles everything needed to train the clones.
#   train_op:   the op that performs a single training step.
#   summary_op: the op that records how the tracked values change.
#   total_loss: the sum of the model losses over all Clone objects.
#   clones:     the collection of Clone objects.
DeployedModel = collections.namedtuple(
    'DeployedModel', 'train_op summary_op total_loss clones')
114 | 
# Default parameters for DeploymentConfig.
# NOTE(review): nothing else in this module reads this dict, and
# DeploymentConfig.__init__ spells out the same defaults in its own
# signature. Keep the two in sync, or confirm this dict is still needed.
_deployment_params = {'num_clones': 1,
                      'clone_on_cpu': False,
                      'replica_id': 0,
                      'num_replicas': 1,
                      'num_ps_tasks': 0,
                      'worker_job_name': 'worker',
                      'ps_job_name': 'ps'}
123 | 
124 | 
def create_clones(config, model_fn, args=None, kwargs=None):
  """Builds `config.num_clones` copies of the model by calling `model_fn`.

  Each call `model_fn(*args, **kwargs)` is made inside the name scope and on
  the device that `config` assigns to that clone index. The return value of
  the call is stored, together with that scope and device, in a
  `Clone(outputs, scope, device)` namedtuple.

  While the clones are built, `slim.model_variable` and `slim.variable` are
  given `config.variables_device()` as their default `device` argument. Every
  clone after the first is built with variable reuse switched on, so all
  clones share the variables created by the first one.

  Note: any loss created by `model_fn` is assumed to be added to the
  `tf.GraphKeys.LOSSES` collection. The losses, summaries and update ops of a
  clone can be recovered afterwards with:

  ```python
    losses = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, clone.scope)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone.scope)
  ```

  Args:
    config: A DeploymentConfig object.
    model_fn: A callable. Called as `model_fn(*args, **kwargs)`.
    args: Optional list of positional arguments to pass to `model_fn`.
    kwargs: Optional dict of keyword arguments to pass to `model_fn`.

  Returns:
    A list of `Clone` namedtuples, one per clone, in clone-index order.
  """
  model_args = args or []
  model_kwargs = kwargs or {}
  built = []
  variable_creators = [slim.model_variable, slim.variable]
  with slim.arg_scope(variable_creators, device=config.variables_device()):
    for index in range(config.num_clones):
      with tf.name_scope(config.clone_scope(index)) as scope_name:
        device_name = config.clone_device(index)
        # Only the first clone creates variables; later clones reuse them.
        reuse = True if index > 0 else None
        with tf.device(device_name):
          with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            result = model_fn(*model_args, **model_kwargs)
          built.append(Clone(result, scope_name, device_name))
  return built
170 | 
171 | 
def _gather_clone_loss(clone, num_clones, regularization_losses):
  """Combines the losses belonging to one clone into a single tensor.

  The clone's entries in the `tf.GraphKeys.LOSSES` collection are summed and,
  when more than one clone is deployed, divided by `num_clones`. The given
  regularization losses (if any) are summed separately and added on top
  without scaling. Scalar summaries are emitted for both parts.

  Args:
    clone: A Clone namedtuple.
    num_clones: The number of clones being deployed.
    regularization_losses: Possibly empty (or None) list of regularization
      losses to add to the clone losses.

  Returns:
    A tensor holding the total loss for the clone, or None when the clone has
    neither collected losses nor regularization losses.
  """
  scaled_loss = None
  reg_loss = None
  total = None
  # All arithmetic on the losses is placed on the clone's own device.
  with tf.device(clone.device):
    parts = []
    collected = tf.get_collection(tf.GraphKeys.LOSSES, clone.scope)
    if collected:
      scaled_loss = tf.add_n(collected, name='clone_loss')
      if num_clones > 1:
        scaled_loss = tf.div(scaled_loss, 1.0 * num_clones,
                             name='scaled_clone_loss')
      parts.append(scaled_loss)
    if regularization_losses:
      reg_loss = tf.add_n(regularization_losses, name='regularization_loss')
      parts.append(reg_loss)
    if parts:
      total = tf.add_n(parts)
  # The summaries are created outside the clone device block.
  if scaled_loss is not None:
    # An empty clone scope is dropped so the tag has no empty path segment.
    tag_parts = [p for p in ('Losses', clone.scope, 'clone_loss') if p]
    tf.summary.scalar('/'.join(tag_parts), scaled_loss)
  if reg_loss is not None:
    tf.summary.scalar('Losses/regularization_loss', reg_loss)
  return total
211 | 
212 | 
def _optimize_clone(optimizer, clone, num_clones, regularization_losses,
                    **kwargs):
  """Computes the loss of one clone and the gradients of that loss.

  Args:
    optimizer: A tf.Optimizer object.
    clone: A Clone namedtuple.
    num_clones: The number of clones being deployed.
    regularization_losses: Possibly empty list of regularization_losses
      to add to the clone losses.
    **kwargs: Keyword arguments forwarded to `compute_gradients()`.

  Returns:
    A tuple (clone_loss, clone_grads_and_vars).
      - clone_loss: A tensor for the total loss for the clone. Can be None.
      - clone_grads_and_vars: The result of `optimizer.compute_gradients()`
        for the clone, or None when there is no loss to differentiate.
  """
  loss = _gather_clone_loss(clone, num_clones, regularization_losses)
  if loss is None:
    # Nothing to differentiate for this clone.
    return loss, None
  # Gradients are computed on the same device as the clone itself.
  with tf.device(clone.device):
    grads_and_vars = optimizer.compute_gradients(loss, **kwargs)
  return loss, grads_and_vars
235 | 
236 | 
def optimize_clones(clones, optimizer,
                    regularization_losses=None,
                    **kwargs):
  """Computes the combined loss and summed gradients for a list of clones.

  Note: the regularization losses are added to the loss of the first clone
  only; every later clone is processed without them.

  Args:
   clones: List of `Clones` created by `create_clones()`.
   optimizer: An `Optimizer` object.
   regularization_losses: Optional list of regularization losses. If None it
     will gather them from tf.GraphKeys.REGULARIZATION_LOSSES. Pass `[]` to
     exclude them.
   **kwargs: Keyword arguments forwarded to `compute_gradients`.

  Returns:
   A tuple (total_loss, grads_and_vars).
     - total_loss: A Tensor holding the sum of the per-clone losses (each
       already divided by the number of clones) plus the regularization loss.
     - grads_and_vars: A List of tuples (gradient, variable) containing the sum
       of the gradients for each variable.
  """
  clone_count = len(clones)
  if regularization_losses is None:
    regularization_losses = tf.get_collection(
        tf.GraphKeys.REGULARIZATION_LOSSES)

  per_clone_losses = []
  per_clone_grads = []
  # Handed to the first clone, then cleared so it is counted exactly once.
  pending_regularizers = regularization_losses
  for clone in clones:
    with tf.name_scope(clone.scope):
      loss, grads = _optimize_clone(
          optimizer, clone, clone_count, pending_regularizers, **kwargs)
      if loss is not None:
        per_clone_losses.append(loss)
        per_clone_grads.append(grads)
      pending_regularizers = None

  # Add up the per-clone losses, then the per-clone gradients.
  total_loss = tf.add_n(per_clone_losses, name='total_loss')
  summed_grads_and_vars = _sum_clones_gradients(per_clone_grads)
  return total_loss, summed_grads_and_vars
276 | 
277 | 
def deploy(config,
           model_fn,
           args=None,
           kwargs=None,
           optimizer=None,
           summarize_gradients=False):
  """Deploys a Slim-constructed model across multiple clones.

  The deployment options are specified by the config object and support
  deploying one or several clones on different GPUs and one or several replicas
  of such clones.

  The argument `model_fn` is called `config.num_clones` times to create the
  model clones as `model_fn(*args, **kwargs)`.

  The optional argument `optimizer` is an `Optimizer` object.  If not `None`,
  the deployed model is configured for training with that optimizer;
  otherwise only the losses are gathered and `train_op` stays None.

  If `config` specifies deployment on multiple replicas then the default
  tensorflow device is set appropriately for each call to `model_fn` and for
  the slim variable creation functions: model and global variables will be
  created on the `ps` device, the clone operations will be on the `worker`
  device.

  Only the update ops and the summaries found under the scope of the first
  clone are collected; those of the other clones are not.

  Args:
    config: A `DeploymentConfig` object.
    model_fn: A callable. Called as `model_fn(*args, **kwargs)`
    args: Optional list of arguments to pass to `model_fn`.
    kwargs: Optional dict of keyword arguments to pass to `model_fn`.
    optimizer: Optional `Optimizer` object.  If passed the model is deployed
      for training with that optimizer.
    summarize_gradients: Whether or not add summaries to the gradients.

  Returns:
    A `DeployedModel` namedtuple `(train_op, summary_op, total_loss, clones)`.
    `train_op` is None when no optimizer was given or no gradients were
    produced; `summary_op` is None when there are no summaries to merge.
  """
  # Gather initial summaries.
  summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

  # Create Clones.
  clones = create_clones(config, model_fn, args, kwargs)
  # NOTE(review): indexing assumes at least one clone was created
  # (config.num_clones >= 1); an empty list would raise IndexError here.
  first_clone = clones[0]

  # Gather update_ops from the first clone. These contain, for example,
  # the updates for the batch_norm variables created by model_fn.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone.scope)

  train_op = None
  total_loss = None
  with tf.device(config.optimizer_device()):
    if optimizer:
      # Place the global step on the device storing the variables.
      with tf.device(config.variables_device()):
        # Create global_step, the marker used to synchronize multiple workers
        # that compute in parallel.
        global_step = slim.get_or_create_global_step()
      # Compute the gradients for the clones.
      # optimize_clones returns total_loss, the combined loss of all clones,
      # and clones_gradients, which has the form
      #   [(grad_1, var_1), (grad_2, var_2), ..., (grad_N, var_N)]
      # where var_i is a model parameter (weights, biases, ...) and grad_i is
      # the gradient of var_i summed over all clones.
      total_loss, clones_gradients = optimize_clones(clones, optimizer)

      if clones_gradients:
        if summarize_gradients:
          # Add summaries to the gradients.
          summaries |= set(_add_gradients_summaries(clones_gradients))
        # Create gradient updates: the optimizer's apply_gradients returns the
        # grad_updates op, which uses the gradients in clones_gradients to
        # update the corresponding parameters.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        # The resulting train_op stands for all updates needed to complete one
        # iteration: evaluating it forces update_op to run first and then
        # yields total_loss.
        with tf.control_dependencies([update_op]):
          train_op = tf.identity(total_loss, name='train_op')
    else:
      clones_losses = []
      regularization_losses = tf.get_collection(
          tf.GraphKeys.REGULARIZATION_LOSSES)
      for clone in clones:
        with tf.name_scope(clone.scope):
          # _gather_clone_loss returns the full loss of the current clone: the
          # losses registered under tf.GraphKeys.LOSSES (divided by the number
          # of clones) plus the regularization_losses passed in.
          clone_loss = _gather_clone_loss(clone, len(clones),
                                          regularization_losses)
          if clone_loss is not None:
            clones_losses.append(clone_loss)
          # Only use regularization_losses for the first clone; for every
          # other clone they are ignored.
          regularization_losses = None
      if clones_losses:
        # Add up the losses of all clones to obtain total_loss.
        total_loss = tf.add_n(clones_losses, name='total_loss')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone.scope))

    if total_loss is not None:
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

    if summaries:
      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')
    else:
      summary_op = None

  return DeployedModel(train_op, summary_op, total_loss, clones)
385 | 
386 | 
387 | def _sum_clones_gradients(clone_grads):
388 |   """Calculate the sum gradient for each shared variable across all clones.
389 |   This function assumes that the clone_grads has been scaled appropriately by
390 |   1 / num_clones.
391 |   Args:
392 |     clone_grads: A List of List of tuples (gradient, variable), one list per
393 |     `Clone`.
394 |   Returns:
395 |      List of tuples of (gradient, variable) where the gradient has been summed
396 |      across all clones.
397 |   """
398 |   sum_grads = []
399 |   for grad_and_vars in zip(*clone_grads):
400 |     # Note that each grad_and_vars looks like the following:
401 |     #   ((grad_var0_clone0, var0), ... (grad_varN_cloneN, varN))
402 |     grads = []
403 |     var = grad_and_vars[0][1]
404 |     for g, v in grad_and_vars:
405 |       assert v == var
406 |       if g is not None:
407 |         grads.append(g)
408 |     if grads:
409 |       if len(grads) > 1:
410 |         sum_grad = tf.add_n(grads, name=var.op.name + '/sum_grads')
411 |       else:
412 |         sum_grad = grads[0]
413 |       sum_grads.append((sum_grad, var))
414 |   return sum_grads
415 | 
416 | 
417 | def _add_gradients_summaries(grads_and_vars):
418 |   """Add histogram summaries to gradients.
419 |   Note: The summaries are also added to the SUMMARIES collection.
420 |   Args:
421 |     grads_and_vars: A list of gradient to variable pairs (tuples).
422 |   Returns:
423 |     The _list_ of the added summaries for grads_and_vars.
424 |   """
425 |   summaries = []
426 |   for grad, var in grads_and_vars:
427 |     if grad is not None:
428 |       if isinstance(grad, tf.IndexedSlices):
429 |         grad_values = grad.values
430 |       else:
431 |         grad_values = grad
432 |       summaries.append(tf.summary.histogram(var.op.name + ':gradient',
433 |                                             grad_values))
434 |       summaries.append(tf.summary.histogram(var.op.name + ':gradient_norm',
435 |                                             tf.global_norm([grad_values])))
436 |     else:
437 |       tf.logging.info('Var %s has no gradient', var.op.name)
438 |   return summaries
439 | 
440 | 
class DeploymentConfig(object):
  """Describes how `deploy()` should lay a model out over devices.

  An instance records how many clones to build per replica, whether the
  clones go on CPU, and which jobs act as worker and parameter server. Its
  methods turn that information into the device strings / device functions
  and name scopes used while the graph is built.
  """

  def __init__(self,
               num_clones=1,
               clone_on_cpu=False,
               replica_id=0,
               num_replicas=1,
               num_ps_tasks=0,
               worker_job_name='worker',
               ps_job_name='ps'):
    """Create a DeploymentConfig.

    The config describes how to deploy a model across multiple clones and
    replicas.  The model will be replicated `num_clones` times in each replica.
    If `clone_on_cpu` is True, each clone will be placed on CPU.

    If `num_ps_tasks` is 0, `worker_job_name` and `ps_job_name` are not used
    and the worker / ps device prefixes are empty strings.

    If `num_replicas` is greater than 1, `num_ps_tasks` must be positive and
    both job names must be non-empty.

    Args:
      num_clones: Number of model clones to deploy in each replica.
      clone_on_cpu: If True clones would be placed on CPU.
      replica_id: Integer.  Index of the replica for which the model is
        deployed.  Usually 0 for the chief replica.
      num_replicas: Number of replicas to use.
      num_ps_tasks: Number of tasks for the `ps` job. 0 to not use replicas.
      worker_job_name: A name for the worker job.
      ps_job_name: A name for the parameter server job.

    Raises:
      ValueError: If the arguments are invalid.
    """
    multi_replica = num_replicas > 1
    if multi_replica and num_ps_tasks < 1:
      raise ValueError('When using replicas num_ps_tasks must be positive')
    needs_job_names = multi_replica or num_ps_tasks > 0
    if needs_job_names and not worker_job_name:
      raise ValueError('Must specify worker_job_name when using replicas')
    if needs_job_names and not ps_job_name:
      raise ValueError('Must specify ps_job_name when using parameter server')
    if replica_id >= num_replicas:
      raise ValueError('replica_id must be less than num_replicas')

    # Job prefixes are only meaningful when parameter servers are in use.
    if num_ps_tasks > 0:
      ps_prefix = '/job:' + ps_job_name
      worker_prefix = '/job:' + worker_job_name
    else:
      ps_prefix = ''
      worker_prefix = ''

    self._num_clones = num_clones
    self._clone_on_cpu = clone_on_cpu
    self._replica_id = replica_id
    self._num_replicas = num_replicas
    self._num_ps_tasks = num_ps_tasks
    self._ps_device = ps_prefix
    self._worker_device = worker_prefix

  @property
  def num_clones(self):
    """Number of model clones per replica."""
    return self._num_clones

  @property
  def clone_on_cpu(self):
    """Whether clones are placed on CPU."""
    return self._clone_on_cpu

  @property
  def replica_id(self):
    """Index of the replica this config was built for."""
    return self._replica_id

  @property
  def num_replicas(self):
    """Number of replicas."""
    return self._num_replicas

  @property
  def num_ps_tasks(self):
    """Number of tasks in the `ps` job."""
    return self._num_ps_tasks

  @property
  def ps_device(self):
    """Device prefix of the `ps` job ('' when no ps tasks are used)."""
    return self._ps_device

  @property
  def worker_device(self):
    """Device prefix of the worker job ('' when no ps tasks are used)."""
    return self._worker_device

  def caching_device(self):
    """Returns the device to use for caching variables.

    Returns:
      When ps tasks are in use, a callable that maps an op to that op's own
      `device`; otherwise None, meaning the variables are not cached.
    """
    return (lambda op: op.device) if self._num_ps_tasks > 0 else None

  def clone_device(self, clone_index):
    """Device used to create the clone and all the ops inside the clone.

    Args:
      clone_index: Int, representing the clone_index.

    Returns:
      A value suitable for `tf.device()`.

    Raises:
      ValueError: if `clone_index` is greater or equal to the number of clones.
    """
    if clone_index >= self._num_clones:
      raise ValueError('clone_index must be less than num_clones')
    job_prefix = self._worker_device if self._num_ps_tasks > 0 else ''
    if self._clone_on_cpu:
      return job_prefix + '/device:CPU:0'
    # One GPU per clone, numbered by clone index.
    return job_prefix + '/device:GPU:%d' % clone_index

  def clone_scope(self, clone_index):
    """Name scope to create the clone.

    Args:
      clone_index: Int, representing the clone_index.

    Returns:
      A name_scope suitable for `tf.name_scope()`: 'clone_<index>' when more
      than one clone is configured, otherwise the empty string.

    Raises:
      ValueError: if `clone_index` is greater or equal to the number of clones.
    """
    if clone_index >= self._num_clones:
      raise ValueError('clone_index must be less than num_clones')
    return 'clone_%d' % clone_index if self._num_clones > 1 else ''

  def optimizer_device(self):
    """Device to use with the optimizer.

    Returns:
      A value suitable for `tf.device()`.
    """
    if not (self._num_ps_tasks > 0 or self._num_clones > 0):
      return ''
    return self._worker_device + '/device:CPU:0'

  def inputs_device(self):
    """Device to use to build the inputs.

    Returns:
      A value suitable for `tf.device()`.
    """
    job_prefix = self._worker_device if self._num_ps_tasks > 0 else ''
    return job_prefix + '/device:CPU:0'

  def variables_device(self):
    """Returns the device to use for variables created inside the clone.

    Returns:
      A value suitable for `tf.device()`: a plain device string when no ps
      tasks are configured, otherwise a device function that hands out ps
      tasks to variable ops in round-robin order.
    """
    job_prefix = self._ps_device if self._num_ps_tasks > 0 else ''
    base_device = job_prefix + '/device:CPU:0'

    if not self._num_ps_tasks:
      return base_device

    class _PSDeviceChooser(object):
      """Slim device chooser for variables when using PS."""

      def __init__(self, device, tasks):
        self._device = device
        self._tasks = tasks
        self._task = 0

      def choose(self, op):
        # An op that already has a device keeps it.
        if op.device:
          return op.device
        if isinstance(op, tf.NodeDef):
          node_def = op
        else:
          node_def = op.node_def
        if not node_def.op.startswith('Variable'):
          return op.device
        # Variable ops take the current task; the counter then advances and
        # wraps around after the last task.
        assigned = self._task
        self._task = (assigned + 1) % self._tasks
        return '%s/task:%d' % (self._device, assigned)

    return _PSDeviceChooser(base_device, self._num_ps_tasks).choose
626 | 


--------------------------------------------------------------------------------