├── README.md ├── data ├── t10k-images-idx3-ubyte.gz ├── t10k-labels-idx1-ubyte.gz ├── train-images-idx3-ubyte.gz └── train-labels-idx1-ubyte.gz ├── gpu-mem-samples ├── centos │ ├── Dockerfile │ └── main.py └── ubuntu │ ├── Dockerfile │ └── main.py ├── mnist-tf ├── Dockerfile ├── Dockerfile.cpu ├── Makefile ├── main.py └── pip.conf ├── models └── tensorflow │ └── mnist.tar.gz ├── mpijob └── docker │ ├── Dockerfile │ └── Makefile ├── pipelines ├── iris_pipelines.py └── train │ ├── iris_training.csv │ └── train.py └── tfjob └── docker ├── distributed-mnist ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile └── main.py ├── estimator ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile └── mnist_estimator.py ├── export-model ├── Dockerfile ├── Makefile └── export_model.py ├── mnist-client ├── Dockerfile ├── Makefile ├── data │ ├── 0.png │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png └── mnist_client.py ├── mnist ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile ├── export_model.py └── main.py └── v1alpha2 └── distributed-mnist ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile └── main.py /README.md: -------------------------------------------------------------------------------- 1 | # tensorflow sample code 2 | Sample code for running TensorFlow -------------------------------------------------------------------------------- /data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /gpu-mem-samples/centos/Dockerfile: -------------------------------------------------------------------------------- 1 | # docker build -t registry.cn-shanghai.aliyuncs.com/tensorflow-samples/tensorflow-gpu-mem:10.0-runtime-centos7 .
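# To run the built image (assumption: the host has NVIDIA drivers and Docker >= 19.03 with the NVIDIA container runtime installed):
# docker run --rm --gpus all registry.cn-shanghai.aliyuncs.com/tensorflow-samples/tensorflow-gpu-mem:10.0-runtime-centos7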
2 | FROM registry.cn-huhehaote.aliyuncs.com/tensorflow-samples/tensorflow:centos7-cuda10.0-1.14-py36 3 | 4 | ADD main.py /app/main.py 5 | 6 | CMD ["python3","/app/main.py"] 7 | -------------------------------------------------------------------------------- /gpu-mem-samples/centos/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | FLAGS = None 10 | 11 | def train(): 12 | 13 | a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') 14 | b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') 15 | c = tf.matmul(a, b) 16 | 17 | sess = tf.Session() 18 | # Runs the op. 19 | while True: 20 | sess.run(c) 21 | 22 | 23 | if __name__ == '__main__': 24 | train() -------------------------------------------------------------------------------- /gpu-mem-samples/ubuntu/Dockerfile: -------------------------------------------------------------------------------- 1 | # docker build -t registry.cn-shanghai.aliyuncs.com/tensorflow-samples/tensorflow-gpu-mem:debian . 2 | FROM tensorflow/tensorflow:1.14.0-gpu-py3 3 | 4 | ADD main.py /app/main.py 5 | 6 | CMD ["python3","/app/main.py"] 7 | -------------------------------------------------------------------------------- /gpu-mem-samples/ubuntu/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | FLAGS = None 10 | 11 | def train(): 12 | 13 | a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') 14 | b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') 15 | c = tf.matmul(a, b) 16 | 17 | sess = tf.Session() 18 | # Runs the op. 19 | while True: 20 | sess.run(c) 21 | 22 | 23 | if __name__ == '__main__': 24 | train() -------------------------------------------------------------------------------- /mnist-tf/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0-devel-gpu 2 | RUN mkdir /app 3 | WORKDIR /app 4 | RUN mkdir ./logs 5 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/' /etc/apt/sources.list 6 | COPY pip.conf /root/.pip/pip.conf 7 | 8 | 9 | COPY ./* /app/ -------------------------------------------------------------------------------- /mnist-tf/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | RUN mkdir /app 3 | WORKDIR /app 4 | RUN mkdir ./logs 5 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/' /etc/apt/sources.list 6 | COPY pip.conf /root/.pip/pip.conf 7 | 8 | 9 | COPY ./* /app/ -------------------------------------------------------------------------------- /mnist-tf/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:cpu 11 | gpu: 12 | $(DOCKER) build -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:gpu . 
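# Note: images built from this Makefile expect two environment variables at run time (see main.py):
# POD_NAME, e.g. "worker-0" or "ps-0", and CLUSTER_CONFIG, a Python dict literal such as
# {'ps': ['ps-0:2222'], 'worker': ['worker-0:2222', 'worker-1:2222']} (hostnames illustrative).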
13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:gpu -------------------------------------------------------------------------------- /mnist-tf/main.py: -------------------------------------------------------------------------------- 1 | # Code modified from: https://github.com/ischlag/distributed-tensorflow-example 2 | from __future__ import print_function 3 | import tensorflow as tf 4 | import sys 5 | import time 6 | import os 7 | import ast 8 | 9 | # input flags 10 | tf.app.flags.DEFINE_string("logdir", "", "directory to save summaries") 11 | tf.app.flags.DEFINE_integer("epochs", 20, "number of epochs") 12 | tf.app.flags.DEFINE_integer("batch_size", 100, "batch size") 13 | tf.app.flags.DEFINE_float("lr", 0.0005, "learning rate") 14 | 15 | FLAGS = tf.app.flags.FLAGS 16 | 17 | POD_NAME = os.environ.get('POD_NAME') 18 | CLUSTER_CONFIG = os.environ.get('CLUSTER_CONFIG') 19 | 20 | job_name, task_id = POD_NAME.split('-', 2) 21 | task_id = int(task_id) 22 | cluster_def = ast.literal_eval(CLUSTER_CONFIG) 23 | cluster_spec = tf.train.ClusterSpec(cluster_def) 24 | 25 | is_chief = (job_name == 'worker') and (task_id == 0) 26 | 27 | server = tf.train.Server( 28 | cluster_spec, 29 | job_name=job_name, 30 | task_index=task_id 31 | ) 32 | 33 | if job_name == 'ps': 34 | server.join() 35 | 36 | # config 37 | batch_size = FLAGS.batch_size 38 | learning_rate = FLAGS.lr 39 | training_epochs = FLAGS.epochs 40 | 41 | # load mnist data set 42 | from tensorflow.examples.tutorials.mnist import input_data 43 | mnist = input_data.read_data_sets('MNIST_data', one_hot=True) 44 | 45 | # Between-graph replication 46 | with tf.device(tf.train.replica_device_setter( 47 | worker_device="/job:worker/task:%d" % task_id, 48 | cluster=cluster_spec)): 49 | 50 | # count the number of updates 51 | global_step = tf.get_variable( 52 | 'global_step', 53 | [], 54 | initializer = tf.constant_initializer(0), 55 | trainable = False) 56 | 57 | # input images 58 | with tf.name_scope('input'): 59 | # None -> batch size can be any size, 784 -> flattened mnist image 60 | x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input") 61 | # target 10 output classes 62 | y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input") 63 | 64 | # model parameters will change during training so we use tf.Variable 65 | tf.set_random_seed(1) 66 | with tf.name_scope("weights"): 67 | W1 = tf.Variable(tf.random_normal([784, 100])) 68 | W2 = tf.Variable(tf.random_normal([100, 10])) 69 | 70 | # bias 71 | with tf.name_scope("biases"): 72 | b1 = tf.Variable(tf.zeros([100])) 73 | b2 = tf.Variable(tf.zeros([10])) 74 | 75 | # implement model 76 | with tf.name_scope("softmax"): 77 | # y is our prediction 78 | z2 = tf.add(tf.matmul(x,W1),b1) 79 | a2 = tf.nn.sigmoid(z2) 80 | z3 = tf.add(tf.matmul(a2,W2),b2) 81 | y = tf.nn.softmax(z3) 82 | 83 | # specify cost function 84 | with tf.name_scope('cross_entropy'): 85 | # this is our cost 86 | cross_entropy = tf.reduce_mean( 87 | -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])) 88 | 89 | # specify optimizer 90 | with tf.name_scope('train'): 91 | # optimizer is an "operation" which we can execute in a session 92 | grad_op = tf.train.GradientDescentOptimizer(learning_rate) 93 | train_op = grad_op.minimize(cross_entropy, global_step=global_step) 94 | 95 | with tf.name_scope('Accuracy'): 96 | # accuracy 97 | correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) 98 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 99 | 100 | # create a summary 
for our cost and accuracy 101 | tf.summary.scalar("cost", cross_entropy) 102 | tf.summary.scalar("accuracy", accuracy) 103 | 104 | # merge all summaries into a single "operation" which we can execute in a session 105 | summary_op = tf.summary.merge_all() 106 | init_op = tf.global_variables_initializer() 107 | print("Variables initialized ...") 108 | 109 | sv = tf.train.Supervisor(is_chief=is_chief, 110 | global_step=global_step, 111 | init_op=init_op) 112 | 113 | begin_time = time.time() 114 | frequency = 100 115 | with sv.prepare_or_wait_for_session(server.target) as sess: 116 | # create log writer object (this will log on every machine) 117 | writer = tf.summary.FileWriter(FLAGS.logdir, graph=tf.get_default_graph()) 118 | 119 | # perform training cycles 120 | start_time = time.time() 121 | for epoch in range(training_epochs): 122 | 123 | # number of batches in one epoch 124 | batch_count = int(mnist.train.num_examples/batch_size) 125 | 126 | count = 0 127 | for i in range(batch_count): 128 | batch_x, batch_y = mnist.train.next_batch(batch_size) 129 | 130 | # perform the operations we defined earlier on batch 131 | _, cost, summary, step = sess.run([train_op, cross_entropy, summary_op, global_step], 132 | feed_dict={x: batch_x, y_: batch_y}) 133 | writer.add_summary(summary, step) 134 | 135 | count += 1 136 | if count % frequency == 0 or i+1 == batch_count: 137 | elapsed_time = time.time() - start_time 138 | start_time = time.time() 139 | print("Step: %d," % (step+1), 140 | " Epoch: %2d," % (epoch+1), 141 | " Batch: %3d of %3d," % (i+1, batch_count), 142 | " Cost: %.4f," % cost, 143 | " AvgTime: %3.2fms" % float(elapsed_time*1000/frequency)) 144 | count = 0 145 | 146 | 147 | print("Test-Accuracy: %2.2f" % sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels})) 148 | print("Total Time: %3.2fs" % float(time.time() - begin_time)) 149 | print("Final Cost: %.4f" % cost) 150 | 151 | sv.stop() 152 | print("done") -------------------------------------------------------------------------------- /mnist-tf/pip.conf: -------------------------------------------------------------------------------- 1 | [global] 2 | index-url = http://mirrors.aliyun.com/pypi/simple/ 3 | 4 | [install] 5 | trusted-host=mirrors.aliyun.com -------------------------------------------------------------------------------- /models/tensorflow/mnist.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/models/tensorflow/mnist.tar.gz -------------------------------------------------------------------------------- /mpijob/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM uber/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5 2 | 3 | RUN cd / && \ 4 | git clone -b cnn_tf_v1.9_compatible https://github.com/tensorflow/benchmarks.git 5 | 6 | CMD ["bash", "-c", "mpirun python /benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model resnet101 --batch_size 64 --variable_update horovod --train_dir=/training_logs --summary_verbosity=3 --save_summaries_steps=10"] -------------------------------------------------------------------------------- /mpijob/docker/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: gpu 7 | 8 | gpu: 9 | $(DOCKER) build --no-cache -f Dockerfile -t 
registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5 . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5 -------------------------------------------------------------------------------- /pipelines/iris_pipelines.py: -------------------------------------------------------------------------------- 1 | pipelines.py -------------------------------------------------------------------------------- /pipelines/train/iris_training.csv: -------------------------------------------------------------------------------- 1 | 120,4,setosa,versicolor,virginica 2 | 6.4,2.8,5.6,2.2,2 3 | 5.0,2.3,3.3,1.0,1 4 | 4.9,2.5,4.5,1.7,2 5 | 4.9,3.1,1.5,0.1,0 6 | 5.7,3.8,1.7,0.3,0 7 | 4.4,3.2,1.3,0.2,0 8 | 5.4,3.4,1.5,0.4,0 9 | 6.9,3.1,5.1,2.3,2 10 | 6.7,3.1,4.4,1.4,1 11 | 5.1,3.7,1.5,0.4,0 12 | 5.2,2.7,3.9,1.4,1 13 | 6.9,3.1,4.9,1.5,1 14 | 5.8,4.0,1.2,0.2,0 15 | 5.4,3.9,1.7,0.4,0 16 | 7.7,3.8,6.7,2.2,2 17 | 6.3,3.3,4.7,1.6,1 18 | 6.8,3.2,5.9,2.3,2 19 | 7.6,3.0,6.6,2.1,2 20 | 6.4,3.2,5.3,2.3,2 21 | 5.7,4.4,1.5,0.4,0 22 | 6.7,3.3,5.7,2.1,2 23 | 6.4,2.8,5.6,2.1,2 24 | 5.4,3.9,1.3,0.4,0 25 | 6.1,2.6,5.6,1.4,2 26 | 7.2,3.0,5.8,1.6,2 27 | 5.2,3.5,1.5,0.2,0 28 | 5.8,2.6,4.0,1.2,1 29 | 5.9,3.0,5.1,1.8,2 30 | 5.4,3.0,4.5,1.5,1 31 | 6.7,3.0,5.0,1.7,1 32 | 6.3,2.3,4.4,1.3,1 33 | 5.1,2.5,3.0,1.1,1 34 | 6.4,3.2,4.5,1.5,1 35 | 6.8,3.0,5.5,2.1,2 36 | 6.2,2.8,4.8,1.8,2 37 | 6.9,3.2,5.7,2.3,2 38 | 6.5,3.2,5.1,2.0,2 39 | 5.8,2.8,5.1,2.4,2 40 | 5.1,3.8,1.5,0.3,0 41 | 4.8,3.0,1.4,0.3,0 42 | 7.9,3.8,6.4,2.0,2 43 | 5.8,2.7,5.1,1.9,2 44 | 6.7,3.0,5.2,2.3,2 45 | 5.1,3.8,1.9,0.4,0 46 | 4.7,3.2,1.6,0.2,0 47 | 6.0,2.2,5.0,1.5,2 48 | 4.8,3.4,1.6,0.2,0 49 | 7.7,2.6,6.9,2.3,2 50 | 4.6,3.6,1.0,0.2,0 51 | 7.2,3.2,6.0,1.8,2 52 | 5.0,3.3,1.4,0.2,0 53 | 6.6,3.0,4.4,1.4,1 54 | 6.1,2.8,4.0,1.3,1 55 | 5.0,3.2,1.2,0.2,0 56 | 7.0,3.2,4.7,1.4,1 57 | 6.0,3.0,4.8,1.8,2 58 | 7.4,2.8,6.1,1.9,2 59 | 5.8,2.7,5.1,1.9,2 60 | 6.2,3.4,5.4,2.3,2 61 | 5.0,2.0,3.5,1.0,1 62 | 5.6,2.5,3.9,1.1,1 63 | 6.7,3.1,5.6,2.4,2 64 | 6.3,2.5,5.0,1.9,2 65 | 6.4,3.1,5.5,1.8,2 66 | 6.2,2.2,4.5,1.5,1 67 | 7.3,2.9,6.3,1.8,2 68 | 4.4,3.0,1.3,0.2,0 69 | 7.2,3.6,6.1,2.5,2 70 | 6.5,3.0,5.5,1.8,2 71 | 5.0,3.4,1.5,0.2,0 72 | 4.7,3.2,1.3,0.2,0 73 | 6.6,2.9,4.6,1.3,1 74 | 5.5,3.5,1.3,0.2,0 75 | 7.7,3.0,6.1,2.3,2 76 | 6.1,3.0,4.9,1.8,2 77 | 4.9,3.1,1.5,0.1,0 78 | 5.5,2.4,3.8,1.1,1 79 | 5.7,2.9,4.2,1.3,1 80 | 6.0,2.9,4.5,1.5,1 81 | 6.4,2.7,5.3,1.9,2 82 | 5.4,3.7,1.5,0.2,0 83 | 6.1,2.9,4.7,1.4,1 84 | 6.5,2.8,4.6,1.5,1 85 | 5.6,2.7,4.2,1.3,1 86 | 6.3,3.4,5.6,2.4,2 87 | 4.9,3.1,1.5,0.1,0 88 | 6.8,2.8,4.8,1.4,1 89 | 5.7,2.8,4.5,1.3,1 90 | 6.0,2.7,5.1,1.6,1 91 | 5.0,3.5,1.3,0.3,0 92 | 6.5,3.0,5.2,2.0,2 93 | 6.1,2.8,4.7,1.2,1 94 | 5.1,3.5,1.4,0.3,0 95 | 4.6,3.1,1.5,0.2,0 96 | 6.5,3.0,5.8,2.2,2 97 | 4.6,3.4,1.4,0.3,0 98 | 4.6,3.2,1.4,0.2,0 99 | 7.7,2.8,6.7,2.0,2 100 | 5.9,3.2,4.8,1.8,1 101 | 5.1,3.8,1.6,0.2,0 102 | 4.9,3.0,1.4,0.2,0 103 | 4.9,2.4,3.3,1.0,1 104 | 4.5,2.3,1.3,0.3,0 105 | 5.8,2.7,4.1,1.0,1 106 | 5.0,3.4,1.6,0.4,0 107 | 5.2,3.4,1.4,0.2,0 108 | 5.3,3.7,1.5,0.2,0 109 | 5.0,3.6,1.4,0.2,0 110 | 5.6,2.9,3.6,1.3,1 111 | 4.8,3.1,1.6,0.2,0 112 | 6.3,2.7,4.9,1.8,2 113 | 5.7,2.8,4.1,1.3,1 114 | 5.0,3.0,1.6,0.2,0 115 | 6.3,3.3,6.0,2.5,2 116 | 5.0,3.5,1.6,0.6,0 117 | 5.5,2.6,4.4,1.2,1 118 | 5.7,3.0,4.2,1.2,1 119 | 4.4,2.9,1.4,0.2,0 120 | 4.8,3.0,1.4,0.1,0 121 | 5.5,2.4,3.7,1.0,1 122 | -------------------------------------------------------------------------------- /pipelines/train/train.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | from tensorflow.contrib.learn.python.learn.datasets import base 5 | 6 | IRIS_TRAIN = 'iris_training.csv' 7 | IRIS_TEST = 'iris_test.csv' 8 | 9 | # load_csv_with_header expects the first CSV row to encode the number of samples, 10 | # the number of features and the class names (see iris_training.csv above). 11 | train_set = base.load_csv_with_header(filename=IRIS_TRAIN, 12 | features_dtype=np.float32, 13 | target_dtype=np.int) 14 | 15 | test_set = base.load_csv_with_header(filename=IRIS_TEST, 16 | features_dtype=np.float32, 17 | target_dtype=np.int) -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | COPY main.py /app/main.py 3 | 4 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0-gpu 2 | COPY main.py /app/main.py 3 | 4 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:gpu . 13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:gpu 14 | -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A simple MNIST classifier which displays summaries in TensorBoard. 16 | This is an unimpressive MNIST model, but it is a good example of using 17 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of 18 | naming summary tags so that they are grouped meaningfully in TensorBoard. 19 | It demonstrates the functionality of every TensorBoard dashboard.
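When launched as a TFJob, the TF_CONFIG environment variable supplies the cluster
layout that train() parses below; an illustrative value (hostnames are examples) is
{"cluster": {"ps": ["ps-0:2222"], "worker": ["worker-0:2222"], "master": ["master-0:2222"]}, "task": {"type": "master", "index": 0}}.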
20 | """ 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import argparse 26 | import os 27 | import sys 28 | import ast 29 | import json 30 | 31 | import tensorflow as tf 32 | 33 | from tensorflow.examples.tutorials.mnist import input_data 34 | 35 | FLAGS = None 36 | 37 | def train(): 38 | tf_config_json = os.environ.get("TF_CONFIG", "{}") 39 | tf_config = json.loads(tf_config_json) 40 | 41 | task = tf_config.get("task", {}) 42 | cluster_spec = tf_config.get("cluster", {}) 43 | cluster_spec_object = tf.train.ClusterSpec(cluster_spec) 44 | job_name = task["type"] 45 | task_id = task["index"] 46 | server_def = tf.train.ServerDef( 47 | cluster=cluster_spec_object.as_cluster_def(), 48 | protocol="grpc", 49 | job_name=job_name, 50 | task_index=task_id) 51 | server = tf.train.Server(server_def) 52 | 53 | is_chief = (job_name == 'master') 54 | if job_name == 'ps': 55 | server.join() 56 | 57 | if is_chief: 58 | print("Worker %d: Initializing session..." % task_id) 59 | tf.reset_default_graph() 60 | else: 61 | print("Worker %d: Waiting for session to be initialized..." % task_id) 62 | 63 | 64 | # Import data 65 | mnist = input_data.read_data_sets(FLAGS.data_dir, 66 | one_hot=True, 67 | fake_data=FLAGS.fake_data) 68 | 69 | 70 | # Create a multilayer model. 71 | 72 | 73 | # Between-graph replication 74 | with tf.device(tf.train.replica_device_setter( 75 | worker_device="/job:{0}/task:{1}".format(job_name,task_id), 76 | cluster=cluster_spec)): 77 | # with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)): 78 | # worker_device="/job:{0}/task:{1}".format(job_name,task_id), 79 | # cluster=cluster_spec)): 80 | 81 | # count the number of updates 82 | global_step = tf.get_variable( 83 | 'global_step', 84 | [], 85 | initializer = tf.constant_initializer(0), 86 | trainable = False) 87 | 88 | # Input placeholders 89 | with tf.name_scope('input'): 90 | x = tf.placeholder(tf.float32, [None, 784], name='x-input') 91 | y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') 92 | 93 | with tf.name_scope('input_reshape'): 94 | image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) 95 | tf.summary.image('input', image_shaped_input, 10) 96 | 97 | # We can't initialize these variables to 0 - the network will get stuck. 98 | def weight_variable(shape): 99 | """Create a weight variable with appropriate initialization.""" 100 | initial = tf.truncated_normal(shape, stddev=0.1) 101 | return tf.Variable(initial) 102 | 103 | def bias_variable(shape): 104 | """Create a bias variable with appropriate initialization.""" 105 | initial = tf.constant(0.1, shape=shape) 106 | return tf.Variable(initial) 107 | 108 | def variable_summaries(var): 109 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 110 | with tf.name_scope('summaries'): 111 | mean = tf.reduce_mean(var) 112 | tf.summary.scalar('mean', mean) 113 | with tf.name_scope('stddev'): 114 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 115 | tf.summary.scalar('stddev', stddev) 116 | tf.summary.scalar('max', tf.reduce_max(var)) 117 | tf.summary.scalar('min', tf.reduce_min(var)) 118 | tf.summary.histogram('histogram', var) 119 | 120 | def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): 121 | """Reusable code for making a simple neural net layer. 122 | It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. 
123 | It also sets up name scoping so that the resultant graph is easy to read, 124 | and adds a number of summary ops. 125 | """ 126 | # Adding a name scope ensures logical grouping of the layers in the graph. 127 | with tf.name_scope(layer_name): 128 | # This Variable will hold the state of the weights for the layer 129 | with tf.name_scope('weights'): 130 | weights = weight_variable([input_dim, output_dim]) 131 | variable_summaries(weights) 132 | with tf.name_scope('biases'): 133 | biases = bias_variable([output_dim]) 134 | variable_summaries(biases) 135 | with tf.name_scope('Wx_plus_b'): 136 | preactivate = tf.matmul(input_tensor, weights) + biases 137 | tf.summary.histogram('pre_activations', preactivate) 138 | activations = act(preactivate, name='activation') 139 | tf.summary.histogram('activations', activations) 140 | return activations 141 | 142 | hidden1 = nn_layer(x, 784, 500, 'layer1') 143 | 144 | with tf.name_scope('dropout'): 145 | keep_prob = tf.placeholder_with_default(1.0, shape=()) 146 | tf.summary.scalar('dropout_keep_probability', keep_prob) 147 | dropped = tf.nn.dropout(hidden1, keep_prob) 148 | 149 | # Do not apply softmax activation yet, see below. 150 | y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) 151 | 152 | with tf.name_scope('cross_entropy'): 153 | # The raw formulation of cross-entropy, 154 | # 155 | # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), 156 | # reduction_indices=[1])) 157 | # 158 | # can be numerically unstable. 159 | # 160 | # So here we use tf.nn.softmax_cross_entropy_with_logits on the 161 | # raw outputs of the nn_layer above, and then average across 162 | # the batch. 163 | #diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y) 164 | # with tf.name_scope('total'): 165 | #cross_entropy = tf.reduce_mean(diff) 166 | logits = tf.nn.softmax(y, name='logits') 167 | cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(logits,1e-10,1.0)), name='cross_entropy') 168 | tf.summary.scalar('cross_entropy', cross_entropy) 169 | 170 | with tf.name_scope('train'): 171 | train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize( 172 | cross_entropy) 173 | 174 | with tf.name_scope('accuracy'): 175 | with tf.name_scope('correct_prediction'): 176 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 177 | with tf.name_scope('accuracy'): 178 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 179 | tf.summary.scalar('accuracy', accuracy) 180 | 181 | # Merge all the summaries and write them out to 182 | # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default) 183 | merged = tf.summary.merge_all() 184 | 185 | init_op = tf.global_variables_initializer() 186 | 187 | def feed_dict(train): 188 | """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" 189 | if train or FLAGS.fake_data: 190 | xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) 191 | k = FLAGS.dropout 192 | else: 193 | xs, ys = mnist.test.images, mnist.test.labels 194 | k = 1.0 195 | return {x: xs, y_: ys, keep_prob: k} 196 | 197 | 198 | 199 | sv = tf.train.Supervisor(is_chief=is_chief, 200 | global_step=global_step, 201 | init_op=init_op, 202 | logdir=FLAGS.log_dir) 203 | # sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True, 204 | # device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.worker_index]) 205 | 206 | with sv.prepare_or_wait_for_session(server.target) as sess: 207 | train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph) 208 | 
test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test') 209 | # Train the model, and also write summaries. 210 | # Every 10th step, measure test-set accuracy, and write test summaries 211 | # All other steps, run train_step on training data, & add training summaries 212 | 213 | for i in range(FLAGS.max_steps): 214 | if i % 10 == 0: # Record summaries and test-set accuracy 215 | summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) 216 | test_writer.add_summary(summary, i) 217 | print('Accuracy at step %s: %s' % (i, acc)) 218 | else: # Record train set summaries, and train 219 | if i % 100 == 99: # Record execution stats 220 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 221 | run_metadata = tf.RunMetadata() 222 | summary, _ = sess.run([merged, train_step], 223 | feed_dict=feed_dict(True), 224 | options=run_options, 225 | run_metadata=run_metadata) 226 | train_writer.add_run_metadata(run_metadata, 'step%03d' % i) 227 | train_writer.add_summary(summary, i) 228 | print('Adding run metadata for', i) 229 | else: # Record a summary 230 | summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) 231 | train_writer.add_summary(summary, i) 232 | train_writer.close() 233 | test_writer.close() 234 | 235 | 236 | def main(_): 237 | train() 238 | 239 | 240 | if __name__ == '__main__': 241 | parser = argparse.ArgumentParser() 242 | parser.add_argument('--fake_data', nargs='?', const=True, type=bool, 243 | default=False, 244 | help='If true, uses fake data for unit testing.') 245 | parser.add_argument('--max_steps', type=int, default=1000, 246 | help='Number of steps to run trainer.') 247 | parser.add_argument('--learning_rate', type=float, default=0.001, 248 | help='Initial learning rate') 249 | parser.add_argument('--dropout', type=float, default=0.9, 250 | help='Keep probability for training dropout.') 251 | parser.add_argument( 252 | '--data_dir', 253 | type=str, 254 | default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), 255 | 'data'), 256 | help='Directory for storing input data') 257 | parser.add_argument( 258 | '--log_dir', 259 | type=str, 260 | default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), 261 | 'tensorflow/logs'), 262 | help='Summaries log directory') 263 | FLAGS, unparsed = parser.parse_known_args() 264 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 265 | -------------------------------------------------------------------------------- /tfjob/docker/estimator/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.10.1-py3 2 | 3 | RUN mkdir -p /app/MNIST/ && \ 4 | cd /app/MNIST/ && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | RUN sed -i 's/https:\/\/storage.googleapis.com\/cvdf-datasets\/mnist\//http:\/\/kubeflow-oss.oss-cn-hangzhou.aliyuncs.com\/tensorflow\/input_data\//g' /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py 11 | 12 | COPY mnist_estimator.py /app/mnist_estimator.py 13 | 14 | ENTRYPOINT ["python", "/app/mnist_estimator.py"] 
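# Example local run (flags as defined in mnist_estimator.py; the paths are illustrative):
# docker run --rm registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:cpu --steps=100 --model_dir=/tmp/models/ckpt/ --saved_dir=/tmp/models/pb/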
-------------------------------------------------------------------------------- /tfjob/docker/estimator/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.10.1-gpu-py3 2 | 3 | RUN mkdir -p /app/MNIST/ && \ 4 | cd /app/MNIST/ && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | RUN sed -i 's/https:\/\/storage.googleapis.com\/cvdf-datasets\/mnist\//http:\/\/kubeflow-oss.oss-cn-hangzhou.aliyuncs.com\/tensorflow\/input_data\//g' /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py 11 | 12 | COPY mnist_estimator.py /app/mnist_estimator.py 13 | 14 | ENTRYPOINT ["python", "/app/mnist_estimator.py"] -------------------------------------------------------------------------------- /tfjob/docker/estimator/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:gpu . 
13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:gpu 14 | -------------------------------------------------------------------------------- /tfjob/docker/estimator/mnist_estimator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | 10 | tf.app.flags.DEFINE_integer('steps', 100, 'The number of steps to train a model') 11 | tf.app.flags.DEFINE_string('model_dir', './models/ckpt/', 'Dir to save a model and checkpoints') 12 | tf.app.flags.DEFINE_string('saved_dir', './models/pb/', 'Dir to save a model for TF serving') 13 | FLAGS = tf.app.flags.FLAGS 14 | 15 | INPUT_FEATURE = 'image' 16 | NUM_CLASSES = 10 17 | 18 | 19 | def cnn_model_fn(features, labels, mode): 20 | """Model function for CNN.""" 21 | # Input Layer 22 | input_layer = features[INPUT_FEATURE] 23 | 24 | # First convolutional Layer and pooling layer 25 | conv1 = tf.layers.conv2d( 26 | inputs=input_layer, 27 | filters=32, 28 | kernel_size=[5, 5], 29 | padding="same", 30 | activation=None) 31 | batch_norm1 = tf.layers.batch_normalization(conv1) 32 | relu1 = tf.nn.relu(batch_norm1) 33 | pool1 = tf.layers.max_pooling2d(inputs=relu1, pool_size=[2, 2], strides=2) 34 | 35 | # Second convolutional Layer and pooling layer 36 | conv2 = tf.layers.conv2d( 37 | inputs=pool1, 38 | filters=64, 39 | kernel_size=[5, 5], 40 | padding="same", 41 | activation=None) 42 | batch_norm2 = tf.layers.batch_normalization(conv2) 43 | relu2 = tf.nn.relu(batch_norm2) 44 | pool2 = tf.layers.max_pooling2d(inputs=relu2, pool_size=[2, 2], strides=2) 45 | 46 | # Flatten tensor into a batch of vectors 47 | pool2_flat = tf.layers.flatten(pool2) 48 | 49 | # Dense Layer 50 | dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) 51 | 52 | # Add dropout operation 53 | dropout = tf.layers.dropout( 54 | inputs=dense, rate=0.4, training=(mode == tf.estimator.ModeKeys.TRAIN)) 55 | 56 | # Logits layer 57 | logits = tf.layers.dense(inputs=dropout, units=NUM_CLASSES) 58 | 59 | predictions = { 60 | # Generate predictions (for PREDICT and EVAL mode) 61 | "classes": tf.argmax(input=logits, axis=1), 62 | # Add `softmax_tensor` to the graph. It is used for PREDICT and by the 63 | # `logging_hook`. 
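# (tensors_to_log in main() below refers to this tensor by its graph name "softmax_tensor".)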
64 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 65 | } 66 | 67 | # PREDICT mode 68 | if mode == tf.estimator.ModeKeys.PREDICT: 69 | return tf.estimator.EstimatorSpec( 70 | mode=mode, 71 | predictions=predictions, 72 | export_outputs={ 73 | 'predict': tf.estimator.export.PredictOutput(predictions) 74 | }) 75 | 76 | # Calculate Loss (for both TRAIN and EVAL modes) 77 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 78 | 79 | # Configure the Training Op (for TRAIN mode) 80 | if mode == tf.estimator.ModeKeys.TRAIN: 81 | optimizer = tf.train.AdamOptimizer() 82 | train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) 83 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) 84 | 85 | # Add evaluation metrics (for EVAL mode) 86 | eval_metric_ops = { 87 | "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"]) 88 | } 89 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) 90 | 91 | 92 | def serving_input_receiver_fn(): 93 | """ 94 | This is used to define inputs to serve the model. 95 | :return: ServingInputReceiver 96 | """ 97 | receiver_tensors = { 98 | # The input image size is flexible. 99 | INPUT_FEATURE: tf.placeholder(tf.float32, [None, None, None, 1]), 100 | } 101 | 102 | # Convert the given inputs to match what the model expects. 103 | features = { 104 | # Resize given images. 105 | INPUT_FEATURE: tf.image.resize_images(receiver_tensors[INPUT_FEATURE], [28, 28]), 106 | } 107 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, 108 | features=features) 109 | 110 | 111 | def main(_): 112 | # Load training and eval data 113 | mnist = tf.contrib.learn.datasets.load_dataset("mnist") 114 | train_data = mnist.train.images # Returns np.array 115 | train_labels = np.asarray(mnist.train.labels, dtype=np.int32) 116 | eval_data = mnist.test.images # Returns np.array 117 | eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) 118 | 119 | # reshape images 120 | # To feed the inputs as images, we reshape them beforehand.
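# (MNIST images arrive flattened as (num_examples, 784); the conv layers expect NHWC, i.e. (num_examples, 28, 28, 1).)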
121 | train_data = train_data.reshape(train_data.shape[0], 28, 28, 1) 122 | eval_data = eval_data.reshape(eval_data.shape[0], 28, 28, 1) 123 | 124 | # Create the Estimator 125 | training_config = tf.estimator.RunConfig( 126 | model_dir=FLAGS.model_dir, 127 | save_summary_steps=20, 128 | save_checkpoints_steps=20) 129 | classifier = tf.estimator.Estimator( 130 | model_fn=cnn_model_fn, 131 | model_dir=FLAGS.model_dir, 132 | config=training_config) 133 | 134 | # Set up logging for predictions 135 | # Log the values in the "Softmax" tensor with label "probabilities" 136 | tensors_to_log = {"probabilities": "softmax_tensor"} 137 | logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50) 138 | 139 | # Train the model (--steps controls the number of training steps) 140 | train_input_fn = tf.estimator.inputs.numpy_input_fn( 141 | x={INPUT_FEATURE: train_data}, 142 | y=train_labels, 143 | batch_size=100, 144 | num_epochs=None, 145 | shuffle=True) 146 | train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, 147 | max_steps=FLAGS.steps, 148 | hooks=[logging_hook]) 149 | # classifier.train( 150 | # input_fn=train_input_fn, 151 | # steps=100, 152 | # hooks=[logging_hook]) 153 | 154 | # Evaluate the model and print results 155 | eval_input_fn = tf.estimator.inputs.numpy_input_fn( 156 | x={INPUT_FEATURE: eval_data}, 157 | y=eval_labels, 158 | num_epochs=1, 159 | shuffle=False) 160 | eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) 161 | # eval_results = classifier.evaluate(input_fn=eval_input_fn) 162 | # print(eval_results) 163 | tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec) 164 | 165 | # Save the model 166 | classifier.export_savedmodel(FLAGS.saved_dir, 167 | serving_input_receiver_fn=serving_input_receiver_fn) 168 | 169 | 170 | if __name__ == "__main__": 171 | tf.app.run() -------------------------------------------------------------------------------- /tfjob/docker/export-model/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | COPY export_model.py /app/export_model.py 3 | 4 | ENTRYPOINT ["python", "/app/export_model.py"] -------------------------------------------------------------------------------- /tfjob/docker/export-model/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/export-model . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/export-model -------------------------------------------------------------------------------- /tfjob/docker/export-model/export_model.py: -------------------------------------------------------------------------------- 1 | """Export a given TensorFlow model. 2 | The model is a pretrained "MNIST" model, which is saved as a TensorFlow checkpoint. This program 3 | simply uses TensorFlow SavedModel to 4 | export the trained model with proper signatures that can be loaded by standard 5 | tensorflow_model_server.
6 | Usage: export_model.py [--model_version=y] [--checkpoint_path=checkpoint_oss_path] [--checkpoint_step=checkpoint_step] export_dir 7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import tensorflow as tf 13 | from tensorflow.python.saved_model import builder as saved_model_builder 14 | from tensorflow.python.saved_model import signature_constants 15 | from tensorflow.python.saved_model import signature_def_utils 16 | from tensorflow.python.saved_model import tag_constants 17 | from tensorflow.python.saved_model import utils 18 | from tensorflow.python.util import compat 19 | from tensorflow.examples.tutorials.mnist import input_data as mnist_input_data 20 | 21 | tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the exported model.') 22 | tf.app.flags.DEFINE_integer('checkpoint_step', 0, 'Checkpoint steps that we export.') 23 | tf.app.flags.DEFINE_string('checkpoint_path', None, 'Checkpoints path.') 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | 27 | def main(_): 28 | if len(sys.argv) < 2 or sys.argv[-1].startswith('-'): 29 | print('Usage: export_model.py ' 30 | '[--model_version=y] [--checkpoint_path=checkpoint_store_path] [--checkpoint_step=checkpoint_step] export_dir') 31 | sys.exit(-1) 32 | if FLAGS.model_version <= 0: 33 | print('Please specify a positive value for exported servable version number.') 34 | sys.exit(-1) 35 | if not FLAGS.checkpoint_path: 36 | print('Please specify the correct path where checkpoints are stored locally or in OSS.') 37 | sys.exit(-1) 38 | 39 | checkpoint_basename="model.ckpt" 40 | default_meta_graph_suffix='.meta' 41 | ckpt_path=os.path.join(FLAGS.checkpoint_path, checkpoint_basename + '-' + str(FLAGS.checkpoint_step)) 42 | meta_graph_file=ckpt_path + default_meta_graph_suffix 43 | with tf.Session() as new_sess: 44 | # with new_sess.graph.as_default(): 45 | # tf.reset_default_graph() 46 | # new_sess.run(tf.initialize_all_variables()) 47 | new_saver = tf.train.import_meta_graph(meta_graph_file, clear_devices=True) #'/test/mnistoutput/ckpt.meta') 48 | new_saver.restore(new_sess, ckpt_path) #'/test/mnistoutput/ckpt') 49 | new_graph = tf.get_default_graph() 50 | new_x = new_graph.get_tensor_by_name('input/x-input:0') 51 | print(new_x) 52 | new_y = new_graph.get_tensor_by_name('cross_entropy/logits:0') 53 | print(new_y) 54 | 55 | # Export model 56 | # WARNING(break-tutorial-inline-code): The following code snippet is 57 | # in-lined in tutorials, please update tutorial documents accordingly 58 | # whenever code changes. 59 | export_path_base = sys.argv[-1] 60 | export_path = os.path.join( 61 | compat.as_bytes(export_path_base), 62 | compat.as_bytes(str(FLAGS.model_version))) 63 | print('Exporting trained model to', export_path) 64 | builder = saved_model_builder.SavedModelBuilder(export_path) 65 | 66 | # Build the signature_def_map.
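# (TensorFlow Serving will expose this signature under the key 'predict_images'; mnist_client.py requests it by that name.)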
67 | tensor_info_x = utils.build_tensor_info(new_x) 68 | tensor_info_y = utils.build_tensor_info(new_y) 69 | 70 | prediction_signature = signature_def_utils.build_signature_def( 71 | inputs={'images': tensor_info_x}, 72 | outputs={'scores': tensor_info_y}, 73 | method_name=signature_constants.PREDICT_METHOD_NAME) 74 | 75 | legacy_init_op = tf.group(tf.initialize_all_tables(), name='legacy_init_op') 76 | 77 | builder.add_meta_graph_and_variables( 78 | new_sess, [tag_constants.SERVING], 79 | signature_def_map={ 80 | 'predict_images': 81 | prediction_signature, 82 | }, 83 | legacy_init_op=legacy_init_op, 84 | clear_devices=True) 85 | builder.save() 86 | 87 | print('Done exporting!') 88 | 89 | if __name__ == '__main__': 90 | tf.app.run() -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | 3 | WORKDIR /app 4 | ADD mnist_client.py /app 5 | ADD data /app 6 | ADD requirements.txt /app 7 | 8 | RUN pip install -r /app/requirements.txt 9 | 10 | ENTRYPOINT ["tail", "-f", "/dev/null"] -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: 7 | $(DOCKER) build --no-cache -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-client-demo . -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/0.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/1.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/2.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/3.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/4.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/5.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/6.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/7.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/8.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/9.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/mnist_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import random 5 | import numpy 6 | 7 | from PIL import Image 8 | 9 | import tensorflow as tf 10 | from tensorflow.examples.tutorials.mnist import input_data 11 | from tensorflow_serving.apis import predict_pb2 12 | from tensorflow_serving.apis import prediction_service_pb2 13 | 14 | from grpc.beta import implementations 15 | 16 | from mnist import MNIST # pylint: disable=no-name-in-module 17 | 18 | TF_MODEL_SERVER_HOST = os.getenv("TF_MODEL_SERVER_HOST", "127.0.0.1") 19 | TF_MODEL_SERVER_PORT = int(os.getenv("TF_MODEL_SERVER_PORT", 9000)) 20 | TF_DATA_DIR = os.getenv("TF_DATA_DIR", "/tmp/data/") 21 | TF_MNIST_IMAGE_PATH = os.getenv("TF_MNIST_IMAGE_PATH", None) 22 | TF_MNIST_TEST_IMAGE_NUMBER = int(os.getenv("TF_MNIST_TEST_IMAGE_NUMBER", -1)) 23 | 24 | if TF_MNIST_IMAGE_PATH != None: 25 | raw_image = Image.open(TF_MNIST_IMAGE_PATH) 26 | int_image = numpy.array(raw_image) 27 | image = numpy.reshape(int_image, 784).astype(numpy.float32) 28 | elif TF_MNIST_TEST_IMAGE_NUMBER > -1: 29 | test_data_set = input_data.read_data_sets(TF_DATA_DIR, one_hot=True).test 30 | image = test_data_set.images[TF_MNIST_TEST_IMAGE_NUMBER] 31 | else: 32 | test_data_set = input_data.read_data_sets(TF_DATA_DIR, one_hot=True).test 33 | image = random.choice(test_data_set.images) 34 | 35 | channel = implementations.insecure_channel( 36 | TF_MODEL_SERVER_HOST, TF_MODEL_SERVER_PORT) 37 | stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) 38 | 39 | request = predict_pb2.PredictRequest() 40 | request.model_spec.name = "mnist" 41 | request.model_spec.signature_name = "predict_images" 42 | request.inputs['images'].CopyFrom( 43 | tf.contrib.util.make_tensor_proto(image, shape=[1, 784])) 44 | 45 | result = stub.Predict(request, 10.0) # 10 secs timeout 46 | 47 | # 
print(result) 48 | print(MNIST.display(image, threshold=0)) 49 | response = numpy.array( 50 | result.outputs['scores'].float_val) 51 | prediction = numpy.argmax(response) 52 | # print(prediction) 53 | print("Your model says the above number is... %d!" % 54 | prediction) -------------------------------------------------------------------------------- /tfjob/docker/mnist/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.4.0 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/mnist/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.4.0-gpu 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/mnist/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:gpu . 13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:gpu -------------------------------------------------------------------------------- /tfjob/docker/mnist/export_model.py: -------------------------------------------------------------------------------- 1 | """Export a given TensorFlow model. 2 | The model is a pretrained "MNIST" model, which is saved as a TensorFlow checkpoint. This program 3 | simply uses TensorFlow SavedModel to 4 | export the trained model with proper signatures that can be loaded by standard 5 | tensorflow_model_server.
6 | Usage: export_model.py [--model_version=y] [--checkpoint_path=checkpoint_oss_path] [--checkpoint_step=checkpoint_step] export_dir 7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import tensorflow as tf 13 | from tensorflow.python.saved_model import builder as saved_model_builder 14 | from tensorflow.python.saved_model import signature_constants 15 | from tensorflow.python.saved_model import signature_def_utils 16 | from tensorflow.python.saved_model import tag_constants 17 | from tensorflow.python.saved_model import utils 18 | from tensorflow.python.util import compat 19 | from tensorflow.examples.tutorials.mnist import input_data as mnist_input_data 20 | 21 | tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the exported model.') 22 | tf.app.flags.DEFINE_integer('checkpoint_step', 0, 'Checkpoint steps that we export.') 23 | tf.app.flags.DEFINE_string('checkpoint_path', None, 'Checkpoints path.') 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | 27 | def main(_): 28 | if len(sys.argv) < 2 or sys.argv[-1].startswith('-'): 29 | print('Usage: export_model.py ' 30 | '[--model_version=y] [--checkpoint_path=checkpoint_store_path] [--checkpoint_step=checkpoint_step] export_dir') 31 | sys.exit(-1) 32 | if FLAGS.model_version <= 0: 33 | print('Please specify a positive value for exported servable version number.') 34 | sys.exit(-1) 35 | if not FLAGS.checkpoint_path: 36 | print('Please specify the correct path where checkpoints are stored locally or in OSS.') 37 | sys.exit(-1) 38 | 39 | checkpoint_basename="model.ckpt" 40 | default_meta_graph_suffix='.meta' 41 | ckpt_path=os.path.join(FLAGS.checkpoint_path, checkpoint_basename + '-' + str(FLAGS.checkpoint_step)) 42 | meta_graph_file=ckpt_path + default_meta_graph_suffix 43 | with tf.Session() as new_sess: 44 | # with new_sess.graph.as_default(): 45 | # tf.reset_default_graph() 46 | # new_sess.run(tf.initialize_all_variables()) 47 | new_saver = tf.train.import_meta_graph(meta_graph_file, clear_devices=True) #'/test/mnistoutput/ckpt.meta') 48 | new_saver.restore(new_sess, ckpt_path) #'/test/mnistoutput/ckpt') 49 | new_graph = tf.get_default_graph() 50 | new_x = new_graph.get_tensor_by_name('input/x-input:0') 51 | print(new_x) 52 | new_y = new_graph.get_tensor_by_name('layer2/activation:0') 53 | print(new_y) 54 | 55 | # Export model 56 | # WARNING(break-tutorial-inline-code): The following code snippet is 57 | # in-lined in tutorials, please update tutorial documents accordingly 58 | # whenever code changes. 59 | export_path_base = sys.argv[-1] 60 | export_path = os.path.join( 61 | compat.as_bytes(export_path_base), 62 | compat.as_bytes(str(FLAGS.model_version))) 63 | print('Exporting trained model to', export_path) 64 | builder = saved_model_builder.SavedModelBuilder(export_path) 65 | 66 | # Build the signature_def_map.
67 | tensor_info_x = utils.build_tensor_info(new_x) 68 | tensor_info_y = utils.build_tensor_info(new_y) 69 | 70 | prediction_signature = signature_def_utils.build_signature_def( 71 | inputs={'images': tensor_info_x}, 72 | outputs={'scores': tensor_info_y}, 73 | method_name=signature_constants.PREDICT_METHOD_NAME) 74 | 75 | legacy_init_op = tf.group(tf.initialize_all_tables(), name='legacy_init_op') 76 | 77 | builder.add_meta_graph_and_variables( 78 | new_sess, [tag_constants.SERVING], 79 | signature_def_map={ 80 | 'predict_images': 81 | prediction_signature, 82 | }, 83 | legacy_init_op=legacy_init_op, 84 | clear_devices=True) 85 | builder.save() 86 | 87 | print('Done exporting!') 88 | 89 | if __name__ == '__main__': 90 | tf.app.run() -------------------------------------------------------------------------------- /tfjob/docker/mnist/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A simple MNIST classifier which displays summaries in TensorBoard. 16 | This is an unimpressive MNIST model, but it is a good example of using 17 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of 18 | naming summary tags so that they are grouped meaningfully in TensorBoard. 19 | It demonstrates the functionality of every TensorBoard dashboard. 20 | """ 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import argparse 26 | import os 27 | import sys 28 | 29 | import tensorflow as tf 30 | 31 | from tensorflow.examples.tutorials.mnist import input_data 32 | 33 | FLAGS = None 34 | 35 | 36 | def train(): 37 | print("data dir: {0}".format(FLAGS.data_dir)) 38 | # Import data 39 | mnist = input_data.read_data_sets(FLAGS.data_dir, 40 | one_hot=True, 41 | fake_data=FLAGS.fake_data) 42 | 43 | # Create a multilayer model. 44 | 45 | # Input placeholders 46 | with tf.name_scope('input'): 47 | x = tf.placeholder(tf.float32, [None, 784], name='x-input') 48 | y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') 49 | 50 | with tf.name_scope('input_reshape'): 51 | image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) 52 | tf.summary.image('input', image_shaped_input, 10) 53 | 54 | # We can't initialize these variables to 0 - the network will get stuck. 
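# (With all-zero weights every unit in a layer would receive identical gradients and never differentiate; the truncated-normal initialization below breaks that symmetry.)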
55 | def weight_variable(shape): 56 | """Create a weight variable with appropriate initialization.""" 57 | initial = tf.truncated_normal(shape, stddev=0.1) 58 | return tf.Variable(initial) 59 | 60 | def bias_variable(shape): 61 | """Create a bias variable with appropriate initialization.""" 62 | initial = tf.constant(0.1, shape=shape) 63 | return tf.Variable(initial) 64 | 65 | def variable_summaries(var): 66 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 67 | with tf.name_scope('summaries'): 68 | mean = tf.reduce_mean(var) 69 | tf.summary.scalar('mean', mean) 70 | with tf.name_scope('stddev'): 71 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 72 | tf.summary.scalar('stddev', stddev) 73 | tf.summary.scalar('max', tf.reduce_max(var)) 74 | tf.summary.scalar('min', tf.reduce_min(var)) 75 | tf.summary.histogram('histogram', var) 76 | 77 | def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): 78 | """Reusable code for making a simple neural net layer. 79 | It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. 80 | It also sets up name scoping so that the resultant graph is easy to read, 81 | and adds a number of summary ops. 82 | """ 83 | # Adding a name scope ensures logical grouping of the layers in the graph. 84 | with tf.name_scope(layer_name): 85 | # This Variable will hold the state of the weights for the layer 86 | with tf.name_scope('weights'): 87 | weights = weight_variable([input_dim, output_dim]) 88 | variable_summaries(weights) 89 | with tf.name_scope('biases'): 90 | biases = bias_variable([output_dim]) 91 | variable_summaries(biases) 92 | with tf.name_scope('Wx_plus_b'): 93 | preactivate = tf.matmul(input_tensor, weights) + biases 94 | tf.summary.histogram('pre_activations', preactivate) 95 | activations = act(preactivate, name='activation') 96 | tf.summary.histogram('activations', activations) 97 | return activations 98 | 99 | hidden1 = nn_layer(x, 784, 500, 'layer1') 100 | 101 | with tf.name_scope('dropout'): 102 | keep_prob = tf.placeholder_with_default(1.0, shape=()) 103 | tf.summary.scalar('dropout_keep_probability', keep_prob) 104 | dropped = tf.nn.dropout(hidden1, keep_prob) 105 | 106 | # Do not apply softmax activation yet, see below. 107 | y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) 108 | 109 | with tf.name_scope('cross_entropy'): 110 | # The raw formulation of cross-entropy, 111 | # 112 | # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), 113 | # reduction_indices=[1])) 114 | # 115 | # can be numerically unstable. 116 | # 117 | # So here we use tf.nn.softmax_cross_entropy_with_logits on the 118 | # raw outputs of the nn_layer above, and then average across 119 | # the batch. 
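    # The fused op is stable because it applies the log-sum-exp trick
    # internally:
    #     log(sum_j exp(y_j)) = m + log(sum_j exp(y_j - m)),  m = max_j y_j
    # so exp() cannot overflow and log() never receives zero.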
120 |     diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
121 |     with tf.name_scope('total'):
122 |       cross_entropy = tf.reduce_mean(diff)
123 |   tf.summary.scalar('cross_entropy', cross_entropy)
124 | 
125 |   with tf.name_scope('train'):
126 |     train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
127 |         cross_entropy)
128 | 
129 |   with tf.name_scope('accuracy'):
130 |     with tf.name_scope('correct_prediction'):
131 |       correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
132 |     with tf.name_scope('accuracy'):
133 |       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
134 |   tf.summary.scalar('accuracy', accuracy)
135 | 
136 |   # Merge all the summaries and write them out to
137 |   # /training_logs (the --log_dir default)
138 |   merged = tf.summary.merge_all()
139 | 
140 |   def feed_dict(train):
141 |     """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
142 |     if train or FLAGS.fake_data:
143 |       xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
144 |       k = FLAGS.dropout
145 |     else:
146 |       xs, ys = mnist.test.images, mnist.test.labels
147 |       k = 1.0
148 |     return {x: xs, y_: ys, keep_prob: k}
149 | 
150 |   sess = tf.InteractiveSession()
151 |   train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
152 |   test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
153 |   tf.global_variables_initializer().run()
154 |   # Train the model, and also write summaries.
155 |   # Every 10th step, measure test-set accuracy, and write test summaries
156 |   # All other steps, run train_step on training data, & add training summaries
157 |   saver = tf.train.Saver()
158 | 
159 |   for i in range(FLAGS.max_steps):
160 |     if i % 10 == 0:  # Record summaries and test-set accuracy
161 |       summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
162 |       test_writer.add_summary(summary, i)
163 |       print('Accuracy at step %s: %s' % (i, acc))
164 |       if i % 100 == 0:
165 |         print('Saving checkpoint at step %s (test accuracy %s)' % (i, acc))
166 |         saver.save(sess, FLAGS.log_dir + '/model.ckpt', global_step=i)
167 |     else:  # Record train set summaries, and train
168 |       if i % 100 == 99:  # Record execution stats
169 |         run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
170 |         run_metadata = tf.RunMetadata()
171 |         summary, _ = sess.run([merged, train_step],
172 |                               feed_dict=feed_dict(True),
173 |                               options=run_options,
174 |                               run_metadata=run_metadata)
175 |         train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
176 |         train_writer.add_summary(summary, i)
177 |         print('Adding run metadata for', i)
178 |       else:  # Record a summary
179 |         summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
180 |         train_writer.add_summary(summary, i)
181 | 
182 |   # Note: 'acc' holds the most recent test-set accuracy measured in the loop above.
183 |   print('Total Train-accuracy=%s' % (acc))
184 |   train_writer.close()
185 |   test_writer.close()
186 | 
187 | 
188 | def main(_):
189 |   train()
190 | 
191 | 
192 | if __name__ == '__main__':
193 |   parser = argparse.ArgumentParser()
194 |   parser.add_argument('--fake_data', nargs='?', const=True, type=bool,
195 |                       default=False,
196 |                       help='If true, uses fake data for unit testing.')
197 |   parser.add_argument('--max_steps', type=int, default=1000,
198 |                       help='Number of steps to run trainer.')
199 |   parser.add_argument('--learning_rate', type=float, default=0.001,
200 |                       help='Initial learning rate')
201 |   parser.add_argument('--dropout', type=float, default=0.9,
202 |                       help='Keep probability for training dropout.')
203 |   parser.add_argument(
204 |       '--data_dir',
205 | 
type=str, 206 | default=os.path.join(os.getenv('TEST_TMPDIR', '/train'), 207 | 'data'), 208 | help='Directory for storing input data') 209 | parser.add_argument( 210 | '--log_dir', 211 | type=str, 212 | default='/training_logs', 213 | help='Summaries log directory') 214 | FLAGS, unparsed = parser.parse_known_args() 215 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0-gpu 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:gpu . 13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:gpu 14 | -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """A simple MNIST classifier which displays summaries in TensorBoard.
16 | This is an unimpressive MNIST model, but it is a good example of using
17 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
18 | naming summary tags so that they are grouped meaningfully in TensorBoard.
19 | It demonstrates the functionality of every TensorBoard dashboard.
20 | """
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 | 
25 | import argparse
26 | import os
27 | import sys
28 | import json
29 | 
30 | 
31 | import tensorflow as tf
32 | 
33 | from tensorflow.examples.tutorials.mnist import input_data
34 | 
35 | FLAGS = None
36 | 
37 | def train():
38 |   tf_config_json = os.environ.get("TF_CONFIG", "{}")
39 |   tf_config = json.loads(tf_config_json)
40 | 
41 |   task = tf_config.get("task", {})
42 |   cluster_spec = tf_config.get("cluster", {})
43 |   cluster_spec_object = tf.train.ClusterSpec(cluster_spec)
44 |   job_name = task["type"]
45 |   task_id = task["index"]
46 |   server_def = tf.train.ServerDef(
47 |       cluster=cluster_spec_object.as_cluster_def(),
48 |       protocol="grpc",
49 |       job_name=job_name,
50 |       task_index=task_id)
51 |   server = tf.train.Server(server_def)
52 | 
53 |   is_chief = (job_name == 'worker') and (task_id == 0)
54 |   if job_name == 'ps':
55 |     server.join()
56 | 
57 |   if is_chief:
58 |     print("Worker %d: Initializing session..." % task_id)
59 |     tf.reset_default_graph()
60 |   else:
61 |     print("Worker %d: Waiting for session to be initialized..." % task_id)
62 | 
63 | 
64 |   # Import data
65 |   mnist = input_data.read_data_sets(FLAGS.data_dir,
66 |                                     one_hot=True,
67 |                                     fake_data=FLAGS.fake_data)
68 | 
69 | 
70 |   # Create a multilayer model.
71 | 
72 | 
73 |   # Between-graph replication
74 |   with tf.device(tf.train.replica_device_setter(
75 |       worker_device="/job:{0}/task:{1}".format(job_name, task_id),
76 |       cluster=cluster_spec)):
77 |     # replica_device_setter pins each tf.Variable created in this scope to a
78 |     # parameter-server ('ps') task and leaves the compute ops on this worker;
79 |     # every worker builds its own copy of the graph (between-graph replication).
80 | 
81 |     # count the number of updates
82 |     global_step = tf.get_variable(
83 |         'global_step',
84 |         [],
85 |         initializer=tf.constant_initializer(0),
86 |         trainable=False)
87 | 
88 |     # Input placeholders
89 |     with tf.name_scope('input'):
90 |       x = tf.placeholder(tf.float32, [None, 784], name='x-input')
91 |       y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
92 | 
93 |     with tf.name_scope('input_reshape'):
94 |       image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
95 |       tf.summary.image('input', image_shaped_input, 10)
96 | 
97 |     # We can't initialize these variables to 0 - the network will get stuck.
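    # The helper functions below build the same two-layer network as the
    # single-node sample (784 inputs -> 500 ReLU units -> dropout -> 10
    # linear outputs); only the variable placement differs in this
    # distributed version.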
98 |     def weight_variable(shape):
99 |       """Create a weight variable with appropriate initialization."""
100 |       initial = tf.truncated_normal(shape, stddev=0.1)
101 |       return tf.Variable(initial)
102 | 
103 |     def bias_variable(shape):
104 |       """Create a bias variable with appropriate initialization."""
105 |       initial = tf.constant(0.1, shape=shape)
106 |       return tf.Variable(initial)
107 | 
108 |     def variable_summaries(var):
109 |       """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
110 |       with tf.name_scope('summaries'):
111 |         mean = tf.reduce_mean(var)
112 |         tf.summary.scalar('mean', mean)
113 |         with tf.name_scope('stddev'):
114 |           stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
115 |         tf.summary.scalar('stddev', stddev)
116 |         tf.summary.scalar('max', tf.reduce_max(var))
117 |         tf.summary.scalar('min', tf.reduce_min(var))
118 |         tf.summary.histogram('histogram', var)
119 | 
120 |     def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
121 |       """Reusable code for making a simple neural net layer.
122 |       It does a matrix multiply, bias add, and then uses ReLU to nonlinearize.
123 |       It also sets up name scoping so that the resultant graph is easy to read,
124 |       and adds a number of summary ops.
125 |       """
126 |       # Adding a name scope ensures logical grouping of the layers in the graph.
127 |       with tf.name_scope(layer_name):
128 |         # This Variable will hold the state of the weights for the layer
129 |         with tf.name_scope('weights'):
130 |           weights = weight_variable([input_dim, output_dim])
131 |           variable_summaries(weights)
132 |         with tf.name_scope('biases'):
133 |           biases = bias_variable([output_dim])
134 |           variable_summaries(biases)
135 |         with tf.name_scope('Wx_plus_b'):
136 |           preactivate = tf.matmul(input_tensor, weights) + biases
137 |           tf.summary.histogram('pre_activations', preactivate)
138 |         activations = act(preactivate, name='activation')
139 |         tf.summary.histogram('activations', activations)
140 |         return activations
141 | 
142 |     hidden1 = nn_layer(x, 784, 500, 'layer1')
143 | 
144 |     with tf.name_scope('dropout'):
145 |       keep_prob = tf.placeholder_with_default(1.0, shape=())
146 |       tf.summary.scalar('dropout_keep_probability', keep_prob)
147 |       dropped = tf.nn.dropout(hidden1, keep_prob)
148 | 
149 |     # Do not apply softmax activation yet, see below.
150 |     y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)
151 | 
152 |     with tf.name_scope('cross_entropy'):
153 |       # The raw formulation of cross-entropy,
154 |       #
155 |       # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
156 |       #                               reduction_indices=[1]))
157 |       #
158 |       # can be numerically unstable, because tf.log can receive zero.
159 |       #
160 |       # Unlike the single-node sample, which uses the fused
161 |       # tf.nn.softmax_cross_entropy_with_logits op, this version computes
162 |       # the softmax explicitly and clips it away from zero before the log.
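      # Without the clip, a confident wrong prediction can drive a softmax
      # output to exactly 0.0 in float32, and y_ * tf.log(0.0) yields NaNs;
      # clipping to [1e-10, 1.0] bounds tf.log at roughly -23. An equivalent
      # formulation with the fused op, which needs no clipping, would be:
      #
      #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
      #   cross_entropy = tf.reduce_sum(diff)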
163 |       # Note that, unlike the single-node sample, the loss below is summed
164 |       # over the batch rather than averaged, so its magnitude scales with
165 |       # the batch size (100 in feed_dict below).
166 |       probabilities = tf.nn.softmax(y, name='probabilities')
167 |       cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(probabilities, 1e-10, 1.0)), name='cross_entropy')
168 |     tf.summary.scalar('cross_entropy', cross_entropy)
169 | 
170 |     with tf.name_scope('train'):
171 |       train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
172 |           cross_entropy, global_step=global_step)
173 | 
174 |     with tf.name_scope('accuracy'):
175 |       with tf.name_scope('correct_prediction'):
176 |         correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
177 |       with tf.name_scope('accuracy'):
178 |         accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
179 |     tf.summary.scalar('accuracy', accuracy)
180 | 
181 |     # Merge all the summaries and write them out to
182 |     # /training_logs (the --log_dir default)
183 |     merged = tf.summary.merge_all()
184 | 
185 |     init_op = tf.global_variables_initializer()
186 | 
187 |   def feed_dict(train):
188 |     """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
189 |     if train or FLAGS.fake_data:
190 |       xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
191 |       k = FLAGS.dropout
192 |     else:
193 |       xs, ys = mnist.test.images, mnist.test.labels
194 |       k = 1.0
195 |     return {x: xs, y_: ys, keep_prob: k}
196 | 
197 | 
198 | 
199 |   sv = tf.train.Supervisor(is_chief=is_chief,
200 |                            global_step=global_step,
201 |                            init_op=init_op,
202 |                            logdir=FLAGS.log_dir)
203 |   # A tf.ConfigProto with device_filters restricted to this worker and the
204 |   # 'ps' tasks could additionally be passed to prepare_or_wait_for_session.
205 | 
206 |   with sv.prepare_or_wait_for_session(server.target) as sess:
207 |     train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
208 |     test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
209 |     # Train the model, and also write summaries.
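    # prepare_or_wait_for_session makes the chief run init_op and manage
    # checkpoints in FLAGS.log_dir, while non-chief workers block here until
    # the shared variables have been initialized or restored.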
210 | # Every 10th step, measure test-set accuracy, and write test summaries 211 | # All other steps, run train_step on training data, & add training summaries 212 | 213 | for i in range(FLAGS.max_steps): 214 | if i % 10 == 0: # Record summaries and test-set accuracy 215 | summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) 216 | test_writer.add_summary(summary, i) 217 | print('Accuracy at step %s: %s' % (i, acc)) 218 | else: # Record train set summaries, and train 219 | if i % 100 == 99: # Record execution stats 220 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 221 | run_metadata = tf.RunMetadata() 222 | summary, _ = sess.run([merged, train_step], 223 | feed_dict=feed_dict(True), 224 | options=run_options, 225 | run_metadata=run_metadata) 226 | train_writer.add_run_metadata(run_metadata, 'step%03d' % i) 227 | train_writer.add_summary(summary, i) 228 | print('Adding run metadata for', i) 229 | else: # Record a summary 230 | summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) 231 | train_writer.add_summary(summary, i) 232 | train_writer.close() 233 | test_writer.close() 234 | 235 | 236 | def main(_): 237 | train() 238 | 239 | 240 | if __name__ == '__main__': 241 | parser = argparse.ArgumentParser() 242 | parser.add_argument('--fake_data', nargs='?', const=True, type=bool, 243 | default=False, 244 | help='If true, uses fake data for unit testing.') 245 | parser.add_argument('--max_steps', type=int, default=1000, 246 | help='Number of steps to run trainer.') 247 | parser.add_argument('--learning_rate', type=float, default=0.001, 248 | help='Initial learning rate') 249 | parser.add_argument('--dropout', type=float, default=0.9, 250 | help='Keep probability for training dropout.') 251 | parser.add_argument( 252 | '--data_dir', 253 | type=str, 254 | default=os.path.join(os.getenv('TEST_TMPDIR', '/train'), 255 | 'data'), 256 | help='Directory for storing input data') 257 | parser.add_argument( 258 | '--log_dir', 259 | type=str, 260 | default='/training_logs', 261 | help='Summaries log directory') 262 | FLAGS, unparsed = parser.parse_known_args() 263 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 264 | --------------------------------------------------------------------------------