├── README.md ├── data ├── t10k-images-idx3-ubyte.gz ├── t10k-labels-idx1-ubyte.gz ├── train-images-idx3-ubyte.gz └── train-labels-idx1-ubyte.gz ├── gpu-mem-samples ├── centos │ ├── Dockerfile │ └── main.py └── ubuntu │ ├── Dockerfile │ └── main.py ├── mnist-tf ├── Dockerfile ├── Dockerfile.cpu ├── Makefile ├── main.py └── pip.conf ├── models └── tensorflow │ └── mnist.tar.gz ├── mpijob └── docker │ ├── Dockerfile │ └── Makefile ├── pipelines ├── iris_pipelines.py └── train │ ├── iris_training.csv │ └── train.py └── tfjob └── docker ├── distributed-mnist ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile └── main.py ├── estimator ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile └── mnist_estimator.py ├── export-model ├── Dockerfile ├── Makefile └── export_model.py ├── mnist-client ├── Dockerfile ├── Makefile ├── data │ ├── 0.png │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png └── mnist_client.py ├── mnist ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile ├── export_model.py └── main.py └── v1alpha2 └── distributed-mnist ├── Dockerfile.cpu ├── Dockerfile.gpu ├── Makefile └── main.py /README.md: -------------------------------------------------------------------------------- 1 | # tensorflow sample code 2 | Sample code for running TensorFlow -------------------------------------------------------------------------------- /data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /gpu-mem-samples/centos/Dockerfile: -------------------------------------------------------------------------------- 1 | # docker build -t registry.cn-shanghai.aliyuncs.com/tensorflow-samples/tensorflow-gpu-mem:10.0-runtime-centos7 .
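# To run the built image (assumption: the host has NVIDIA drivers and Docker >= 19.03 with the NVIDIA container runtime installed):
# docker run --rm --gpus all registry.cn-shanghai.aliyuncs.com/tensorflow-samples/tensorflow-gpu-mem:10.0-runtime-centos7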
2 | FROM registry.cn-huhehaote.aliyuncs.com/tensorflow-samples/tensorflow:centos7-cuda10.0-1.14-py36 3 | 4 | ADD main.py /app/main.py 5 | 6 | CMD ["python3","/app/main.py"] 7 | -------------------------------------------------------------------------------- /gpu-mem-samples/centos/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | FLAGS = None 10 | 11 | def train(): 12 | 13 | a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') 14 | b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') 15 | c = tf.matmul(a, b) 16 | 17 | sess = tf.Session() 18 | # Runs the op. 19 | while True: 20 | sess.run(c) 21 | 22 | 23 | if __name__ == '__main__': 24 | train() -------------------------------------------------------------------------------- /gpu-mem-samples/ubuntu/Dockerfile: -------------------------------------------------------------------------------- 1 | # docker build -t registry.cn-shanghai.aliyuncs.com/tensorflow-samples/tensorflow-gpu-mem:debian . 2 | FROM tensorflow/tensorflow:1.14.0-gpu-py3 3 | 4 | ADD main.py /app/main.py 5 | 6 | CMD ["python3","/app/main.py"] 7 | -------------------------------------------------------------------------------- /gpu-mem-samples/ubuntu/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import tensorflow as tf 8 | 9 | FLAGS = None 10 | 11 | def train(): 12 | 13 | a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') 14 | b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') 15 | c = tf.matmul(a, b) 16 | 17 | sess = tf.Session() 18 | # Runs the op. 19 | while True: 20 | sess.run(c) 21 | 22 | 23 | if __name__ == '__main__': 24 | train() -------------------------------------------------------------------------------- /mnist-tf/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0-devel-gpu 2 | RUN mkdir /app 3 | WORKDIR /app 4 | RUN mkdir ./logs 5 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/' /etc/apt/sources.list 6 | COPY pip.conf /root/.pip/pip.conf 7 | 8 | 9 | COPY ./* /app/ -------------------------------------------------------------------------------- /mnist-tf/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | RUN mkdir /app 3 | WORKDIR /app 4 | RUN mkdir ./logs 5 | RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/' /etc/apt/sources.list 6 | COPY pip.conf /root/.pip/pip.conf 7 | 8 | 9 | COPY ./* /app/ -------------------------------------------------------------------------------- /mnist-tf/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:cpu 11 | gpu: 12 | $(DOCKER) build -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:gpu . 
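# Note: images built from this Makefile expect two environment variables at run time (see main.py):
# POD_NAME, e.g. "worker-0" or "ps-0", and CLUSTER_CONFIG, a Python dict literal such as
# {'ps': ['ps-0:2222'], 'worker': ['worker-0:2222', 'worker-1:2222']} (hostnames illustrative).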
13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-k8s:gpu -------------------------------------------------------------------------------- /mnist-tf/main.py: -------------------------------------------------------------------------------- 1 | # Code modified from: https://github.com/ischlag/distributed-tensorflow-example 2 | from __future__ import print_function 3 | import tensorflow as tf 4 | import sys 5 | import time 6 | import os 7 | import ast 8 | 9 | # input flags 10 | tf.app.flags.DEFINE_string("logdir", "", "directory to save summaries") 11 | tf.app.flags.DEFINE_integer("epochs", 20, "number of epochs") 12 | tf.app.flags.DEFINE_integer("batch_size", 100, "batch size") 13 | tf.app.flags.DEFINE_float("lr", 0.0005, "learning rate") 14 | 15 | FLAGS = tf.app.flags.FLAGS 16 | 17 | POD_NAME = os.environ.get('POD_NAME') 18 | CLUSTER_CONFIG = os.environ.get('CLUSTER_CONFIG') 19 | 20 | job_name, task_id = POD_NAME.split('-', 2) 21 | task_id = int(task_id) 22 | cluster_def = ast.literal_eval(CLUSTER_CONFIG) 23 | cluster_spec = tf.train.ClusterSpec(cluster_def) 24 | 25 | is_chief = (job_name == 'worker') and (task_id == 0) 26 | 27 | server = tf.train.Server( 28 | cluster_spec, 29 | job_name=job_name, 30 | task_index=task_id 31 | ) 32 | 33 | if job_name == 'ps': 34 | server.join() 35 | 36 | # config 37 | batch_size = FLAGS.batch_size 38 | learning_rate = FLAGS.lr 39 | training_epochs = FLAGS.epochs 40 | 41 | # load mnist data set 42 | from tensorflow.examples.tutorials.mnist import input_data 43 | mnist = input_data.read_data_sets('MNIST_data', one_hot=True) 44 | 45 | # Between-graph replication 46 | with tf.device(tf.train.replica_device_setter( 47 | worker_device="/job:worker/task:%d" % task_id, 48 | cluster=cluster_spec)): 49 | 50 | # count the number of updates 51 | global_step = tf.get_variable( 52 | 'global_step', 53 | [], 54 | initializer = tf.constant_initializer(0), 55 | trainable = False) 56 | 57 | # input images 58 | with tf.name_scope('input'): 59 | # None -> batch size can be any size, 784 -> flattened mnist image 60 | x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input") 61 | # target 10 output classes 62 | y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input") 63 | 64 | # model parameters will change during training so we use tf.Variable 65 | tf.set_random_seed(1) 66 | with tf.name_scope("weights"): 67 | W1 = tf.Variable(tf.random_normal([784, 100])) 68 | W2 = tf.Variable(tf.random_normal([100, 10])) 69 | 70 | # bias 71 | with tf.name_scope("biases"): 72 | b1 = tf.Variable(tf.zeros([100])) 73 | b2 = tf.Variable(tf.zeros([10])) 74 | 75 | # implement model 76 | with tf.name_scope("softmax"): 77 | # y is our prediction 78 | z2 = tf.add(tf.matmul(x,W1),b1) 79 | a2 = tf.nn.sigmoid(z2) 80 | z3 = tf.add(tf.matmul(a2,W2),b2) 81 | y = tf.nn.softmax(z3) 82 | 83 | # specify cost function 84 | with tf.name_scope('cross_entropy'): 85 | # this is our cost 86 | cross_entropy = tf.reduce_mean( 87 | -tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])) 88 | 89 | # specify optimizer 90 | with tf.name_scope('train'): 91 | # optimizer is an "operation" which we can execute in a session 92 | grad_op = tf.train.GradientDescentOptimizer(learning_rate) 93 | train_op = grad_op.minimize(cross_entropy, global_step=global_step) 94 | 95 | with tf.name_scope('Accuracy'): 96 | # accuracy 97 | correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) 98 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 99 | 100 | # create a summary 
for our cost and accuracy 101 | tf.summary.scalar("cost", cross_entropy) 102 | tf.summary.scalar("accuracy", accuracy) 103 | 104 | # merge all summaries into a single "operation" which we can execute in a session 105 | summary_op = tf.summary.merge_all() 106 | init_op = tf.global_variables_initializer() 107 | print("Variables initialized ...") 108 | 109 | sv = tf.train.Supervisor(is_chief=is_chief, 110 | global_step=global_step, 111 | init_op=init_op) 112 | 113 | begin_time = time.time() 114 | frequency = 100 115 | with sv.prepare_or_wait_for_session(server.target) as sess: 116 | # create log writer object (this will log on every machine) 117 | writer = tf.summary.FileWriter(FLAGS.logdir, graph=tf.get_default_graph()) 118 | 119 | # perform training cycles 120 | start_time = time.time() 121 | for epoch in range(training_epochs): 122 | 123 | # number of batches in one epoch 124 | batch_count = int(mnist.train.num_examples/batch_size) 125 | 126 | count = 0 127 | for i in range(batch_count): 128 | batch_x, batch_y = mnist.train.next_batch(batch_size) 129 | 130 | # perform the operations we defined earlier on batch 131 | _, cost, summary, step = sess.run([train_op, cross_entropy, summary_op, global_step], 132 | feed_dict={x: batch_x, y_: batch_y}) 133 | writer.add_summary(summary, step) 134 | 135 | count += 1 136 | if count % frequency == 0 or i+1 == batch_count: 137 | elapsed_time = time.time() - start_time 138 | start_time = time.time() 139 | print("Step: %d," % (step+1), 140 | " Epoch: %2d," % (epoch+1), 141 | " Batch: %3d of %3d," % (i+1, batch_count), 142 | " Cost: %.4f," % cost, 143 | " AvgTime: %3.2fms" % float(elapsed_time*1000/frequency)) 144 | count = 0 145 | 146 | 147 | print("Test-Accuracy: %2.2f" % sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels})) 148 | print("Total Time: %3.2fs" % float(time.time() - begin_time)) 149 | print("Final Cost: %.4f" % cost) 150 | 151 | sv.stop() 152 | print("done") -------------------------------------------------------------------------------- /mnist-tf/pip.conf: -------------------------------------------------------------------------------- 1 | [global] 2 | index-url = http://mirrors.aliyun.com/pypi/simple/ 3 | 4 | [install] 5 | trusted-host=mirrors.aliyun.com -------------------------------------------------------------------------------- /models/tensorflow/mnist.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/models/tensorflow/mnist.tar.gz -------------------------------------------------------------------------------- /mpijob/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM uber/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5 2 | 3 | RUN cd / && \ 4 | git clone -b cnn_tf_v1.9_compatible https://github.com/tensorflow/benchmarks.git 5 | 6 | CMD ["bash", "-c", "mpirun python /benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model resnet101 --batch_size 64 --variable_update horovod --train_dir=/training_logs --summary_verbosity=3 --save_summaries_steps=10"] -------------------------------------------------------------------------------- /mpijob/docker/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: gpu 7 | 8 | gpu: 9 | $(DOCKER) build --no-cache -f Dockerfile -t 
registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5 . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/horovod:0.13.11-tf1.10.0-torch0.4.0-py3.5 -------------------------------------------------------------------------------- /pipelines/iris_pipelines.py: -------------------------------------------------------------------------------- 1 | pipelines.py -------------------------------------------------------------------------------- /pipelines/train/iris_training.csv: -------------------------------------------------------------------------------- 1 | 120,4,setosa,versicolor,virginica 2 | 6.4,2.8,5.6,2.2,2 3 | 5.0,2.3,3.3,1.0,1 4 | 4.9,2.5,4.5,1.7,2 5 | 4.9,3.1,1.5,0.1,0 6 | 5.7,3.8,1.7,0.3,0 7 | 4.4,3.2,1.3,0.2,0 8 | 5.4,3.4,1.5,0.4,0 9 | 6.9,3.1,5.1,2.3,2 10 | 6.7,3.1,4.4,1.4,1 11 | 5.1,3.7,1.5,0.4,0 12 | 5.2,2.7,3.9,1.4,1 13 | 6.9,3.1,4.9,1.5,1 14 | 5.8,4.0,1.2,0.2,0 15 | 5.4,3.9,1.7,0.4,0 16 | 7.7,3.8,6.7,2.2,2 17 | 6.3,3.3,4.7,1.6,1 18 | 6.8,3.2,5.9,2.3,2 19 | 7.6,3.0,6.6,2.1,2 20 | 6.4,3.2,5.3,2.3,2 21 | 5.7,4.4,1.5,0.4,0 22 | 6.7,3.3,5.7,2.1,2 23 | 6.4,2.8,5.6,2.1,2 24 | 5.4,3.9,1.3,0.4,0 25 | 6.1,2.6,5.6,1.4,2 26 | 7.2,3.0,5.8,1.6,2 27 | 5.2,3.5,1.5,0.2,0 28 | 5.8,2.6,4.0,1.2,1 29 | 5.9,3.0,5.1,1.8,2 30 | 5.4,3.0,4.5,1.5,1 31 | 6.7,3.0,5.0,1.7,1 32 | 6.3,2.3,4.4,1.3,1 33 | 5.1,2.5,3.0,1.1,1 34 | 6.4,3.2,4.5,1.5,1 35 | 6.8,3.0,5.5,2.1,2 36 | 6.2,2.8,4.8,1.8,2 37 | 6.9,3.2,5.7,2.3,2 38 | 6.5,3.2,5.1,2.0,2 39 | 5.8,2.8,5.1,2.4,2 40 | 5.1,3.8,1.5,0.3,0 41 | 4.8,3.0,1.4,0.3,0 42 | 7.9,3.8,6.4,2.0,2 43 | 5.8,2.7,5.1,1.9,2 44 | 6.7,3.0,5.2,2.3,2 45 | 5.1,3.8,1.9,0.4,0 46 | 4.7,3.2,1.6,0.2,0 47 | 6.0,2.2,5.0,1.5,2 48 | 4.8,3.4,1.6,0.2,0 49 | 7.7,2.6,6.9,2.3,2 50 | 4.6,3.6,1.0,0.2,0 51 | 7.2,3.2,6.0,1.8,2 52 | 5.0,3.3,1.4,0.2,0 53 | 6.6,3.0,4.4,1.4,1 54 | 6.1,2.8,4.0,1.3,1 55 | 5.0,3.2,1.2,0.2,0 56 | 7.0,3.2,4.7,1.4,1 57 | 6.0,3.0,4.8,1.8,2 58 | 7.4,2.8,6.1,1.9,2 59 | 5.8,2.7,5.1,1.9,2 60 | 6.2,3.4,5.4,2.3,2 61 | 5.0,2.0,3.5,1.0,1 62 | 5.6,2.5,3.9,1.1,1 63 | 6.7,3.1,5.6,2.4,2 64 | 6.3,2.5,5.0,1.9,2 65 | 6.4,3.1,5.5,1.8,2 66 | 6.2,2.2,4.5,1.5,1 67 | 7.3,2.9,6.3,1.8,2 68 | 4.4,3.0,1.3,0.2,0 69 | 7.2,3.6,6.1,2.5,2 70 | 6.5,3.0,5.5,1.8,2 71 | 5.0,3.4,1.5,0.2,0 72 | 4.7,3.2,1.3,0.2,0 73 | 6.6,2.9,4.6,1.3,1 74 | 5.5,3.5,1.3,0.2,0 75 | 7.7,3.0,6.1,2.3,2 76 | 6.1,3.0,4.9,1.8,2 77 | 4.9,3.1,1.5,0.1,0 78 | 5.5,2.4,3.8,1.1,1 79 | 5.7,2.9,4.2,1.3,1 80 | 6.0,2.9,4.5,1.5,1 81 | 6.4,2.7,5.3,1.9,2 82 | 5.4,3.7,1.5,0.2,0 83 | 6.1,2.9,4.7,1.4,1 84 | 6.5,2.8,4.6,1.5,1 85 | 5.6,2.7,4.2,1.3,1 86 | 6.3,3.4,5.6,2.4,2 87 | 4.9,3.1,1.5,0.1,0 88 | 6.8,2.8,4.8,1.4,1 89 | 5.7,2.8,4.5,1.3,1 90 | 6.0,2.7,5.1,1.6,1 91 | 5.0,3.5,1.3,0.3,0 92 | 6.5,3.0,5.2,2.0,2 93 | 6.1,2.8,4.7,1.2,1 94 | 5.1,3.5,1.4,0.3,0 95 | 4.6,3.1,1.5,0.2,0 96 | 6.5,3.0,5.8,2.2,2 97 | 4.6,3.4,1.4,0.3,0 98 | 4.6,3.2,1.4,0.2,0 99 | 7.7,2.8,6.7,2.0,2 100 | 5.9,3.2,4.8,1.8,1 101 | 5.1,3.8,1.6,0.2,0 102 | 4.9,3.0,1.4,0.2,0 103 | 4.9,2.4,3.3,1.0,1 104 | 4.5,2.3,1.3,0.3,0 105 | 5.8,2.7,4.1,1.0,1 106 | 5.0,3.4,1.6,0.4,0 107 | 5.2,3.4,1.4,0.2,0 108 | 5.3,3.7,1.5,0.2,0 109 | 5.0,3.6,1.4,0.2,0 110 | 5.6,2.9,3.6,1.3,1 111 | 4.8,3.1,1.6,0.2,0 112 | 6.3,2.7,4.9,1.8,2 113 | 5.7,2.8,4.1,1.3,1 114 | 5.0,3.0,1.6,0.2,0 115 | 6.3,3.3,6.0,2.5,2 116 | 5.0,3.5,1.6,0.6,0 117 | 5.5,2.6,4.4,1.2,1 118 | 5.7,3.0,4.2,1.2,1 119 | 4.4,2.9,1.4,0.2,0 120 | 4.8,3.0,1.4,0.1,0 121 | 5.5,2.4,3.7,1.0,1 122 | -------------------------------------------------------------------------------- /pipelines/train/train.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | from tensorflow.contrib.learn.python.learn.datasets import base 5 | 6 | IRIS_TRAIN = 'iris_training.csv' 7 | IRIS_TEST = 'iris_test.csv' 8 | 9 | # load_csv_with_header expects the first CSV row to encode the number of samples, 10 | # the number of features and the class names (see iris_training.csv above). 11 | train_set = base.load_csv_with_header(filename=IRIS_TRAIN, 12 | features_dtype=np.float32, 13 | target_dtype=np.int) 14 | 15 | test_set = base.load_csv_with_header(filename=IRIS_TEST, 16 | features_dtype=np.float32, 17 | target_dtype=np.int) -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | COPY main.py /app/main.py 3 | 4 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0-gpu 2 | COPY main.py /app/main.py 3 | 4 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:gpu . 13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed:gpu 14 | -------------------------------------------------------------------------------- /tfjob/docker/distributed-mnist/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A simple MNIST classifier which displays summaries in TensorBoard. 16 | This is an unimpressive MNIST model, but it is a good example of using 17 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of 18 | naming summary tags so that they are grouped meaningfully in TensorBoard. 19 | It demonstrates the functionality of every TensorBoard dashboard.
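When launched as a TFJob, the TF_CONFIG environment variable supplies the cluster
layout that train() parses below; an illustrative value (hostnames are examples) is
{"cluster": {"ps": ["ps-0:2222"], "worker": ["worker-0:2222"], "master": ["master-0:2222"]}, "task": {"type": "master", "index": 0}}.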
20 | """ 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import argparse 26 | import os 27 | import sys 28 | import ast 29 | import json 30 | 31 | import tensorflow as tf 32 | 33 | from tensorflow.examples.tutorials.mnist import input_data 34 | 35 | FLAGS = None 36 | 37 | def train(): 38 | tf_config_json = os.environ.get("TF_CONFIG", "{}") 39 | tf_config = json.loads(tf_config_json) 40 | 41 | task = tf_config.get("task", {}) 42 | cluster_spec = tf_config.get("cluster", {}) 43 | cluster_spec_object = tf.train.ClusterSpec(cluster_spec) 44 | job_name = task["type"] 45 | task_id = task["index"] 46 | server_def = tf.train.ServerDef( 47 | cluster=cluster_spec_object.as_cluster_def(), 48 | protocol="grpc", 49 | job_name=job_name, 50 | task_index=task_id) 51 | server = tf.train.Server(server_def) 52 | 53 | is_chief = (job_name == 'master') 54 | if job_name == 'ps': 55 | server.join() 56 | 57 | if is_chief: 58 | print("Worker %d: Initializing session..." % task_id) 59 | tf.reset_default_graph() 60 | else: 61 | print("Worker %d: Waiting for session to be initialized..." % task_id) 62 | 63 | 64 | # Import data 65 | mnist = input_data.read_data_sets(FLAGS.data_dir, 66 | one_hot=True, 67 | fake_data=FLAGS.fake_data) 68 | 69 | 70 | # Create a multilayer model. 71 | 72 | 73 | # Between-graph replication 74 | with tf.device(tf.train.replica_device_setter( 75 | worker_device="/job:{0}/task:{1}".format(job_name,task_id), 76 | cluster=cluster_spec)): 77 | # with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)): 78 | # worker_device="/job:{0}/task:{1}".format(job_name,task_id), 79 | # cluster=cluster_spec)): 80 | 81 | # count the number of updates 82 | global_step = tf.get_variable( 83 | 'global_step', 84 | [], 85 | initializer = tf.constant_initializer(0), 86 | trainable = False) 87 | 88 | # Input placeholders 89 | with tf.name_scope('input'): 90 | x = tf.placeholder(tf.float32, [None, 784], name='x-input') 91 | y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') 92 | 93 | with tf.name_scope('input_reshape'): 94 | image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) 95 | tf.summary.image('input', image_shaped_input, 10) 96 | 97 | # We can't initialize these variables to 0 - the network will get stuck. 98 | def weight_variable(shape): 99 | """Create a weight variable with appropriate initialization.""" 100 | initial = tf.truncated_normal(shape, stddev=0.1) 101 | return tf.Variable(initial) 102 | 103 | def bias_variable(shape): 104 | """Create a bias variable with appropriate initialization.""" 105 | initial = tf.constant(0.1, shape=shape) 106 | return tf.Variable(initial) 107 | 108 | def variable_summaries(var): 109 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 110 | with tf.name_scope('summaries'): 111 | mean = tf.reduce_mean(var) 112 | tf.summary.scalar('mean', mean) 113 | with tf.name_scope('stddev'): 114 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 115 | tf.summary.scalar('stddev', stddev) 116 | tf.summary.scalar('max', tf.reduce_max(var)) 117 | tf.summary.scalar('min', tf.reduce_min(var)) 118 | tf.summary.histogram('histogram', var) 119 | 120 | def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): 121 | """Reusable code for making a simple neural net layer. 122 | It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. 
123 | It also sets up name scoping so that the resultant graph is easy to read, 124 | and adds a number of summary ops. 125 | """ 126 | # Adding a name scope ensures logical grouping of the layers in the graph. 127 | with tf.name_scope(layer_name): 128 | # This Variable will hold the state of the weights for the layer 129 | with tf.name_scope('weights'): 130 | weights = weight_variable([input_dim, output_dim]) 131 | variable_summaries(weights) 132 | with tf.name_scope('biases'): 133 | biases = bias_variable([output_dim]) 134 | variable_summaries(biases) 135 | with tf.name_scope('Wx_plus_b'): 136 | preactivate = tf.matmul(input_tensor, weights) + biases 137 | tf.summary.histogram('pre_activations', preactivate) 138 | activations = act(preactivate, name='activation') 139 | tf.summary.histogram('activations', activations) 140 | return activations 141 | 142 | hidden1 = nn_layer(x, 784, 500, 'layer1') 143 | 144 | with tf.name_scope('dropout'): 145 | keep_prob = tf.placeholder_with_default(1.0, shape=()) 146 | tf.summary.scalar('dropout_keep_probability', keep_prob) 147 | dropped = tf.nn.dropout(hidden1, keep_prob) 148 | 149 | # Do not apply softmax activation yet, see below. 150 | y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) 151 | 152 | with tf.name_scope('cross_entropy'): 153 | # The raw formulation of cross-entropy, 154 | # 155 | # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), 156 | # reduction_indices=[1])) 157 | # 158 | # can be numerically unstable. 159 | # 160 | # So here we use tf.nn.softmax_cross_entropy_with_logits on the 161 | # raw outputs of the nn_layer above, and then average across 162 | # the batch. 163 | #diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y) 164 | # with tf.name_scope('total'): 165 | #cross_entropy = tf.reduce_mean(diff) 166 | logits = tf.nn.softmax(y, name='logits') 167 | cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(logits,1e-10,1.0)), name='cross_entropy') 168 | tf.summary.scalar('cross_entropy', cross_entropy) 169 | 170 | with tf.name_scope('train'): 171 | train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize( 172 | cross_entropy) 173 | 174 | with tf.name_scope('accuracy'): 175 | with tf.name_scope('correct_prediction'): 176 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 177 | with tf.name_scope('accuracy'): 178 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 179 | tf.summary.scalar('accuracy', accuracy) 180 | 181 | # Merge all the summaries and write them out to 182 | # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default) 183 | merged = tf.summary.merge_all() 184 | 185 | init_op = tf.global_variables_initializer() 186 | 187 | def feed_dict(train): 188 | """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" 189 | if train or FLAGS.fake_data: 190 | xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) 191 | k = FLAGS.dropout 192 | else: 193 | xs, ys = mnist.test.images, mnist.test.labels 194 | k = 1.0 195 | return {x: xs, y_: ys, keep_prob: k} 196 | 197 | 198 | 199 | sv = tf.train.Supervisor(is_chief=is_chief, 200 | global_step=global_step, 201 | init_op=init_op, 202 | logdir=FLAGS.log_dir) 203 | # sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True, 204 | # device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.worker_index]) 205 | 206 | with sv.prepare_or_wait_for_session(server.target) as sess: 207 | train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph) 208 | 
test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test') 209 | # Train the model, and also write summaries. 210 | # Every 10th step, measure test-set accuracy, and write test summaries 211 | # All other steps, run train_step on training data, & add training summaries 212 | 213 | for i in range(FLAGS.max_steps): 214 | if i % 10 == 0: # Record summaries and test-set accuracy 215 | summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) 216 | test_writer.add_summary(summary, i) 217 | print('Accuracy at step %s: %s' % (i, acc)) 218 | else: # Record train set summaries, and train 219 | if i % 100 == 99: # Record execution stats 220 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 221 | run_metadata = tf.RunMetadata() 222 | summary, _ = sess.run([merged, train_step], 223 | feed_dict=feed_dict(True), 224 | options=run_options, 225 | run_metadata=run_metadata) 226 | train_writer.add_run_metadata(run_metadata, 'step%03d' % i) 227 | train_writer.add_summary(summary, i) 228 | print('Adding run metadata for', i) 229 | else: # Record a summary 230 | summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) 231 | train_writer.add_summary(summary, i) 232 | train_writer.close() 233 | test_writer.close() 234 | 235 | 236 | def main(_): 237 | train() 238 | 239 | 240 | if __name__ == '__main__': 241 | parser = argparse.ArgumentParser() 242 | parser.add_argument('--fake_data', nargs='?', const=True, type=bool, 243 | default=False, 244 | help='If true, uses fake data for unit testing.') 245 | parser.add_argument('--max_steps', type=int, default=1000, 246 | help='Number of steps to run trainer.') 247 | parser.add_argument('--learning_rate', type=float, default=0.001, 248 | help='Initial learning rate') 249 | parser.add_argument('--dropout', type=float, default=0.9, 250 | help='Keep probability for training dropout.') 251 | parser.add_argument( 252 | '--data_dir', 253 | type=str, 254 | default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), 255 | 'data'), 256 | help='Directory for storing input data') 257 | parser.add_argument( 258 | '--log_dir', 259 | type=str, 260 | default=os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), 261 | 'tensorflow/logs'), 262 | help='Summaries log directory') 263 | FLAGS, unparsed = parser.parse_known_args() 264 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 265 | -------------------------------------------------------------------------------- /tfjob/docker/estimator/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.10.1-py3 2 | 3 | RUN mkdir -p /app/MNIST/ && \ 4 | cd /app/MNIST/ && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | RUN sed -i 's/https:\/\/storage.googleapis.com\/cvdf-datasets\/mnist\//http:\/\/kubeflow-oss.oss-cn-hangzhou.aliyuncs.com\/tensorflow\/input_data\//g' /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py 11 | 12 | COPY mnist_estimator.py /app/mnist_estimator.py 13 | 14 | ENTRYPOINT ["python", "/app/mnist_estimator.py"] 
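# Example local run (flags as defined in mnist_estimator.py; the paths are illustrative):
# docker run --rm registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:cpu --steps=100 --model_dir=/tmp/models/ckpt/ --saved_dir=/tmp/models/pb/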
-------------------------------------------------------------------------------- /tfjob/docker/estimator/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.10.1-gpu-py3 2 | 3 | RUN mkdir -p /app/MNIST/ && \ 4 | cd /app/MNIST/ && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | RUN sed -i 's/https:\/\/storage.googleapis.com\/cvdf-datasets\/mnist\//http:\/\/kubeflow-oss.oss-cn-hangzhou.aliyuncs.com\/tensorflow\/input_data\//g' /usr/local/lib/python3.5/dist-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py 11 | 12 | COPY mnist_estimator.py /app/mnist_estimator.py 13 | 14 | ENTRYPOINT ["python", "/app/mnist_estimator.py"] -------------------------------------------------------------------------------- /tfjob/docker/estimator/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:gpu . 
13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/mnist-estimator:gpu 14 | -------------------------------------------------------------------------------- /tfjob/docker/estimator/mnist_estimator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | tf.logging.set_verbosity(tf.logging.INFO) 9 | 10 | tf.app.flags.DEFINE_integer('steps', 100, 'The number of steps to train a model') 11 | tf.app.flags.DEFINE_string('model_dir', './models/ckpt/', 'Dir to save a model and checkpoints') 12 | tf.app.flags.DEFINE_string('saved_dir', './models/pb/', 'Dir to save a model for TF serving') 13 | FLAGS = tf.app.flags.FLAGS 14 | 15 | INPUT_FEATURE = 'image' 16 | NUM_CLASSES = 10 17 | 18 | 19 | def cnn_model_fn(features, labels, mode): 20 | """Model function for CNN.""" 21 | # Input Layer 22 | input_layer = features[INPUT_FEATURE] 23 | 24 | # First convolutional Layer and pooling layer 25 | conv1 = tf.layers.conv2d( 26 | inputs=input_layer, 27 | filters=32, 28 | kernel_size=[5, 5], 29 | padding="same", 30 | activation=None) 31 | batch_norm1 = tf.layers.batch_normalization(conv1) 32 | relu1 = tf.nn.relu(batch_norm1) 33 | pool1 = tf.layers.max_pooling2d(inputs=relu1, pool_size=[2, 2], strides=2) 34 | 35 | # Second convolutional Layer and pooling layer 36 | conv2 = tf.layers.conv2d( 37 | inputs=pool1, 38 | filters=64, 39 | kernel_size=[5, 5], 40 | padding="same", 41 | activation=None) 42 | batch_norm2 = tf.layers.batch_normalization(conv2) 43 | relu2 = tf.nn.relu(batch_norm2) 44 | pool2 = tf.layers.max_pooling2d(inputs=relu2, pool_size=[2, 2], strides=2) 45 | 46 | # Flatten tensor into a batch of vectors 47 | pool2_flat = tf.layers.flatten(pool2) 48 | 49 | # Dense Layer 50 | dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) 51 | 52 | # Add dropout operation 53 | dropout = tf.layers.dropout( 54 | inputs=dense, rate=0.4, training=(mode == tf.estimator.ModeKeys.TRAIN)) 55 | 56 | # Logits layer 57 | logits = tf.layers.dense(inputs=dropout, units=NUM_CLASSES) 58 | 59 | predictions = { 60 | # Generate predictions (for PREDICT and EVAL mode) 61 | "classes": tf.argmax(input=logits, axis=1), 62 | # Add `softmax_tensor` to the graph. It is used for PREDICT and by the 63 | # `logging_hook`. 
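# (tensors_to_log in main() below refers to this tensor by its graph name "softmax_tensor".)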
64 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 65 | } 66 | 67 | # PREDICT mode 68 | if mode == tf.estimator.ModeKeys.PREDICT: 69 | return tf.estimator.EstimatorSpec( 70 | mode=mode, 71 | predictions=predictions, 72 | export_outputs={ 73 | 'predict': tf.estimator.export.PredictOutput(predictions) 74 | }) 75 | 76 | # Calculate Loss (for both TRAIN and EVAL modes) 77 | loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) 78 | 79 | # Configure the Training Op (for TRAIN mode) 80 | if mode == tf.estimator.ModeKeys.TRAIN: 81 | optimizer = tf.train.AdamOptimizer() 82 | train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) 83 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) 84 | 85 | # Add evaluation metrics (for EVAL mode) 86 | eval_metric_ops = { 87 | "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"]) 88 | } 89 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) 90 | 91 | 92 | def serving_input_receiver_fn(): 93 | """ 94 | This is used to define inputs to serve the model. 95 | :return: ServingInputReceiver 96 | """ 97 | receiver_tensors = { 98 | # The input image size is flexible. 99 | INPUT_FEATURE: tf.placeholder(tf.float32, [None, None, None, 1]), 100 | } 101 | 102 | # Convert the given inputs to match what the model expects. 103 | features = { 104 | # Resize given images. 105 | INPUT_FEATURE: tf.image.resize_images(receiver_tensors[INPUT_FEATURE], [28, 28]), 106 | } 107 | return tf.estimator.export.ServingInputReceiver(receiver_tensors=receiver_tensors, 108 | features=features) 109 | 110 | 111 | def main(_): 112 | # Load training and eval data 113 | mnist = tf.contrib.learn.datasets.load_dataset("mnist") 114 | train_data = mnist.train.images # Returns np.array 115 | train_labels = np.asarray(mnist.train.labels, dtype=np.int32) 116 | eval_data = mnist.test.images # Returns np.array 117 | eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) 118 | 119 | # reshape images 120 | # To feed the inputs as images, we reshape them beforehand.
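# (MNIST images arrive flattened as (num_examples, 784); the conv layers expect NHWC, i.e. (num_examples, 28, 28, 1).)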
121 | train_data = train_data.reshape(train_data.shape[0], 28, 28, 1) 122 | eval_data = eval_data.reshape(eval_data.shape[0], 28, 28, 1) 123 | 124 | # Create the Estimator 125 | training_config = tf.estimator.RunConfig( 126 | model_dir=FLAGS.model_dir, 127 | save_summary_steps=20, 128 | save_checkpoints_steps=20) 129 | classifier = tf.estimator.Estimator( 130 | model_fn=cnn_model_fn, 131 | model_dir=FLAGS.model_dir, 132 | config=training_config) 133 | 134 | # Set up logging for predictions 135 | # Log the values in the "Softmax" tensor with label "probabilities" 136 | tensors_to_log = {"probabilities": "softmax_tensor"} 137 | logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50) 138 | 139 | # Train the model (--steps controls the number of training steps) 140 | train_input_fn = tf.estimator.inputs.numpy_input_fn( 141 | x={INPUT_FEATURE: train_data}, 142 | y=train_labels, 143 | batch_size=100, 144 | num_epochs=None, 145 | shuffle=True) 146 | train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, 147 | max_steps=FLAGS.steps, 148 | hooks=[logging_hook]) 149 | # classifier.train( 150 | # input_fn=train_input_fn, 151 | # steps=100, 152 | # hooks=[logging_hook]) 153 | 154 | # Evaluate the model and print results 155 | eval_input_fn = tf.estimator.inputs.numpy_input_fn( 156 | x={INPUT_FEATURE: eval_data}, 157 | y=eval_labels, 158 | num_epochs=1, 159 | shuffle=False) 160 | eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) 161 | # eval_results = classifier.evaluate(input_fn=eval_input_fn) 162 | # print(eval_results) 163 | tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec) 164 | 165 | # Save the model 166 | classifier.export_savedmodel(FLAGS.saved_dir, 167 | serving_input_receiver_fn=serving_input_receiver_fn) 168 | 169 | 170 | if __name__ == "__main__": 171 | tf.app.run() -------------------------------------------------------------------------------- /tfjob/docker/export-model/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | COPY export_model.py /app/export_model.py 3 | 4 | ENTRYPOINT ["python", "/app/export_model.py"] -------------------------------------------------------------------------------- /tfjob/docker/export-model/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/export-model . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/export-model -------------------------------------------------------------------------------- /tfjob/docker/export-model/export_model.py: -------------------------------------------------------------------------------- 1 | """Export a given TensorFlow model. 2 | The model is a pretrained "MNIST" model, which is saved as a TensorFlow checkpoint. This program 3 | simply uses TensorFlow SavedModel to 4 | export the trained model with proper signatures that can be loaded by standard 5 | tensorflow_model_server.
6 | Usage: export_model.py [--model_version=y] [--checkpoint_path=checkpoint_oss_path] [--checkpoint_step=checkpoint_step] export_dir 7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import tensorflow as tf 13 | from tensorflow.python.saved_model import builder as saved_model_builder 14 | from tensorflow.python.saved_model import signature_constants 15 | from tensorflow.python.saved_model import signature_def_utils 16 | from tensorflow.python.saved_model import tag_constants 17 | from tensorflow.python.saved_model import utils 18 | from tensorflow.python.util import compat 19 | from tensorflow.examples.tutorials.mnist import input_data as mnist_input_data 20 | 21 | tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the exported model.') 22 | tf.app.flags.DEFINE_integer('checkpoint_step', 0, 'Checkpoint steps that we export.') 23 | tf.app.flags.DEFINE_string('checkpoint_path', None, 'Checkpoints path.') 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | 27 | def main(_): 28 | if len(sys.argv) < 2 or sys.argv[-1].startswith('-'): 29 | print('Usage: export_model.py ' 30 | '[--model_version=y] [--checkpoint_path=checkpoint_store_path] [--checkpoint_step=checkpoint_step] export_dir') 31 | sys.exit(-1) 32 | if FLAGS.model_version <= 0: 33 | print('Please specify a positive value for exported servable version number.') 34 | sys.exit(-1) 35 | if not FLAGS.checkpoint_path: 36 | print('Please specify the correct path where checkpoints are stored locally or in OSS.') 37 | sys.exit(-1) 38 | 39 | checkpoint_basename="model.ckpt" 40 | default_meta_graph_suffix='.meta' 41 | ckpt_path=os.path.join(FLAGS.checkpoint_path, checkpoint_basename + '-' + str(FLAGS.checkpoint_step)) 42 | meta_graph_file=ckpt_path + default_meta_graph_suffix 43 | with tf.Session() as new_sess: 44 | # with new_sess.graph.as_default(): 45 | # tf.reset_default_graph() 46 | # new_sess.run(tf.initialize_all_variables()) 47 | new_saver = tf.train.import_meta_graph(meta_graph_file, clear_devices=True) #'/test/mnistoutput/ckpt.meta') 48 | new_saver.restore(new_sess, ckpt_path) #'/test/mnistoutput/ckpt') 49 | new_graph = tf.get_default_graph() 50 | new_x = new_graph.get_tensor_by_name('input/x-input:0') 51 | print(new_x) 52 | new_y = new_graph.get_tensor_by_name('cross_entropy/logits:0') 53 | print(new_y) 54 | 55 | # Export model 56 | # WARNING(break-tutorial-inline-code): The following code snippet is 57 | # in-lined in tutorials, please update tutorial documents accordingly 58 | # whenever code changes. 59 | export_path_base = sys.argv[-1] 60 | export_path = os.path.join( 61 | compat.as_bytes(export_path_base), 62 | compat.as_bytes(str(FLAGS.model_version))) 63 | print('Exporting trained model to', export_path) 64 | builder = saved_model_builder.SavedModelBuilder(export_path) 65 | 66 | # Build the signature_def_map.
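# (TensorFlow Serving will expose this signature under the key 'predict_images'; mnist_client.py requests it by that name.)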
67 | tensor_info_x = utils.build_tensor_info(new_x) 68 | tensor_info_y = utils.build_tensor_info(new_y) 69 | 70 | prediction_signature = signature_def_utils.build_signature_def( 71 | inputs={'images': tensor_info_x}, 72 | outputs={'scores': tensor_info_y}, 73 | method_name=signature_constants.PREDICT_METHOD_NAME) 74 | 75 | legacy_init_op = tf.group(tf.initialize_all_tables(), name='legacy_init_op') 76 | 77 | builder.add_meta_graph_and_variables( 78 | new_sess, [tag_constants.SERVING], 79 | signature_def_map={ 80 | 'predict_images': 81 | prediction_signature, 82 | }, 83 | legacy_init_op=legacy_init_op, 84 | clear_devices=True) 85 | builder.save() 86 | 87 | print('Done exporting!') 88 | 89 | if __name__ == '__main__': 90 | tf.app.run() -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | 3 | WORKDIR /app 4 | ADD mnist_client.py /app 5 | ADD data /app 6 | ADD requirements.txt /app 7 | 8 | RUN pip install -r /app/requirements.txt 9 | 10 | ENTRYPOINT ["tail", "-f", "/dev/null"] -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: 7 | $(DOCKER) build --no-cache -f Dockerfile -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-client-demo . -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/0.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/1.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/2.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/3.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/4.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/5.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/6.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/7.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/8.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/data/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cheyang/tensorflow-sample-code/7386f42034c347910987e754a401ce052c9516f6/tfjob/docker/mnist-client/data/9.png -------------------------------------------------------------------------------- /tfjob/docker/mnist-client/mnist_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | 3 | import os 4 | import random 5 | import numpy 6 | 7 | from PIL import Image 8 | 9 | import tensorflow as tf 10 | from tensorflow.examples.tutorials.mnist import input_data 11 | from tensorflow_serving.apis import predict_pb2 12 | from tensorflow_serving.apis import prediction_service_pb2 13 | 14 | from grpc.beta import implementations 15 | 16 | from mnist import MNIST # pylint: disable=no-name-in-module 17 | 18 | TF_MODEL_SERVER_HOST = os.getenv("TF_MODEL_SERVER_HOST", "127.0.0.1") 19 | TF_MODEL_SERVER_PORT = int(os.getenv("TF_MODEL_SERVER_PORT", 9000)) 20 | TF_DATA_DIR = os.getenv("TF_DATA_DIR", "/tmp/data/") 21 | TF_MNIST_IMAGE_PATH = os.getenv("TF_MNIST_IMAGE_PATH", None) 22 | TF_MNIST_TEST_IMAGE_NUMBER = int(os.getenv("TF_MNIST_TEST_IMAGE_NUMBER", -1)) 23 | 24 | if TF_MNIST_IMAGE_PATH != None: 25 | raw_image = Image.open(TF_MNIST_IMAGE_PATH) 26 | int_image = numpy.array(raw_image) 27 | image = numpy.reshape(int_image, 784).astype(numpy.float32) 28 | elif TF_MNIST_TEST_IMAGE_NUMBER > -1: 29 | test_data_set = input_data.read_data_sets(TF_DATA_DIR, one_hot=True).test 30 | image = test_data_set.images[TF_MNIST_TEST_IMAGE_NUMBER] 31 | else: 32 | test_data_set = input_data.read_data_sets(TF_DATA_DIR, one_hot=True).test 33 | image = random.choice(test_data_set.images) 34 | 35 | channel = implementations.insecure_channel( 36 | TF_MODEL_SERVER_HOST, TF_MODEL_SERVER_PORT) 37 | stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) 38 | 39 | request = predict_pb2.PredictRequest() 40 | request.model_spec.name = "mnist" 41 | request.model_spec.signature_name = "predict_images" 42 | request.inputs['images'].CopyFrom( 43 | tf.contrib.util.make_tensor_proto(image, shape=[1, 784])) 44 | 45 | result = stub.Predict(request, 10.0) # 10 secs timeout 46 | 47 | # 
print(result) 48 | print(MNIST.display(image, threshold=0)) 49 | response = numpy.array( 50 | result.outputs['scores'].float_val) 51 | prediction = numpy.argmax(response) 52 | # print(prediction) 53 | print("Your model says the above number is... %d!" % 54 | prediction) -------------------------------------------------------------------------------- /tfjob/docker/mnist/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.4.0 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/mnist/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.4.0-gpu 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/mnist/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:gpu . 13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-standalone:gpu -------------------------------------------------------------------------------- /tfjob/docker/mnist/export_model.py: -------------------------------------------------------------------------------- 1 | """Export a given TensorFlow model. 2 | The model is a pretrained "MNIST" model, which is saved as a TensorFlow checkpoint. This program 3 | simply uses TensorFlow SavedModel to 4 | export the trained model with proper signatures that can be loaded by standard 5 | tensorflow_model_server.
6 | Usage: export_model.py [--model_version=y] [--checkpoint_path=checkpoint_oss_path] [--checkpoint_step=checkpoint_step] export_dir 7 | """ 8 | 9 | import os 10 | import sys 11 | 12 | import tensorflow as tf 13 | from tensorflow.python.saved_model import builder as saved_model_builder 14 | from tensorflow.python.saved_model import signature_constants 15 | from tensorflow.python.saved_model import signature_def_utils 16 | from tensorflow.python.saved_model import tag_constants 17 | from tensorflow.python.saved_model import utils 18 | from tensorflow.python.util import compat 19 | from tensorflow.examples.tutorials.mnist import input_data as mnist_input_data 20 | 21 | tf.app.flags.DEFINE_integer('model_version', 1, 'version number of the exported model.') 22 | tf.app.flags.DEFINE_integer('checkpoint_step', 0, 'Checkpoint steps that we export.') 23 | tf.app.flags.DEFINE_string('checkpoint_path', None, 'Checkpoints path.') 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | 27 | def main(_): 28 | if len(sys.argv) < 2 or sys.argv[-1].startswith('-'): 29 | print('Usage: export_model.py ' 30 | '[--model_version=y] [--checkpoint_path=checkpoint_store_path] [--checkpoint_step=checkpoint_step] export_dir') 31 | sys.exit(-1) 32 | if FLAGS.model_version <= 0: 33 | print('Please specify a positive value for exported servable version number.') 34 | sys.exit(-1) 35 | if not FLAGS.checkpoint_path: 36 | print('Please specify the correct path where checkpoints are stored locally or in OSS.') 37 | sys.exit(-1) 38 | 39 | checkpoint_basename="model.ckpt" 40 | default_meta_graph_suffix='.meta' 41 | ckpt_path=os.path.join(FLAGS.checkpoint_path, checkpoint_basename + '-' + str(FLAGS.checkpoint_step)) 42 | meta_graph_file=ckpt_path + default_meta_graph_suffix 43 | with tf.Session() as new_sess: 44 | # with new_sess.graph.as_default(): 45 | # tf.reset_default_graph() 46 | # new_sess.run(tf.initialize_all_variables()) 47 | new_saver = tf.train.import_meta_graph(meta_graph_file, clear_devices=True) #'/test/mnistoutput/ckpt.meta') 48 | new_saver.restore(new_sess, ckpt_path) #'/test/mnistoutput/ckpt') 49 | new_graph = tf.get_default_graph() 50 | new_x = new_graph.get_tensor_by_name('input/x-input:0') 51 | print(new_x) 52 | new_y = new_graph.get_tensor_by_name('layer2/activation:0') 53 | print(new_y) 54 | 55 | # Export model 56 | # WARNING(break-tutorial-inline-code): The following code snippet is 57 | # in-lined in tutorials, please update tutorial documents accordingly 58 | # whenever code changes. 59 | export_path_base = sys.argv[-1] 60 | export_path = os.path.join( 61 | compat.as_bytes(export_path_base), 62 | compat.as_bytes(str(FLAGS.model_version))) 63 | print('Exporting trained model to', export_path) 64 | builder = saved_model_builder.SavedModelBuilder(export_path) 65 | 66 | # Build the signature_def_map.
67 | tensor_info_x = utils.build_tensor_info(new_x) 68 | tensor_info_y = utils.build_tensor_info(new_y) 69 | 70 | prediction_signature = signature_def_utils.build_signature_def( 71 | inputs={'images': tensor_info_x}, 72 | outputs={'scores': tensor_info_y}, 73 | method_name=signature_constants.PREDICT_METHOD_NAME) 74 | 75 | legacy_init_op = tf.group(tf.initialize_all_tables(), name='legacy_init_op') 76 | 77 | builder.add_meta_graph_and_variables( 78 | new_sess, [tag_constants.SERVING], 79 | signature_def_map={ 80 | 'predict_images': 81 | prediction_signature, 82 | }, 83 | legacy_init_op=legacy_init_op, 84 | clear_devices=True) 85 | builder.save() 86 | 87 | print('Done exporting!') 88 | 89 | if __name__ == '__main__': 90 | tf.app.run() -------------------------------------------------------------------------------- /tfjob/docker/mnist/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """A simple MNIST classifier which displays summaries in TensorBoard. 16 | This is an unimpressive MNIST model, but it is a good example of using 17 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of 18 | naming summary tags so that they are grouped meaningfully in TensorBoard. 19 | It demonstrates the functionality of every TensorBoard dashboard. 20 | """ 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import argparse 26 | import os 27 | import sys 28 | 29 | import tensorflow as tf 30 | 31 | from tensorflow.examples.tutorials.mnist import input_data 32 | 33 | FLAGS = None 34 | 35 | 36 | def train(): 37 | print("data dir: {0}".format(FLAGS.data_dir)) 38 | # Import data 39 | mnist = input_data.read_data_sets(FLAGS.data_dir, 40 | one_hot=True, 41 | fake_data=FLAGS.fake_data) 42 | 43 | # Create a multilayer model. 44 | 45 | # Input placeholders 46 | with tf.name_scope('input'): 47 | x = tf.placeholder(tf.float32, [None, 784], name='x-input') 48 | y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') 49 | 50 | with tf.name_scope('input_reshape'): 51 | image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) 52 | tf.summary.image('input', image_shaped_input, 10) 53 | 54 | # We can't initialize these variables to 0 - the network will get stuck. 
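# (With all-zero weights every unit in a layer would receive identical gradients and never differentiate; the truncated-normal initialization below breaks that symmetry.)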
55 | def weight_variable(shape): 56 | """Create a weight variable with appropriate initialization.""" 57 | initial = tf.truncated_normal(shape, stddev=0.1) 58 | return tf.Variable(initial) 59 | 60 | def bias_variable(shape): 61 | """Create a bias variable with appropriate initialization.""" 62 | initial = tf.constant(0.1, shape=shape) 63 | return tf.Variable(initial) 64 | 65 | def variable_summaries(var): 66 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 67 | with tf.name_scope('summaries'): 68 | mean = tf.reduce_mean(var) 69 | tf.summary.scalar('mean', mean) 70 | with tf.name_scope('stddev'): 71 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 72 | tf.summary.scalar('stddev', stddev) 73 | tf.summary.scalar('max', tf.reduce_max(var)) 74 | tf.summary.scalar('min', tf.reduce_min(var)) 75 | tf.summary.histogram('histogram', var) 76 | 77 | def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): 78 | """Reusable code for making a simple neural net layer. 79 | It does a matrix multiply, bias add, and then uses ReLU to nonlinearize. 80 | It also sets up name scoping so that the resultant graph is easy to read, 81 | and adds a number of summary ops. 82 | """ 83 | # Adding a name scope ensures logical grouping of the layers in the graph. 84 | with tf.name_scope(layer_name): 85 | # This Variable will hold the state of the weights for the layer 86 | with tf.name_scope('weights'): 87 | weights = weight_variable([input_dim, output_dim]) 88 | variable_summaries(weights) 89 | with tf.name_scope('biases'): 90 | biases = bias_variable([output_dim]) 91 | variable_summaries(biases) 92 | with tf.name_scope('Wx_plus_b'): 93 | preactivate = tf.matmul(input_tensor, weights) + biases 94 | tf.summary.histogram('pre_activations', preactivate) 95 | activations = act(preactivate, name='activation') 96 | tf.summary.histogram('activations', activations) 97 | return activations 98 | 99 | hidden1 = nn_layer(x, 784, 500, 'layer1') 100 | 101 | with tf.name_scope('dropout'): 102 | keep_prob = tf.placeholder_with_default(1.0, shape=()) 103 | tf.summary.scalar('dropout_keep_probability', keep_prob) 104 | dropped = tf.nn.dropout(hidden1, keep_prob) 105 | 106 | # Do not apply softmax activation yet, see below. 107 | y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) 108 | 109 | with tf.name_scope('cross_entropy'): 110 | # The raw formulation of cross-entropy, 111 | # 112 | # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), 113 | # reduction_indices=[1])) 114 | # 115 | # can be numerically unstable. 116 | # 117 | # So here we use tf.nn.softmax_cross_entropy_with_logits on the 118 | # raw outputs of the nn_layer above, and then average across 119 | # the batch. 
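    # The fused op is stable because it applies the log-sum-exp trick
    # internally:
    #     log(sum_j exp(y_j)) = m + log(sum_j exp(y_j - m)),  m = max_j y_j
    # so exp() cannot overflow and log() never receives zero.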
120 |     diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
121 |     with tf.name_scope('total'):
122 |       cross_entropy = tf.reduce_mean(diff)
123 |   tf.summary.scalar('cross_entropy', cross_entropy)
124 | 
125 |   with tf.name_scope('train'):
126 |     train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
127 |         cross_entropy)
128 | 
129 |   with tf.name_scope('accuracy'):
130 |     with tf.name_scope('correct_prediction'):
131 |       correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
132 |     with tf.name_scope('accuracy'):
133 |       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
134 |   tf.summary.scalar('accuracy', accuracy)
135 | 
136 |   # Merge all the summaries and write them out to
137 |   # /training_logs (the --log_dir default)
138 |   merged = tf.summary.merge_all()
139 | 
140 |   def feed_dict(train):
141 |     """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
142 |     if train or FLAGS.fake_data:
143 |       xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
144 |       k = FLAGS.dropout
145 |     else:
146 |       xs, ys = mnist.test.images, mnist.test.labels
147 |       k = 1.0
148 |     return {x: xs, y_: ys, keep_prob: k}
149 | 
150 |   sess = tf.InteractiveSession()
151 |   train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
152 |   test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
153 |   tf.global_variables_initializer().run()
154 |   # Train the model, and also write summaries.
155 |   # Every 10th step, measure test-set accuracy, and write test summaries
156 |   # All other steps, run train_step on training data, & add training summaries
157 |   saver = tf.train.Saver()
158 | 
159 |   for i in range(FLAGS.max_steps):
160 |     if i % 10 == 0:  # Record summaries and test-set accuracy
161 |       summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
162 |       test_writer.add_summary(summary, i)
163 |       print('Accuracy at step %s: %s' % (i, acc))
164 |       if i % 100 == 0:
165 |         print('Saving checkpoint at step %s (test accuracy %s)' % (i, acc))
166 |         saver.save(sess, FLAGS.log_dir + '/model.ckpt', global_step=i)
167 |     else:  # Record train set summaries, and train
168 |       if i % 100 == 99:  # Record execution stats
169 |         run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
170 |         run_metadata = tf.RunMetadata()
171 |         summary, _ = sess.run([merged, train_step],
172 |                               feed_dict=feed_dict(True),
173 |                               options=run_options,
174 |                               run_metadata=run_metadata)
175 |         train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
176 |         train_writer.add_summary(summary, i)
177 |         print('Adding run metadata for', i)
178 |       else:  # Record a summary
179 |         summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
180 |         train_writer.add_summary(summary, i)
181 | 
182 |   # Note: 'acc' holds the most recent test-set accuracy measured in the loop above.
183 |   print('Total Train-accuracy=%s' % (acc))
184 |   train_writer.close()
185 |   test_writer.close()
186 | 
187 | 
188 | def main(_):
189 |   train()
190 | 
191 | 
192 | if __name__ == '__main__':
193 |   parser = argparse.ArgumentParser()
194 |   parser.add_argument('--fake_data', nargs='?', const=True, type=bool,
195 |                       default=False,
196 |                       help='If true, uses fake data for unit testing.')
197 |   parser.add_argument('--max_steps', type=int, default=1000,
198 |                       help='Number of steps to run trainer.')
199 |   parser.add_argument('--learning_rate', type=float, default=0.001,
200 |                       help='Initial learning rate')
201 |   parser.add_argument('--dropout', type=float, default=0.9,
202 |                       help='Keep probability for training dropout.')
203 |   parser.add_argument(
204 |       '--data_dir',
205 | 
type=str, 206 | default=os.path.join(os.getenv('TEST_TMPDIR', '/train'), 207 | 'data'), 208 | help='Directory for storing input data') 209 | parser.add_argument( 210 | '--log_dir', 211 | type=str, 212 | default='/training_logs', 213 | help='Summaries log directory') 214 | FLAGS, unparsed = parser.parse_known_args() 215 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.5.0-gpu 2 | 3 | RUN mkdir -p /train/tensorflow/input_data && \ 4 | cd /train/tensorflow/input_data && \ 5 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-images-idx3-ubyte.gz && \ 6 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/t10k-labels-idx1-ubyte.gz && \ 7 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-images-idx3-ubyte.gz && \ 8 | curl -O http://kubeflow-oss.oss-cn-hangzhou.aliyuncs.com/tensorflow/input_data/train-labels-idx1-ubyte.gz 9 | 10 | COPY main.py /app/main.py 11 | 12 | ENTRYPOINT ["python", "/app/main.py"] -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/Makefile: -------------------------------------------------------------------------------- 1 | DOCKER ?= docker 2 | 3 | .NOTPARALLEL: 4 | .PHONY: all 5 | 6 | all: cpu gpu 7 | 8 | cpu: 9 | $(DOCKER) build --no-cache -f Dockerfile.cpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:cpu . 10 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:cpu 11 | gpu: 12 | $(DOCKER) build --no-cache -f Dockerfile.gpu -t registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:gpu . 13 | $(DOCKER) push registry.cn-hangzhou.aliyuncs.com/tensorflow-samples/tf-mnist-distributed-v1alpha2:gpu 14 | -------------------------------------------------------------------------------- /tfjob/docker/v1alpha2/distributed-mnist/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an 'AS IS' BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """A simple MNIST classifier which displays summaries in TensorBoard.
16 | This is an unimpressive MNIST model, but it is a good example of using
17 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
18 | naming summary tags so that they are grouped meaningfully in TensorBoard.
19 | It demonstrates the functionality of every TensorBoard dashboard.
20 | """
21 | from __future__ import absolute_import
22 | from __future__ import division
23 | from __future__ import print_function
24 | 
25 | import argparse
26 | import os
27 | import sys
28 | import json
29 | 
30 | 
31 | import tensorflow as tf
32 | 
33 | from tensorflow.examples.tutorials.mnist import input_data
34 | 
35 | FLAGS = None
36 | 
37 | def train():
38 |   tf_config_json = os.environ.get("TF_CONFIG", "{}")
39 |   tf_config = json.loads(tf_config_json)
40 | 
41 |   task = tf_config.get("task", {})
42 |   cluster_spec = tf_config.get("cluster", {})
43 |   cluster_spec_object = tf.train.ClusterSpec(cluster_spec)
44 |   job_name = task["type"]
45 |   task_id = task["index"]
46 |   server_def = tf.train.ServerDef(
47 |       cluster=cluster_spec_object.as_cluster_def(),
48 |       protocol="grpc",
49 |       job_name=job_name,
50 |       task_index=task_id)
51 |   server = tf.train.Server(server_def)
52 | 
53 |   is_chief = (job_name == 'worker') and (task_id == 0)
54 |   if job_name == 'ps':
55 |     server.join()
56 | 
57 |   if is_chief:
58 |     print("Worker %d: Initializing session..." % task_id)
59 |     tf.reset_default_graph()
60 |   else:
61 |     print("Worker %d: Waiting for session to be initialized..." % task_id)
62 | 
63 | 
64 |   # Import data
65 |   mnist = input_data.read_data_sets(FLAGS.data_dir,
66 |                                     one_hot=True,
67 |                                     fake_data=FLAGS.fake_data)
68 | 
69 | 
70 |   # Create a multilayer model.
71 | 
72 | 
73 |   # Between-graph replication
74 |   with tf.device(tf.train.replica_device_setter(
75 |       worker_device="/job:{0}/task:{1}".format(job_name, task_id),
76 |       cluster=cluster_spec)):
77 |     # replica_device_setter pins each tf.Variable created in this scope to a
78 |     # parameter-server ('ps') task and leaves the compute ops on this worker;
79 |     # every worker builds its own copy of the graph (between-graph replication).
80 | 
81 |     # count the number of updates
82 |     global_step = tf.get_variable(
83 |         'global_step',
84 |         [],
85 |         initializer=tf.constant_initializer(0),
86 |         trainable=False)
87 | 
88 |     # Input placeholders
89 |     with tf.name_scope('input'):
90 |       x = tf.placeholder(tf.float32, [None, 784], name='x-input')
91 |       y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
92 | 
93 |     with tf.name_scope('input_reshape'):
94 |       image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
95 |       tf.summary.image('input', image_shaped_input, 10)
96 | 
97 |     # We can't initialize these variables to 0 - the network will get stuck.
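    # The helper functions below build the same two-layer network as the
    # single-node sample (784 inputs -> 500 ReLU units -> dropout -> 10
    # linear outputs); only the variable placement differs in this
    # distributed version.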
98 |     def weight_variable(shape):
99 |       """Create a weight variable with appropriate initialization."""
100 |       initial = tf.truncated_normal(shape, stddev=0.1)
101 |       return tf.Variable(initial)
102 | 
103 |     def bias_variable(shape):
104 |       """Create a bias variable with appropriate initialization."""
105 |       initial = tf.constant(0.1, shape=shape)
106 |       return tf.Variable(initial)
107 | 
108 |     def variable_summaries(var):
109 |       """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
110 |       with tf.name_scope('summaries'):
111 |         mean = tf.reduce_mean(var)
112 |         tf.summary.scalar('mean', mean)
113 |         with tf.name_scope('stddev'):
114 |           stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
115 |         tf.summary.scalar('stddev', stddev)
116 |         tf.summary.scalar('max', tf.reduce_max(var))
117 |         tf.summary.scalar('min', tf.reduce_min(var))
118 |         tf.summary.histogram('histogram', var)
119 | 
120 |     def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
121 |       """Reusable code for making a simple neural net layer.
122 |       It does a matrix multiply, bias add, and then uses ReLU to nonlinearize.
123 |       It also sets up name scoping so that the resultant graph is easy to read,
124 |       and adds a number of summary ops.
125 |       """
126 |       # Adding a name scope ensures logical grouping of the layers in the graph.
127 |       with tf.name_scope(layer_name):
128 |         # This Variable will hold the state of the weights for the layer
129 |         with tf.name_scope('weights'):
130 |           weights = weight_variable([input_dim, output_dim])
131 |           variable_summaries(weights)
132 |         with tf.name_scope('biases'):
133 |           biases = bias_variable([output_dim])
134 |           variable_summaries(biases)
135 |         with tf.name_scope('Wx_plus_b'):
136 |           preactivate = tf.matmul(input_tensor, weights) + biases
137 |           tf.summary.histogram('pre_activations', preactivate)
138 |         activations = act(preactivate, name='activation')
139 |         tf.summary.histogram('activations', activations)
140 |         return activations
141 | 
142 |     hidden1 = nn_layer(x, 784, 500, 'layer1')
143 | 
144 |     with tf.name_scope('dropout'):
145 |       keep_prob = tf.placeholder_with_default(1.0, shape=())
146 |       tf.summary.scalar('dropout_keep_probability', keep_prob)
147 |       dropped = tf.nn.dropout(hidden1, keep_prob)
148 | 
149 |     # Do not apply softmax activation yet, see below.
150 |     y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)
151 | 
152 |     with tf.name_scope('cross_entropy'):
153 |       # The raw formulation of cross-entropy,
154 |       #
155 |       # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
156 |       #                               reduction_indices=[1]))
157 |       #
158 |       # can be numerically unstable, because tf.log can receive zero.
159 |       #
160 |       # Unlike the single-node sample, which uses the fused
161 |       # tf.nn.softmax_cross_entropy_with_logits op, this version computes
162 |       # the softmax explicitly and clips it away from zero before the log.
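      # Without the clip, a confident wrong prediction can drive a softmax
      # output to exactly 0.0 in float32, and y_ * tf.log(0.0) yields NaNs;
      # clipping to [1e-10, 1.0] bounds tf.log at roughly -23. An equivalent
      # formulation with the fused op, which needs no clipping, would be:
      #
      #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
      #   cross_entropy = tf.reduce_sum(diff)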
163 |       # Note that, unlike the single-node sample, the loss below is summed
164 |       # over the batch rather than averaged, so its magnitude scales with
165 |       # the batch size (100 in feed_dict below).
166 |       probabilities = tf.nn.softmax(y, name='probabilities')
167 |       cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(probabilities, 1e-10, 1.0)), name='cross_entropy')
168 |     tf.summary.scalar('cross_entropy', cross_entropy)
169 | 
170 |     with tf.name_scope('train'):
171 |       train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
172 |           cross_entropy, global_step=global_step)
173 | 
174 |     with tf.name_scope('accuracy'):
175 |       with tf.name_scope('correct_prediction'):
176 |         correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
177 |       with tf.name_scope('accuracy'):
178 |         accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
179 |     tf.summary.scalar('accuracy', accuracy)
180 | 
181 |     # Merge all the summaries and write them out to
182 |     # /training_logs (the --log_dir default)
183 |     merged = tf.summary.merge_all()
184 | 
185 |     init_op = tf.global_variables_initializer()
186 | 
187 |   def feed_dict(train):
188 |     """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
189 |     if train or FLAGS.fake_data:
190 |       xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
191 |       k = FLAGS.dropout
192 |     else:
193 |       xs, ys = mnist.test.images, mnist.test.labels
194 |       k = 1.0
195 |     return {x: xs, y_: ys, keep_prob: k}
196 | 
197 | 
198 | 
199 |   sv = tf.train.Supervisor(is_chief=is_chief,
200 |                            global_step=global_step,
201 |                            init_op=init_op,
202 |                            logdir=FLAGS.log_dir)
203 |   # A tf.ConfigProto with device_filters restricted to this worker and the
204 |   # 'ps' tasks could additionally be passed to prepare_or_wait_for_session.
205 | 
206 |   with sv.prepare_or_wait_for_session(server.target) as sess:
207 |     train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
208 |     test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
209 |     # Train the model, and also write summaries.
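    # prepare_or_wait_for_session makes the chief run init_op and manage
    # checkpoints in FLAGS.log_dir, while non-chief workers block here until
    # the shared variables have been initialized or restored.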
210 | # Every 10th step, measure test-set accuracy, and write test summaries 211 | # All other steps, run train_step on training data, & add training summaries 212 | 213 | for i in range(FLAGS.max_steps): 214 | if i % 10 == 0: # Record summaries and test-set accuracy 215 | summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) 216 | test_writer.add_summary(summary, i) 217 | print('Accuracy at step %s: %s' % (i, acc)) 218 | else: # Record train set summaries, and train 219 | if i % 100 == 99: # Record execution stats 220 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 221 | run_metadata = tf.RunMetadata() 222 | summary, _ = sess.run([merged, train_step], 223 | feed_dict=feed_dict(True), 224 | options=run_options, 225 | run_metadata=run_metadata) 226 | train_writer.add_run_metadata(run_metadata, 'step%03d' % i) 227 | train_writer.add_summary(summary, i) 228 | print('Adding run metadata for', i) 229 | else: # Record a summary 230 | summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) 231 | train_writer.add_summary(summary, i) 232 | train_writer.close() 233 | test_writer.close() 234 | 235 | 236 | def main(_): 237 | train() 238 | 239 | 240 | if __name__ == '__main__': 241 | parser = argparse.ArgumentParser() 242 | parser.add_argument('--fake_data', nargs='?', const=True, type=bool, 243 | default=False, 244 | help='If true, uses fake data for unit testing.') 245 | parser.add_argument('--max_steps', type=int, default=1000, 246 | help='Number of steps to run trainer.') 247 | parser.add_argument('--learning_rate', type=float, default=0.001, 248 | help='Initial learning rate') 249 | parser.add_argument('--dropout', type=float, default=0.9, 250 | help='Keep probability for training dropout.') 251 | parser.add_argument( 252 | '--data_dir', 253 | type=str, 254 | default=os.path.join(os.getenv('TEST_TMPDIR', '/train'), 255 | 'data'), 256 | help='Directory for storing input data') 257 | parser.add_argument( 258 | '--log_dir', 259 | type=str, 260 | default='/training_logs', 261 | help='Summaries log directory') 262 | FLAGS, unparsed = parser.parse_known_args() 263 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 264 | --------------------------------------------------------------------------------