├── .DS_Store ├── .gitignore ├── Course_5_emotion_vgg_finetune ├── FVGG_Emo.py ├── test_fvgg_emo.txt ├── train_fvgg_emo.txt └── training_instruction.txt ├── README.md ├── WORKSPACE ├── course_10_dqn.md ├── course_1_tf_basic_operation.py ├── course_1_tf_lr.ipynb ├── course_1_tf_lr.py ├── course_2_tf_nn.ipynb ├── course_2_tf_nn.py ├── course_3_tf_mnist_cnn.ipynb ├── course_3_tf_mnist_cnn.py ├── course_6_obj_detection.txt ├── course_7_lstm_learn_shakespeare.ipynb ├── course_7_seq2seq.py ├── course_7_shakespeare_gen.py ├── course_8_image2txt ├── BUILD ├── configuration.py ├── data │ ├── build_mscoco_data.py │ └── download_and_preprocess_mscoco.sh ├── evaluate.py ├── inference_utils │ ├── BUILD │ ├── caption_generator.py │ ├── caption_generator_test.py │ ├── inference_wrapper_base.py │ └── vocabulary.py ├── inference_wrapper.py ├── ops │ ├── BUILD │ ├── image_embedding.py │ ├── image_embedding_test.py │ ├── image_processing.py │ └── inputs.py ├── readme.md ├── run_inference.py ├── show_and_tell_model.py ├── show_and_tell_model_test.py └── train.py ├── course_9_pix2pix_file.md ├── course_example_vgg ├── .gitignore ├── course_4_vgg16_test.py ├── couse_4_vgg16_test.ipynb ├── imagenet1000_clsid_to_human.py ├── imagenet_class_list.txt ├── np_plot.py ├── np_plot.pyc ├── test_data │ ├── dog.png │ ├── puzzle.jpeg │ └── tiger.jpeg ├── utils.py ├── utils.pyc ├── vgg-model-download-link ├── vgg-model-download-link.txt ├── vgg16.py └── vgg16_test.py ├── example_autoencoder_recon.ipynb ├── g3doc ├── COCO_val2014_000000224477.jpg ├── example_captions.jpg └── show_and_tell_architecture.png ├── index.html ├── libs ├── __init__.py ├── activations.py ├── batch_norm.py ├── connections.py ├── dataset_utils.py ├── datasets.py └── utils.py ├── mnist ├── mnist.pkl.gz ├── t10k-images-idx3-ubyte.gz ├── t10k-labels-idx1-ubyte.gz ├── train-images-idx3-ubyte.gz └── train-labels-idx1-ubyte.gz └── tf_1_try.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bazel-bin 2 | /bazel-ci_build-cache 3 | /bazel-genfiles 4 | /bazel-out 5 | /bazel-im2txt 6 | /bazel-testlogs 7 | /bazel-tf 8 | -------------------------------------------------------------------------------- /Course_5_emotion_vgg_finetune/FVGG_Emo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Retraining (Finetuning) Example with vgg.tflearn. Using weights from VGG model to retrain 3 | network for a new task (your own dataset).All weights are restored except 4 | last layer (softmax) that will be retrained to match the new task (finetuning). 
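Concretely, in the network defined below the convolutional blocks conv1-conv4 are frozen with trainable=False, conv5 and fc6 keep training from the pretrained weights, and fc7/fc8 are re-initialised (restore=False) because their shapes no longer match the original VGG16.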
5 | 
6 | DATASET for this experiment can be obtained at: https://pan.baidu.com/s/1kUEnhP1
7 | 
8 | Edited by Wei Li for VGG fine-tuning
9 | 
10 | '''
11 | import tflearn
12 | from tflearn.data_preprocessing import ImagePreprocessing
13 | import os
14 | 
15 | 
16 | def vgg16(input, num_class):
17 | 
18 |     # trainable=False keeps the pretrained weights of these layers frozen during training
19 |     x = tflearn.conv_2d(input, 64, 3, activation='relu', scope='conv1_1',trainable=False)
20 |     x = tflearn.conv_2d(x, 64, 3, activation='relu', scope='conv1_2',trainable=False)
21 |     x = tflearn.max_pool_2d(x, 2, strides=2, name='maxpool1')
22 | 
23 |     x = tflearn.conv_2d(x, 128, 3, activation='relu', scope='conv2_1',trainable=False)
24 |     x = tflearn.conv_2d(x, 128, 3, activation='relu', scope='conv2_2',trainable=False)
25 |     x = tflearn.max_pool_2d(x, 2, strides=2, name='maxpool2')
26 | 
27 |     x = tflearn.conv_2d(x, 256, 3, activation='relu', scope='conv3_1',trainable=False)
28 |     x = tflearn.conv_2d(x, 256, 3, activation='relu', scope='conv3_2',trainable=False)
29 |     x = tflearn.conv_2d(x, 256, 3, activation='relu', scope='conv3_3',trainable=False)
30 |     x = tflearn.max_pool_2d(x, 2, strides=2, name='maxpool3')
31 | 
32 |     x = tflearn.conv_2d(x, 512, 3, activation='relu', scope='conv4_1',trainable=False)
33 |     x = tflearn.conv_2d(x, 512, 3, activation='relu', scope='conv4_2',trainable=False)
34 |     x = tflearn.conv_2d(x, 512, 3, activation='relu', scope='conv4_3',trainable=False)
35 |     x = tflearn.max_pool_2d(x, 2, strides=2, name='maxpool4')
36 | 
37 |     x = tflearn.conv_2d(x, 512, 3, activation='relu', scope='conv5_1')
38 |     x = tflearn.conv_2d(x, 512, 3, activation='relu', scope='conv5_2')
39 |     x = tflearn.conv_2d(x, 512, 3, activation='relu', scope='conv5_3')
40 |     x = tflearn.max_pool_2d(x, 2, strides=2, name='maxpool5')
41 | 
42 |     x = tflearn.fully_connected(x, 4096, activation='relu', scope='fc6')
43 |     x = tflearn.dropout(x, 0.5, name='dropout1')
44 |     # fc7 is reduced to 2048 units: fewer parameters, which is enough for this task
45 |     x = tflearn.fully_connected(x, 2048, activation='relu', scope='fc7',restore=False)
46 |     x = tflearn.dropout(x, 0.5, name='dropout2')
47 | 
48 |     x = tflearn.fully_connected(x, num_class, activation='softmax', scope='fc8',
49 |                                 restore=False)
50 | 
51 |     return x
52 | 
53 | 
54 | # data_dir = "webemo_tr/"
55 | model_path = "."
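# A note on the list file loaded below (illustrative; it assumes the standard
# tflearn image_preloader 'file' mode): each line of train_fvgg_emo.txt holds an
# image path and an integer class id separated by a space, for example
#     webemo_tr/angry/img_0001.jpg 0
#     webemo_tr/happy/img_0002.jpg 3
# The paths above are made up; gen_files_list.py (not included in this listing)
# is assumed to walk the dataset folder and emit lines in this format.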
56 | # the training list file, generated by gen_files_list.py
57 | files_list = "./train_fvgg_emo.txt"
58 | 
59 | from tflearn.data_utils import image_preloader
60 | 
61 | X, Y = image_preloader(files_list, image_shape=(224, 224), mode='file',
62 |                        categorical_labels=True, normalize=False,
63 |                        files_extension=['.jpg', '.png'], filter_channel=True)
64 | # or use mode='folder'
65 | # X, Y = image_preloader(data_dir, image_shape=(224, 224), mode='folder',
66 | #                        categorical_labels=True, normalize=True,
67 | #                        files_extension=['.jpg', '.png'], filter_channel=True)
68 | # print X.shape
69 | num_classes = 7  # number of classes in your dataset
70 | 
71 | # VGG preprocessing
72 | img_prep = ImagePreprocessing()
73 | img_prep.add_featurewise_zero_center(mean=[123.68, 116.779, 103.939],
74 |                                      per_channel=True)
75 | # VGG Network
76 | x = tflearn.input_data(shape=[None, 224, 224, 3], name='input',
77 |                        data_preprocessing=img_prep)
78 | softmax = vgg16(x, num_classes)
79 | regression = tflearn.regression(softmax, optimizer='adam',
80 |                                 loss='categorical_crossentropy',
81 |                                 learning_rate=0.0001, restore=False)
82 | 
83 | model = tflearn.DNN(regression, checkpoint_path='vgg-finetuning',
84 |                     max_checkpoints=3, tensorboard_verbose=2,
85 |                     tensorboard_dir="./logs")
86 | 
87 | model_file = os.path.join(model_path, "vgg16.tflearn")
88 | model.load(model_file, weights_only=True)
89 | 
90 | # Start fine-tuning
91 | model.fit(X, Y, n_epoch=20, validation_set=0.1, shuffle=True,
92 |           show_metric=True, batch_size=64, snapshot_epoch=False,
93 |           snapshot_step=200, run_id='vgg-finetuning')
94 | 
95 | model.save('ChinaHadoop_vgg_finetune_emo.tfmodel')
96 | ## to check that the trained model predicts sensibly, call
97 | # model.predict(img_array) and inspect the class probabilities
98 | 
99 | 
--------------------------------------------------------------------------------
/Course_5_emotion_vgg_finetune/training_instruction.txt:
--------------------------------------------------------------------------------
1 | 
2 | 0. install tflearn: pip install git+https://github.com/tflearn/tflearn.git
3 | 1. download the training data and unzip it in this folder; the dataset is at https://drive.google.com/open?id=0B3ANX1iL124qbmxOc2cyQzhvUFE
4 |    if you cannot access Google, you can download the dataset from the Baidu cloud drive: https://pan.baidu.com/s/1kUEnhP1
5 | 2. download the pretrained tflearn VGG model from https://www.dropbox.com/s/9li9mi4105jf45v/vgg16.tflearn?dl=0
6 | 3. go through the code and make sure all the paths are correct
7 | 
8 | run and wait...
9 | 
10 | 
11 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | This is the code collection for the online deep learning course at ChinaHadoop.cn.
3 | 
4 | The code has been tested in an Ubuntu + CUDA + cuDNN environment; if you have problems, you can
5 | 
6 | contact the lecturer Wei Li via Weibo or the course QQ group.
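A quick way to verify the Ubuntu + CUDA + cuDNN setup before running the course scripts (a minimal sketch, assuming the TensorFlow 1.x API used throughout this repository):

    import tensorflow as tf
    from tensorflow.python.client import device_lib

    # a working CUDA/cuDNN install shows a GPU entry in this device list
    print(device_lib.list_local_devices())
    # True if this TensorFlow build was compiled with CUDA support
    print(tf.test.is_built_with_cuda())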
7 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | workspace(name = "im2txt") 2 | -------------------------------------------------------------------------------- /course_10_dqn.md: -------------------------------------------------------------------------------- 1 | https://github.com/wiibrew/dqn 2 | -------------------------------------------------------------------------------- /course_1_tf_basic_operation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | basic tf operation examples, 3 | 1. write a tf function use tf.xxxx 4 | 2. feed data to tf.placeholder and set data to tf.Variable 5 | 3.run... 6 | ''' 7 | 8 | # 9 | import tensorflow as tf 10 | 11 | # direct sum with constand value 12 | a = tf.constant(2) 13 | b = tf.constant(3) 14 | c=a+b 15 | d=a*b 16 | 17 | sess=tf.Session() 18 | print sess.run(c) 19 | print sess.run(d) 20 | 21 | # 22 | a = tf.placeholder(tf.int16) 23 | b = tf.placeholder(tf.int16) 24 | 25 | # 26 | add = tf.add(a, b) 27 | mul = tf.multiply(a, b) 28 | print sess.run(add, feed_dict={a: 2, b: 3}) 29 | print sess.run(mul, feed_dict={a: 2, b: 3}) 30 | 31 | 32 | 33 | # 34 | matrix1 = tf.constant([[3., 3.]]) 35 | matrix2 = tf.constant([[2.],[2.]]) 36 | product = tf.matmul(matrix2, matrix1) 37 | print sess.run(product) 38 | 39 | #here you should also be able to use tf.placeholder 40 | mat1=tf.Variable(tf.random_normal([3,2])) 41 | mat2=tf.Variable(tf.random_normal([2,3])) 42 | product=tf.matmul(mat1,mat2) 43 | 44 | m1=[[1,3],[2,1],[0,5]] 45 | m2=[[3,2,1],[1,2,3]] 46 | 47 | print sess.run(product,feed_dict={mat1:m1,mat2:m2}) -------------------------------------------------------------------------------- /course_1_tf_lr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "'''\n", 12 | "linear regression experiment, hope you can know:\n", 13 | "1. how to design the learning model\n", 14 | "2. optimize the model\n", 15 | "3. 
dealing with the dataset\n", 16 | "\n", 17 | "Original Author: Aymeric Damien\n", 18 | "Edited by Wei Li for ChinaHadoop Deep learning course\n", 19 | "Project: https://github.com/aymericdamien/TensorFlow-Examples/\n", 20 | "'''\n", 21 | "\n", 22 | "\n", 23 | "import tensorflow as tf\n", 24 | "import numpy\n", 25 | "rng = numpy.random\n", 26 | "\n", 27 | "# model params\n", 28 | "learning_rate = 0.02\n", 29 | "training_epochs = 3000\n", 30 | "display_step=50\n", 31 | "# \n", 32 | "train_X = numpy.asarray([3.3,4.4,5.5,6.71,6.93,4.168,9.779,6.182,7.59,2.167,\n", 33 | " 7.042,10.791,5.313,7.997,5.654,9.27,3.1])\n", 34 | "train_Y = numpy.asarray([1.7,2.76,2.09,3.19,1.694,1.573,3.366,2.596,2.53,1.221,\n", 35 | " 2.827,3.465,1.65,2.904,2.42,2.94,1.3])\n", 36 | "n_samples = train_X.shape[0]\n", 37 | "\n", 38 | "# tf Graph Input\n", 39 | "X = tf.placeholder(\"float\")\n", 40 | "Y = tf.placeholder(\"float\")\n", 41 | "\n", 42 | "# Set model weights\n", 43 | "W = tf.Variable(rng.randn(), name=\"weight\")\n", 44 | "b = tf.Variable(rng.randn(), name=\"bias\")\n", 45 | "\n", 46 | "# Construct a linear model\n", 47 | "pred = tf.add(tf.multiply(X, W), b)\n", 48 | "\n", 49 | "# Mean squared error\n", 50 | "cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples)\n", 51 | "# Gradient descent\n", 52 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)\n", 53 | "\n", 54 | "# Initializing the variables\n", 55 | "init = tf.global_variables_initializer()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 9, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "('Epoch:', '0050', 'cost=', '0.178423569', 'W=', 0.42291793, 'b=', -0.4734658)\n", 70 | "('Epoch:', '0100', 'cost=', '0.156202286', 'W=', 0.40251526, 'b=', -0.32475927)\n", 71 | "('Epoch:', '0150', 'cost=', '0.138855815', 'W=', 0.38448787, 'b=', -0.19336548)\n", 72 | "('Epoch:', '0200', 'cost=', '0.125314981', 'W=', 0.36855927, 'b=', -0.077268951)\n", 73 | "('Epoch:', '0250', 'cost=', '0.114744954', 'W=', 0.35448512, 'b=', 0.025311502)\n", 74 | "('Epoch:', '0300', 'cost=', '0.106494129', 'W=', 0.34204948, 'b=', 0.11594942)\n", 75 | "('Epoch:', '0350', 'cost=', '0.100053802', 'W=', 0.3310616, 'b=', 0.19603507)\n", 76 | "('Epoch:', '0400', 'cost=', '0.095026731', 'W=', 0.32135299, 'b=', 0.26679745)\n", 77 | "('Epoch:', '0450', 'cost=', '0.091103002', 'W=', 0.31277463, 'b=', 0.3293213)\n", 78 | "('Epoch:', '0500', 'cost=', '0.088040523', 'W=', 0.30519509, 'b=', 0.38456526)\n", 79 | "('Epoch:', '0550', 'cost=', '0.085650302', 'W=', 0.29849792, 'b=', 0.43337804)\n", 80 | "('Epoch:', '0600', 'cost=', '0.083784848', 'W=', 0.29258049, 'b=', 0.47650799)\n", 81 | "('Epoch:', '0650', 'cost=', '0.082329050', 'W=', 0.28735185, 'b=', 0.51461679)\n", 82 | "('Epoch:', '0700', 'cost=', '0.081192940', 'W=', 0.28273201, 'b=', 0.54828918)\n", 83 | "('Epoch:', '0750', 'cost=', '0.080306433', 'W=', 0.27865005, 'b=', 0.57804072)\n", 84 | "('Epoch:', '0800', 'cost=', '0.079614699', 'W=', 0.27504328, 'b=', 0.60432887)\n", 85 | "('Epoch:', '0850', 'cost=', '0.079074971', 'W=', 0.27185646, 'b=', 0.6275565)\n", 86 | "('Epoch:', '0900', 'cost=', '0.078653932', 'W=', 0.26904064, 'b=', 0.64807951)\n", 87 | "('Epoch:', '0950', 'cost=', '0.078325450', 'W=', 0.26655263, 'b=', 0.66621393)\n", 88 | "('Epoch:', '1000', 'cost=', '0.078069247', 'W=', 0.26435426, 'b=', 0.68223649)\n", 89 | "('Epoch:', '1050', 'cost=', '0.077869445', 'W=', 0.26241186, 'b=', 
0.69639373)\n", 90 | "('Epoch:', '1100', 'cost=', '0.077713616', 'W=', 0.26069549, 'b=', 0.70890343)\n", 91 | "('Epoch:', '1150', 'cost=', '0.077592134', 'W=', 0.25917912, 'b=', 0.71995574)\n", 92 | "('Epoch:', '1200', 'cost=', '0.077497423', 'W=', 0.2578392, 'b=', 0.72972184)\n", 93 | "('Epoch:', '1250', 'cost=', '0.077423617', 'W=', 0.25665545, 'b=', 0.73834991)\n", 94 | "('Epoch:', '1300', 'cost=', '0.077366099', 'W=', 0.2556093, 'b=', 0.74597466)\n", 95 | "('Epoch:', '1350', 'cost=', '0.077321291', 'W=', 0.25468507, 'b=', 0.75271124)\n", 96 | "('Epoch:', '1400', 'cost=', '0.077286400', 'W=', 0.25386831, 'b=', 0.75866407)\n", 97 | "('Epoch:', '1450', 'cost=', '0.077259235', 'W=', 0.25314665, 'b=', 0.76392406)\n", 98 | "('Epoch:', '1500', 'cost=', '0.077238098', 'W=', 0.252509, 'b=', 0.76857102)\n", 99 | "('Epoch:', '1550', 'cost=', '0.077221632', 'W=', 0.25194564, 'b=', 0.77267736)\n", 100 | "('Epoch:', '1600', 'cost=', '0.077208854', 'W=', 0.25144795, 'b=', 0.77630514)\n", 101 | "('Epoch:', '1650', 'cost=', '0.077198923', 'W=', 0.25100803, 'b=', 0.77951139)\n", 102 | "('Epoch:', '1700', 'cost=', '0.077191189', 'W=', 0.25061971, 'b=', 0.78234196)\n", 103 | "('Epoch:', '1750', 'cost=', '0.077185199', 'W=', 0.25027612, 'b=', 0.78484607)\n", 104 | "('Epoch:', '1800', 'cost=', '0.077180564', 'W=', 0.24997255, 'b=', 0.78705853)\n", 105 | "('Epoch:', '1850', 'cost=', '0.077176966', 'W=', 0.2497045, 'b=', 0.78901207)\n", 106 | "('Epoch:', '1900', 'cost=', '0.077174187', 'W=', 0.24946776, 'b=', 0.79073763)\n", 107 | "('Epoch:', '1950', 'cost=', '0.077172041', 'W=', 0.24925858, 'b=', 0.79226238)\n", 108 | "('Epoch:', '2000', 'cost=', '0.077170387', 'W=', 0.24907368, 'b=', 0.7936098)\n", 109 | "('Epoch:', '2050', 'cost=', '0.077169113', 'W=', 0.24891038, 'b=', 0.79480028)\n", 110 | "('Epoch:', '2100', 'cost=', '0.077168114', 'W=', 0.24876596, 'b=', 0.79585338)\n", 111 | "('Epoch:', '2150', 'cost=', '0.077167362', 'W=', 0.24863829, 'b=', 0.79678357)\n", 112 | "('Epoch:', '2200', 'cost=', '0.077166796', 'W=', 0.24852541, 'b=', 0.79760629)\n", 113 | "('Epoch:', '2250', 'cost=', '0.077166334', 'W=', 0.24842578, 'b=', 0.79833227)\n", 114 | "('Epoch:', '2300', 'cost=', '0.077165999', 'W=', 0.2483376, 'b=', 0.79897529)\n", 115 | "('Epoch:', '2350', 'cost=', '0.077165760', 'W=', 0.24825987, 'b=', 0.79954147)\n", 116 | "('Epoch:', '2400', 'cost=', '0.077165581', 'W=', 0.24819092, 'b=', 0.80004394)\n", 117 | "('Epoch:', '2450', 'cost=', '0.077165432', 'W=', 0.24813022, 'b=', 0.80048668)\n", 118 | "('Epoch:', '2500', 'cost=', '0.077165321', 'W=', 0.24807698, 'b=', 0.80087441)\n", 119 | "('Epoch:', '2550', 'cost=', '0.077165253', 'W=', 0.24802969, 'b=', 0.80121905)\n", 120 | "('Epoch:', '2600', 'cost=', '0.077165186', 'W=', 0.24798796, 'b=', 0.80152339)\n", 121 | "('Epoch:', '2650', 'cost=', '0.077165157', 'W=', 0.2479513, 'b=', 0.8017906)\n", 122 | "('Epoch:', '2700', 'cost=', '0.077165119', 'W=', 0.24791868, 'b=', 0.80202842)\n", 123 | "('Epoch:', '2750', 'cost=', '0.077165097', 'W=', 0.24789007, 'b=', 0.80223686)\n", 124 | "('Epoch:', '2800', 'cost=', '0.077165097', 'W=', 0.24786451, 'b=', 0.80242288)\n", 125 | "('Epoch:', '2850', 'cost=', '0.077165082', 'W=', 0.24784194, 'b=', 0.80258781)\n", 126 | "('Epoch:', '2900', 'cost=', '0.077165082', 'W=', 0.24782193, 'b=', 0.80273348)\n", 127 | "('Epoch:', '2950', 'cost=', '0.077165082', 'W=', 0.24780463, 'b=', 0.80285954)\n", 128 | "('Epoch:', '3000', 'cost=', '0.077165082', 'W=', 0.24778947, 'b=', 0.80296975)\n", 129 | "('Training cost=', 0.077165082, 'W=', 
0.24778947, 'b=', 0.80296975, '\\n')\n", 130 | "Tssting...\n", 131 | "('Test LOSS=', 0.079976395)\n", 132 | "('Final Loss:', 0.0028113127)\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "\n", 138 | "# Launch the graph\n", 139 | "with tf.Session() as sess:\n", 140 | " sess.run(init)\n", 141 | "\n", 142 | " # Fit all training data\n", 143 | " for epoch in range(training_epochs):\n", 144 | " for (x, y) in zip(train_X, train_Y):\n", 145 | " sess.run(optimizer, feed_dict={X: x, Y: y})\n", 146 | "\n", 147 | " # Display logs per epoch step\n", 148 | " if (epoch+1) % display_step == 0:\n", 149 | " c = sess.run(cost, feed_dict={X: train_X, Y:train_Y})\n", 150 | " print(\"Epoch:\", '%04d' % (epoch+1), \"cost=\", \"{:.9f}\".format(c), \\\n", 151 | " \"W=\", sess.run(W), \"b=\", sess.run(b))\n", 152 | "\n", 153 | "\n", 154 | " training_cost = sess.run(cost, feed_dict={X: train_X, Y: train_Y})\n", 155 | " print(\"Training cost=\", training_cost, \"W=\", sess.run(W), \"b=\", sess.run(b), '\\n')\n", 156 | "\n", 157 | " \n", 158 | "\n", 159 | " # the testing data\n", 160 | " test_X = numpy.asarray([6.83, 4.668, 8.9, 7.91, 5.7, 8.7, 3.1, 2.1])\n", 161 | " test_Y = numpy.asarray([1.84, 2.273, 3.2, 2.831, 2.92, 3.24, 1.35, 1.03])\n", 162 | "\n", 163 | " print(\"Tssting...\")\n", 164 | " testing_cost = sess.run(\n", 165 | " tf.reduce_sum(tf.pow(pred - Y, 2)) / (2 * test_X.shape[0]),\n", 166 | " feed_dict={X: test_X, Y: test_Y}) # same function as cost above\n", 167 | " print(\"Test LOSS=\", testing_cost)\n", 168 | " print(\"Final Loss:\", abs(\n", 169 | " training_cost - testing_cost))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 2", 185 | "language": "python", 186 | "name": "python2" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 2 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython2", 198 | "version": "2.7.12" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 2 203 | } 204 | -------------------------------------------------------------------------------- /course_1_tf_lr.py: -------------------------------------------------------------------------------- 1 | ''' 2 | linear regression experiment, hope you can know: 3 | 1. how to design the learning model 4 | 2. optimize the model 5 | 3. 
dealing with the dataset 6 | 7 | Original Author: Aymeric Damien 8 | Edited by Wei Li for ChinaHadoop Deep learning course 9 | Project: https://github.com/aymericdamien/TensorFlow-Examples/ 10 | ''' 11 | 12 | 13 | import tensorflow as tf 14 | import numpy 15 | rng = numpy.random 16 | 17 | # model params 18 | learning_rate = 0.02 19 | training_epochs = 3000 20 | display_step=50 21 | # 22 | train_X = numpy.asarray([3.3,4.4,5.5,6.71,6.93,4.168,9.779,6.182,7.59,2.167, 23 | 7.042,10.791,5.313,7.997,5.654,9.27,3.1]) 24 | train_Y = numpy.asarray([1.7,2.76,2.09,3.19,1.694,1.573,3.366,2.596,2.53,1.221, 25 | 2.827,3.465,1.65,2.904,2.42,2.94,1.3]) 26 | n_samples = train_X.shape[0] 27 | 28 | # tf Graph Input 29 | X = tf.placeholder("float") 30 | Y = tf.placeholder("float") 31 | 32 | # Set model weights 33 | W = tf.Variable(rng.randn(), name="weight") 34 | b = tf.Variable(rng.randn(), name="bias") 35 | 36 | # Construct a linear model 37 | pred = tf.add(tf.multiply(X, W), b) 38 | 39 | # Mean squared error 40 | cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples) 41 | # Gradient descent 42 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) 43 | 44 | # Initializing the variables 45 | init = tf.global_variables_initializer() 46 | 47 | # Launch the graph 48 | with tf.Session() as sess: 49 | sess.run(init) 50 | 51 | # Fit all training data 52 | for epoch in range(training_epochs): 53 | for (x, y) in zip(train_X, train_Y): 54 | sess.run(optimizer, feed_dict={X: x, Y: y}) 55 | 56 | # Display logs per epoch step 57 | if (epoch+1) % display_step == 0: 58 | c = sess.run(cost, feed_dict={X: train_X, Y:train_Y}) 59 | print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c), \ 60 | "W=", sess.run(W), "b=", sess.run(b)) 61 | 62 | 63 | training_cost = sess.run(cost, feed_dict={X: train_X, Y: train_Y}) 64 | print("Training cost=", training_cost, "W=", sess.run(W), "b=", sess.run(b), '\n') 65 | 66 | 67 | 68 | # the testing data 69 | test_X = numpy.asarray([6.83, 4.668, 8.9, 7.91, 5.7, 8.7, 3.1, 2.1]) 70 | test_Y = numpy.asarray([1.84, 2.273, 3.2, 2.831, 2.92, 3.24, 1.35, 1.03]) 71 | 72 | print("Tssting...") 73 | testing_cost = sess.run( 74 | tf.reduce_sum(tf.pow(pred - Y, 2)) / (2 * test_X.shape[0]), 75 | feed_dict={X: test_X, Y: test_Y}) # same function as cost above 76 | print("Test LOSS=", testing_cost) 77 | print("Final Loss:", abs( 78 | training_cost - testing_cost)) 79 | 80 | -------------------------------------------------------------------------------- /course_2_tf_nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "name": "stdout", 14 | "output_type": "stream", 15 | "text": [ 16 | "Extracting ./mnist/train-images-idx3-ubyte.gz\n", 17 | "Extracting ./mnist/train-labels-idx1-ubyte.gz\n", 18 | "Extracting ./mnist/t10k-images-idx3-ubyte.gz\n", 19 | "Extracting ./mnist/t10k-labels-idx1-ubyte.gz\n", 20 | "('Epoch:', '0001', 'cost=', '215.548141965')\n", 21 | "('Epoch:', '0002', 'cost=', '54.977557694')\n", 22 | "('Epoch:', '0003', 'cost=', '33.899888993')\n", 23 | "('Epoch:', '0004', 'cost=', '23.234023376')\n", 24 | "('Epoch:', '0005', 'cost=', '16.552313167')\n", 25 | "('Epoch:', '0006', 'cost=', '12.184614655')\n", 26 | "('Epoch:', '0007', 'cost=', '8.918999288')\n", 27 | "('Epoch:', '0008', 'cost=', '6.555203167')\n", 28 | "('Epoch:', 
'0009', 'cost=', '4.864825427')\n", 29 | "('Epoch:', '0010', 'cost=', '3.541727996')\n", 30 | "('Epoch:', '0011', 'cost=', '2.601980731')\n", 31 | "('Epoch:', '0012', 'cost=', '2.013708151')\n", 32 | "('Epoch:', '0013', 'cost=', '1.447752024')\n", 33 | "('Epoch:', '0014', 'cost=', '1.284220558')\n", 34 | "('Epoch:', '0015', 'cost=', '1.063494972')\n", 35 | "('Epoch:', '0016', 'cost=', '1.089214503')\n", 36 | "('Epoch:', '0017', 'cost=', '0.819465103')\n", 37 | "('Epoch:', '0018', 'cost=', '0.826465986')\n", 38 | "('Epoch:', '0019', 'cost=', '0.756363073')\n", 39 | "('Epoch:', '0020', 'cost=', '0.756904836')\n", 40 | "('Epoch:', '0021', 'cost=', '0.772401051')\n", 41 | "('Epoch:', '0022', 'cost=', '0.591537078')\n", 42 | "('Epoch:', '0023', 'cost=', '0.518754110')\n", 43 | "('Epoch:', '0024', 'cost=', '0.653424654')\n", 44 | "('Epoch:', '0025', 'cost=', '0.639180361')\n", 45 | "('Epoch:', '0026', 'cost=', '0.418257485')\n", 46 | "('Epoch:', '0027', 'cost=', '0.434976982')\n", 47 | "('Epoch:', '0028', 'cost=', '0.606400410')\n", 48 | "('Epoch:', '0029', 'cost=', '0.475488307')\n", 49 | "('Epoch:', '0030', 'cost=', '0.458589170')\n", 50 | "Optimization Finished!\n", 51 | "('Accuracy:', 0.96039999)\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "#get the mnist data \n", 57 | "# wget http://deeplearning.net/data/mnist/mnist.pkl.gz\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "\n", 62 | "from tensorflow.examples.tutorials.mnist import input_data\n", 63 | "mnist = input_data.read_data_sets(\"./mnist/\", one_hot=True)\n", 64 | "\n", 65 | "import tensorflow as tf\n", 66 | "\n", 67 | "# Parameters\n", 68 | "learning_rate = 0.001\n", 69 | "training_epochs = 30\n", 70 | "batch_size = 100\n", 71 | "display_step = 1\n", 72 | "\n", 73 | "# Network Parameters\n", 74 | "n_hidden_1 = 256 # 1st layer number of features\n", 75 | "n_hidden_2 = 512 # 2nd layer number of features\n", 76 | "n_input = 784 # MNIST data input (img shape: 28*28)\n", 77 | "n_classes = 10 # MNIST total classes (0-9 digits)\n", 78 | "\n", 79 | "# tf Graph input\n", 80 | "x = tf.placeholder(\"float\", [None, n_input])\n", 81 | "y = tf.placeholder(\"float\", [None, n_classes])\n", 82 | "\n", 83 | "\n", 84 | "# Create model\n", 85 | "def multilayer_perceptron(x, weights, biases):\n", 86 | " # Hidden layer with RELU activation\n", 87 | " layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])\n", 88 | " layer_1 = tf.nn.relu(layer_1)\n", 89 | " # Hidden layer with RELU activation\n", 90 | " layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])\n", 91 | " layer_2 = tf.nn.relu(layer_2)\n", 92 | "\n", 93 | " # layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])\n", 94 | " # layer_3 = tf.nn.relu(layer_3)\n", 95 | "\n", 96 | "\n", 97 | "\n", 98 | " #we can add dropout layer\n", 99 | " # drop_out = tf.nn.dropout(layer_2, 0.75)\n", 100 | "\n", 101 | "\n", 102 | "\n", 103 | " # Output layer with linear activation\n", 104 | " out_layer = tf.matmul(layer_2, weights['out']) + biases['out']\n", 105 | " return out_layer\n", 106 | "\n", 107 | "# Store layers weight & biases\n", 108 | "weights = {\n", 109 | " #you can change \n", 110 | " 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),\n", 111 | " 'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),\n", 112 | " #'h3': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),\n", 113 | " 'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))\n", 114 | "}\n", 115 | "biases = {\n", 116 | " 'b1': tf.Variable(tf.random_normal([n_hidden_1])),\n", 117 | " 
'b2': tf.Variable(tf.random_normal([n_hidden_2])),\n", 118 | " #'b3': tf.Variable(tf.random_normal([n_hidden_2])),\n", 119 | " 'out': tf.Variable(tf.random_normal([n_classes]))\n", 120 | "}\n", 121 | "\n", 122 | "# Construct model\n", 123 | "pred = multilayer_perceptron(x, weights, biases)\n", 124 | "\n", 125 | "# Define loss and optimizer\n", 126 | "cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))\n", 127 | "optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)\n", 128 | "\n", 129 | "# Initializing the variables\n", 130 | "init = tf.global_variables_initializer()\n", 131 | "\n", 132 | "# Launch the graph\n", 133 | "with tf.Session() as sess:\n", 134 | " sess.run(init)\n", 135 | "\n", 136 | " # Training cycle\n", 137 | " for epoch in range(training_epochs):\n", 138 | " avg_cost = 0.\n", 139 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 140 | " # Loop over all batches\n", 141 | " for i in range(total_batch):\n", 142 | " batch_x, batch_y = mnist.train.next_batch(batch_size)\n", 143 | " # Run optimization op (backprop) and cost op (to get loss value)\n", 144 | " _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,\n", 145 | " y: batch_y})\n", 146 | " # Compute average loss\n", 147 | " avg_cost += c / total_batch\n", 148 | " # Display logs per epoch step\n", 149 | " if epoch % display_step == 0:\n", 150 | " print(\"Epoch:\", '%04d' % (epoch+1), \"cost=\", \\\n", 151 | " \"{:.9f}\".format(avg_cost))\n", 152 | " print(\"Optimization Finished!\")\n", 153 | "\n", 154 | " # Test model\n", 155 | " correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))\n", 156 | " # Calculate accuracy\n", 157 | " accuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 158 | " print(\"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": true, 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Python 2", 176 | "language": "python", 177 | "name": "python2" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": { 181 | "name": "ipython", 182 | "version": 2 183 | }, 184 | "file_extension": ".py", 185 | "mimetype": "text/x-python", 186 | "name": "python", 187 | "nbconvert_exporter": "python", 188 | "pygments_lexer": "ipython2", 189 | "version": "2.7.12" 190 | } 191 | }, 192 | "nbformat": 4, 193 | "nbformat_minor": 2 194 | } 195 | -------------------------------------------------------------------------------- /course_2_tf_nn.py: -------------------------------------------------------------------------------- 1 | #get the mnist data 2 | # wget http://deeplearning.net/data/mnist/mnist.pkl.gz 3 | 4 | 5 | 6 | 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | mnist = input_data.read_data_sets("./mnist/", one_hot=True) 9 | 10 | import tensorflow as tf 11 | 12 | # Parameters 13 | learning_rate = 0.001 14 | training_epochs = 30 15 | batch_size = 100 16 | display_step = 1 17 | 18 | # Network Parameters 19 | n_hidden_1 = 256 # 1st layer number of features 20 | n_hidden_2 = 512 # 2nd layer number of features 21 | n_input = 784 # MNIST data input (img shape: 28*28) 22 | n_classes = 10 # MNIST total classes (0-9 digits) 23 | 24 | # tf Graph input 25 | x = tf.placeholder("float", [None, n_input]) 26 | y = tf.placeholder("float", [None, 
n_classes]) 27 | 28 | 29 | # Create model 30 | def multilayer_perceptron(x, weights, biases): 31 | # Hidden layer with RELU activation 32 | layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1']) 33 | layer_1 = tf.nn.relu(layer_1) 34 | # Hidden layer with RELU activation 35 | layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2']) 36 | layer_2 = tf.nn.relu(layer_2) 37 | 38 | # layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3']) 39 | # layer_3 = tf.nn.relu(layer_3) 40 | 41 | 42 | 43 | #we can add dropout layer 44 | # drop_out = tf.nn.dropout(layer_2, 0.75) 45 | 46 | 47 | 48 | # Output layer with linear activation 49 | out_layer = tf.matmul(layer_2, weights['out']) + biases['out'] 50 | return out_layer 51 | 52 | # Store layers weight & biases 53 | weights = { 54 | #you can change 55 | 'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])), 56 | 'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])), 57 | #'h3': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])), 58 | 'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes])) 59 | } 60 | biases = { 61 | 'b1': tf.Variable(tf.random_normal([n_hidden_1])), 62 | 'b2': tf.Variable(tf.random_normal([n_hidden_2])), 63 | #'b3': tf.Variable(tf.random_normal([n_hidden_2])), 64 | 'out': tf.Variable(tf.random_normal([n_classes])) 65 | } 66 | 67 | # Construct model 68 | pred = multilayer_perceptron(x, weights, biases) 69 | 70 | # Define loss and optimizer 71 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) 72 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 73 | 74 | # Initializing the variables 75 | init = tf.global_variables_initializer() 76 | 77 | # Launch the graph 78 | with tf.Session() as sess: 79 | sess.run(init) 80 | 81 | # Training cycle 82 | for epoch in range(training_epochs): 83 | avg_cost = 0. 
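        # With the standard TF MNIST split (55,000 training images) and batch_size = 100,
        # the inner loop below runs 550 mini-batches per epoch; each batch's cross-entropy
        # c is accumulated and divided by total_batch, so avg_cost is the mean batch loss
        # reported for the epoch.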
84 | total_batch = int(mnist.train.num_examples/batch_size) 85 | # Loop over all batches 86 | for i in range(total_batch): 87 | batch_x, batch_y = mnist.train.next_batch(batch_size) 88 | # Run optimization op (backprop) and cost op (to get loss value) 89 | _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, 90 | y: batch_y}) 91 | # Compute average loss 92 | avg_cost += c / total_batch 93 | # Display logs per epoch step 94 | if epoch % display_step == 0: 95 | print("Epoch:", '%04d' % (epoch+1), "cost=", \ 96 | "{:.9f}".format(avg_cost)) 97 | print("Optimization Finished!") 98 | 99 | # Test model 100 | correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1)) 101 | # Calculate accuracy 102 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 103 | print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})) 104 | -------------------------------------------------------------------------------- /course_3_tf_mnist_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "name": "stdout", 14 | "output_type": "stream", 15 | "text": [ 16 | "Extracting ./train-images-idx3-ubyte.gz\n", 17 | "Extracting ./train-labels-idx1-ubyte.gz\n", 18 | "Extracting ./t10k-images-idx3-ubyte.gz\n", 19 | "Extracting ./t10k-labels-idx1-ubyte.gz\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "from tensorflow.examples.tutorials.mnist import input_data\n", 25 | "mnist = input_data.read_data_sets(\".\", one_hot=True)\n", 26 | "\n", 27 | "import tensorflow as tf\n", 28 | "\n", 29 | "# Parameters\n", 30 | "learning_rate = 0.001\n", 31 | "training_epochs = 30\n", 32 | "batch_size = 100\n", 33 | "display_step = 1\n", 34 | "\n", 35 | "# Network Parameters\n", 36 | "n_input = 784 # MNIST data input (img shape: 28*28)\n", 37 | "n_classes = 10 # MNIST total classes (0-9 digits)\n", 38 | "\n", 39 | "# tf Graph input\n", 40 | "x = tf.placeholder(\"float\", [None, n_input])\n", 41 | "y = tf.placeholder(\"float\", [None, n_classes])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 12, 47 | "metadata": { 48 | "collapsed": true, 49 | "deletable": true, 50 | "editable": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "#pre-define the \n", 55 | "def conv2d(x, W):\n", 56 | " return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')\n", 57 | "\n", 58 | "def max_pool_2x2(x):\n", 59 | " return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],\n", 60 | " strides=[1, 2, 2, 1], padding='SAME')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 13, 66 | "metadata": { 67 | "collapsed": true, 68 | "deletable": true, 69 | "editable": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "def multilayer_perceptron(x, weights, biases):\n", 74 | " #now, we want to change this to a CNN network\n", 75 | "\n", 76 | " #first reshape the data to 4-D\n", 77 | "\n", 78 | " x_image = tf.reshape(x, [-1,28,28,1])\n", 79 | "\n", 80 | " #then apply cnn layers\n", 81 | "\n", 82 | " h_conv1 = tf.nn.relu(conv2d(x_image, weights['conv1']) + biases['conv_b1'])\n", 83 | " h_pool1 = max_pool_2x2(h_conv1)\n", 84 | "\n", 85 | " h_conv2 = tf.nn.relu(conv2d(h_pool1, weights['conv2']) + biases['conv_b2'])\n", 86 | " h_pool2 = max_pool_2x2(h_conv2)\n", 87 | "\n", 88 | " h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])\n", 89 | " h_fc1 = 
tf.nn.relu(tf.matmul(h_pool2_flat, weights['fc1']) + biases['fc1_b'])\n", 90 | "\n", 91 | "\n", 92 | " # Output layer with linear activation\n", 93 | " out_layer = tf.matmul(h_fc1, weights['out']) + biases['out_b']\n", 94 | " return out_layer" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 14, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# Store layers weight & biases\n", 106 | "weights = {\n", 107 | " 'conv1': tf.Variable(tf.random_normal([5, 5, 1, 32])),\n", 108 | " 'conv2': tf.Variable(tf.random_normal([5, 5, 32, 64])),\n", 109 | " 'fc1' : tf.Variable(tf.random_normal([7*7*64,256])),\n", 110 | " 'out': tf.Variable(tf.random_normal([256,n_classes]))\n", 111 | "}\n", 112 | "biases = {\n", 113 | " 'conv_b1': tf.Variable(tf.random_normal([32])),\n", 114 | " 'conv_b2': tf.Variable(tf.random_normal([64])),\n", 115 | " 'fc1_b': tf.Variable(tf.random_normal([256])),\n", 116 | " 'out_b': tf.Variable(tf.random_normal([n_classes]))\n", 117 | "}\n", 118 | "\n", 119 | "# Construct model\n", 120 | "pred = multilayer_perceptron(x, weights, biases)\n", 121 | "\n", 122 | "# Define loss and optimizer\n", 123 | "cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))\n", 124 | "optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)\n", 125 | "\n", 126 | "# Initializing the variables\n", 127 | "init = tf.global_variables_initializer()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 15, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "('Epoch:', '0001', 'cost=', '2005.953651756')\n", 142 | "('Epoch:', '0002', 'cost=', '361.200756125')\n", 143 | "('Epoch:', '0003', 'cost=', '222.655593089')\n", 144 | "('Epoch:', '0004', 'cost=', '154.397716973')\n", 145 | "('Epoch:', '0005', 'cost=', '108.289408546')\n", 146 | "('Epoch:', '0006', 'cost=', '83.728486200')\n", 147 | "('Epoch:', '0007', 'cost=', '63.813128544')\n", 148 | "('Epoch:', '0008', 'cost=', '52.091127872')\n", 149 | "('Epoch:', '0009', 'cost=', '38.352929364')\n", 150 | "('Epoch:', '0010', 'cost=', '30.455494692')\n", 151 | "('Epoch:', '0011', 'cost=', '25.972187011')\n", 152 | "('Epoch:', '0012', 'cost=', '20.754565103')\n", 153 | "('Epoch:', '0013', 'cost=', '18.515140012')\n", 154 | "('Epoch:', '0014', 'cost=', '14.170893429')\n", 155 | "('Epoch:', '0015', 'cost=', '13.025495452')\n", 156 | "('Epoch:', '0016', 'cost=', '11.380087092')\n", 157 | "('Epoch:', '0017', 'cost=', '12.045677507')\n", 158 | "('Epoch:', '0018', 'cost=', '9.095552578')\n", 159 | "('Epoch:', '0019', 'cost=', '8.405252479')\n", 160 | "('Epoch:', '0020', 'cost=', '7.802369204')\n", 161 | "('Epoch:', '0021', 'cost=', '8.664561321')\n", 162 | "('Epoch:', '0022', 'cost=', '6.413273589')\n", 163 | "('Epoch:', '0023', 'cost=', '7.001173552')\n", 164 | "('Epoch:', '0024', 'cost=', '3.928643572')\n", 165 | "('Epoch:', '0025', 'cost=', '6.000280571')\n", 166 | "('Epoch:', '0026', 'cost=', '3.947065584')\n", 167 | "('Epoch:', '0027', 'cost=', '5.913655243')\n", 168 | "('Epoch:', '0028', 'cost=', '4.686071558')\n", 169 | "('Epoch:', '0029', 'cost=', '3.783876064')\n", 170 | "('Epoch:', '0030', 'cost=', '3.133972832')\n", 171 | "Optimization Finished!\n", 172 | "('Accuracy:', 0.98420006)\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "\n", 178 | "# Launch the graph\n", 179 | "with tf.Session() as sess:\n", 180 | " 
sess.run(init)\n", 181 | "\n", 182 | " # Training cycle\n", 183 | " for epoch in range(training_epochs):\n", 184 | " avg_cost = 0.\n", 185 | " total_batch = int(mnist.train.num_examples/batch_size)\n", 186 | " # Loop over all batches\n", 187 | " for i in range(total_batch):\n", 188 | " batch_x, batch_y = mnist.train.next_batch(batch_size)\n", 189 | " # Run optimization op (backprop) and cost op (to get loss value)\n", 190 | " _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,\n", 191 | " y: batch_y})\n", 192 | " # Compute average loss\n", 193 | " avg_cost += c / total_batch\n", 194 | " # Display logs per epoch step\n", 195 | " if epoch % display_step == 0:\n", 196 | " print(\"Epoch:\", '%04d' % (epoch+1), \"cost=\", \\\n", 197 | " \"{:.9f}\".format(avg_cost))\n", 198 | " print(\"Optimization Finished!\")\n", 199 | "\n", 200 | " # Test model\n", 201 | " correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))\n", 202 | " # Calculate accuracy\n", 203 | " accuracy = tf.reduce_mean(tf.cast(correct_prediction, \"float\"))\n", 204 | " print(\"Accuracy:\", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 2", 220 | "language": "python", 221 | "name": "python2" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 2 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython2", 233 | "version": "2.7.12" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /course_3_tf_mnist_cnn.py: -------------------------------------------------------------------------------- 1 | #get the mnist data 2 | # wget http://deeplearning.net/data/mnist/mnist.pkl.gz 3 | 4 | 5 | 6 | 7 | from tensorflow.examples.tutorials.mnist import input_data 8 | mnist = input_data.read_data_sets(".", one_hot=True) 9 | 10 | import tensorflow as tf 11 | 12 | # Parameters 13 | learning_rate = 0.001 14 | training_epochs = 30 15 | batch_size = 100 16 | display_step = 1 17 | 18 | # Network Parameters 19 | n_input = 784 # MNIST data input (img shape: 28*28) 20 | n_classes = 10 # MNIST total classes (0-9 digits) 21 | 22 | # tf Graph input 23 | x = tf.placeholder("float", [None, n_input]) 24 | y = tf.placeholder("float", [None, n_classes]) 25 | 26 | #pre-define the 27 | def conv2d(x, W): 28 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 29 | 30 | def max_pool_2x2(x): 31 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], 32 | strides=[1, 2, 2, 1], padding='SAME') 33 | 34 | 35 | # Create model 36 | def multilayer_perceptron(x, weights, biases): 37 | #now, we want to change this to a CNN network 38 | 39 | #first reshape the data to 4-D 40 | 41 | x_image = tf.reshape(x, [-1,28,28,1]) 42 | 43 | #then apply cnn layers 44 | 45 | h_conv1 = tf.nn.relu(conv2d(x_image, weights['conv1']) + biases['conv_b1']) 46 | h_pool1 = max_pool_2x2(h_conv1) 47 | 48 | h_conv2 = tf.nn.relu(conv2d(h_pool1, weights['conv2']) + biases['conv_b2']) 49 | h_pool2 = max_pool_2x2(h_conv2) 50 | 51 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64]) 52 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, weights['fc1']) + 
biases['fc1_b']) 53 | 54 | 55 | # Output layer with linear activation 56 | out_layer = tf.matmul(h_fc1, weights['out']) + biases['out_b'] 57 | return out_layer 58 | 59 | # Store layers weight & biases 60 | weights = { 61 | 'conv1': tf.Variable(tf.random_normal([5, 5, 1, 32])), 62 | 'conv2': tf.Variable(tf.random_normal([5, 5, 32, 64])), 63 | 'fc1' : tf.Variable(tf.random_normal([7*7*64,256])), 64 | 'out': tf.Variable(tf.random_normal([256,n_classes])) 65 | } 66 | biases = { 67 | 'conv_b1': tf.Variable(tf.random_normal([32])), 68 | 'conv_b2': tf.Variable(tf.random_normal([64])), 69 | 'fc1_b': tf.Variable(tf.random_normal([256])), 70 | 'out_b': tf.Variable(tf.random_normal([n_classes])) 71 | } 72 | 73 | # Construct model 74 | pred = multilayer_perceptron(x, weights, biases) 75 | 76 | # Define loss and optimizer 77 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) 78 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 79 | 80 | # Initializing the variables 81 | init = tf.global_variables_initializer() 82 | 83 | # Launch the graph 84 | with tf.Session() as sess: 85 | sess.run(init) 86 | 87 | # Training cycle 88 | for epoch in range(training_epochs): 89 | avg_cost = 0. 90 | total_batch = int(mnist.train.num_examples/batch_size) 91 | # Loop over all batches 92 | for i in range(total_batch): 93 | batch_x, batch_y = mnist.train.next_batch(batch_size) 94 | # Run optimization op (backprop) and cost op (to get loss value) 95 | _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, 96 | y: batch_y}) 97 | # Compute average loss 98 | avg_cost += c / total_batch 99 | # Display logs per epoch step 100 | if epoch % display_step == 0: 101 | print("Epoch:", '%04d' % (epoch+1), "cost=", \ 102 | "{:.9f}".format(avg_cost)) 103 | print("Optimization Finished!") 104 | 105 | # Test model 106 | correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1)) 107 | # Calculate accuracy 108 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 109 | print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})) -------------------------------------------------------------------------------- /course_6_obj_detection.txt: -------------------------------------------------------------------------------- 1 | YOLO example: 2 | https://github.com/wiibrew/YOLO_tensorflow 3 | 4 | 5 | Faster RCNN examples: 6 | https://github.com/wiibrew/Faster-RCNN_TF 7 | model desfine and train: 8 | https://github.com/wiibrew/Faster-RCNN_TF/blob/master/lib/networks/VGGnet_train.py 9 | details of layer(loss and RPN): 10 | https://github.com/wiibrew/Faster-RCNN_TF/blob/master/lib/networks/network.py 11 | -------------------------------------------------------------------------------- /course_7_lstm_learn_shakespeare.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "scrolled": true 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)\n" 16 | ] 17 | } 18 | ], 19 | "source": [ 20 | "from __future__ import absolute_import, division, print_function\n", 21 | "\n", 22 | "import os\n", 23 | "import pickle\n", 24 | "from six.moves import urllib\n", 25 | "\n", 26 | "import tflearn\n", 27 | "from tflearn.data_utils import *\n", 28 | "\n", 29 | "path = 
\"shakespeare_input.txt\"\n", 30 | "char_idx_file = 'char_idx.pickle'\n", 31 | "\n", 32 | "if not os.path.isfile(path):\n", 33 | " urllib.request.urlretrieve(\"https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/shakespeare_input.txt\", path)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Loading previous char_idx\n", 48 | "Vectorizing text...\n", 49 | "Text total length: 4,573,338\n", 50 | "Distinct chars : 67\n", 51 | "Total sequences : 1,524,438\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "maxlen = 25\n", 57 | "\n", 58 | "char_idx = None\n", 59 | "if os.path.isfile(char_idx_file):\n", 60 | " print('Loading previous char_idx')\n", 61 | " char_idx = pickle.load(open(char_idx_file, 'rb'))\n", 62 | "\n", 63 | "X, Y, char_idx = \\\n", 64 | " textfile_to_semi_redundant_sequences(path, seq_maxlen=maxlen, redun_step=3,\n", 65 | " pre_defined_char_idx=char_idx)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "pickle.dump(char_idx, open(char_idx_file,'wb'))\n", 77 | "\n", 78 | "g = tflearn.input_data([None, maxlen, len(char_idx)])\n", 79 | "g = tflearn.lstm(g, 512, return_seq=True)\n", 80 | "g = tflearn.dropout(g, 0.5)\n", 81 | "g = tflearn.lstm(g, 512, return_seq=True)\n", 82 | "g = tflearn.dropout(g, 0.5)\n", 83 | "g = tflearn.lstm(g, 512)\n", 84 | "g = tflearn.dropout(g, 0.5)\n", 85 | "g = tflearn.fully_connected(g, len(char_idx), activation='softmax')\n", 86 | "g = tflearn.regression(g, optimizer='adam', loss='categorical_crossentropy',\n", 87 | " learning_rate=0.001)\n", 88 | "\n", 89 | "m = tflearn.SequenceGenerator(g, dictionary=char_idx,\n", 90 | " seq_maxlen=maxlen,\n", 91 | " clip_gradients=5.0,\n", 92 | " checkpoint_path='model_shakespeare')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "Training Step: 107189 | total loss: \u001b[1m\u001b[32m1.34488\u001b[0m\u001b[0m | time: 561.687s\n", 107 | "| Adam | epoch: 010 | loss: 1.34488 -- iter: 1371904/1371994\n", 108 | "Training Step: 107190 | total loss: \u001b[1m\u001b[32m1.35806\u001b[0m\u001b[0m | time: 600.688s\n", 109 | "| Adam | epoch: 010 | loss: 1.35806 | val_loss: 1.28005 -- iter: 1371994/1371994\n", 110 | "--\n", 111 | "INFO:tensorflow:/home/wei/Documents/DeepLearningCourseCodes/model_shakespeare-107190 is not in all_model_checkpoint_paths. 
Manually adding it.\n", 112 | "WARNING:tensorflow:Error encountered when serializing layer_tensor/LSTM.\n", 113 | "Type is unsupported, or the types of the items don't match field type in CollectionDef.\n", 114 | "'list' object has no attribute 'name'\n", 115 | "WARNING:tensorflow:Error encountered when serializing layer_tensor/Dropout.\n", 116 | "Type is unsupported, or the types of the items don't match field type in CollectionDef.\n", 117 | "'list' object has no attribute 'name'\n", 118 | "WARNING:tensorflow:Error encountered when serializing layer_tensor/LSTM_1.\n", 119 | "Type is unsupported, or the types of the items don't match field type in CollectionDef.\n", 120 | "'list' object has no attribute 'name'\n", 121 | "-- TESTING...\n", 122 | "-- Test with temperature of 1.0 --\n", 123 | "ou see'st with peril I have content, which reason'd let me clear?\n", 124 | "\n", 125 | "GORENIL:\n", 126 | "And what you have stop this occasion is better blame?\n", 127 | "\n", 128 | "PAROLLES:\n", 129 | "Why, with the enument in question not, peace, my father knight.\n", 130 | "I'll so know the night, being done,\n", 131 | "And villany, my doom, is the commanded tarteries.\n", 132 | "\n", 133 | "WARWICK:\n", 134 | "My son is thy place and quickly.\n", 135 | "How now, where wates, let Choefight and walks be bones;\n", 136 | "Our flock, if he have partled there than we\n", 137 | "enteral Mancanimone.\n", 138 | "\n", 139 | "PAGE:\n", 140 | "Carsing which not to seek it,\n", 141 | "Not yet worth and sow, beards of himself.\n", 142 | "\n", 143 | "POINS:\n", 144 | "All his cities, thou wilt there?\n", 145 | "\n", 146 | "DUGLET:\n", 147 | "My man for, goes but yet I do\n", 148 | "loathe, and\n", 149 | "-- Test with temperature of 0.5 --\n", 150 | "ou see'st with peril I have been to keep me to him.\n", 151 | "\n", 152 | "DON ADRIANO DE ARMADO:\n", 153 | "There is no fair course and honest sins\n", 154 | "And be the villain of the care of she is these death,\n", 155 | "And the speedy prince were so soon of the house.\n", 156 | "\n", 157 | "SIR TOBY BELCH:\n", 158 | "Thou shalt stand do the love of her soul,\n", 159 | "The last she will come a man to the rest,\n", 160 | "They so service and enemy.\n", 161 | "\n", 162 | "MARCIUS:\n", 163 | "I hope the name of France?\n", 164 | "\n", 165 | "FALSTAFF:\n", 166 | "Why, what is the father may have the large of a stand.\n", 167 | "\n", 168 | "DEMETRIUS:\n", 169 | "And I would not be my soul to him to him: and I have the love\n", 170 | "And be it in his fight.\n", 171 | "\n", 172 | "ALBANY:\n", 173 | "What was this to the heart;\n", 174 | "The night of the most prince and my most\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "for i in range(10):\n", 180 | " seed = random_sequence_from_textfile(path, maxlen)\n", 181 | " m.fit(X, Y, validation_set=0.1, batch_size=128,\n", 182 | " n_epoch=1, run_id='shakespeare')\n", 183 | " print(\"-- TESTING...\")\n", 184 | " print(\"-- Test with temperature of 1.0 --\")\n", 185 | " print(m.generate(600, temperature=1.0, seq_seed=seed))\n", 186 | " print(\"-- Test with temperature of 0.5 --\")\n", 187 | " print(m.generate(600, temperature=0.5, seq_seed=seed))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 2", 203 | "language": "python", 204 | "name": "python2" 205 | }, 206 | "language_info": { 207 | 
"codemirror_mode": { 208 | "name": "ipython", 209 | "version": 2 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython2", 216 | "version": "2.7.12" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /course_7_shakespeare_gen.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import pickle 5 | from six.moves import urllib 6 | 7 | import tflearn 8 | from tflearn.data_utils import * 9 | 10 | path = "shakespeare_input.txt" 11 | char_idx_file = 'char_idx.pickle' 12 | 13 | if not os.path.isfile(path): 14 | urllib.request.urlretrieve("https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/shakespeare_input.txt", path) 15 | 16 | maxlen = 25 17 | 18 | char_idx = None 19 | if os.path.isfile(char_idx_file): 20 | print('Loading previous char_idx') 21 | char_idx = pickle.load(open(char_idx_file, 'rb')) 22 | 23 | X, Y, char_idx = \ 24 | textfile_to_semi_redundant_sequences(path, seq_maxlen=maxlen, redun_step=3, 25 | pre_defined_char_idx=char_idx) 26 | 27 | pickle.dump(char_idx, open(char_idx_file,'wb')) 28 | 29 | g = tflearn.input_data([None, maxlen, len(char_idx)]) 30 | g = tflearn.lstm(g, 512, return_seq=True) 31 | g = tflearn.dropout(g, 0.5) 32 | g = tflearn.lstm(g, 512, return_seq=True) 33 | g = tflearn.dropout(g, 0.5) 34 | g = tflearn.lstm(g, 512) 35 | g = tflearn.dropout(g, 0.5) 36 | g = tflearn.fully_connected(g, len(char_idx), activation='softmax') 37 | g = tflearn.regression(g, optimizer='adam', loss='categorical_crossentropy', 38 | learning_rate=0.001) 39 | 40 | m = tflearn.SequenceGenerator(g, dictionary=char_idx, 41 | seq_maxlen=maxlen, 42 | clip_gradients=5.0, 43 | checkpoint_path='model_shakespeare') 44 | 45 | for i in range(50): 46 | seed = random_sequence_from_textfile(path, maxlen) 47 | m.fit(X, Y, validation_set=0.1, batch_size=128, 48 | n_epoch=1, run_id='shakespeare') 49 | print("-- TESTING...") 50 | print("-- Test with temperature of 1.0 --") 51 | print(m.generate(600, temperature=1.0, seq_seed=seed)) 52 | print("-- Test with temperature of 0.5 --") 53 | print(m.generate(600, temperature=0.5, seq_seed=seed)) 54 | -------------------------------------------------------------------------------- /course_8_image2txt/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = [":internal"]) 2 | 3 | licenses(["notice"]) # Apache 2.0 4 | 5 | exports_files(["LICENSE"]) 6 | 7 | package_group( 8 | name = "internal", 9 | packages = [ 10 | "//im2txt/...", 11 | ], 12 | ) 13 | 14 | py_binary( 15 | name = "build_mscoco_data", 16 | srcs = [ 17 | "data/build_mscoco_data.py", 18 | ], 19 | ) 20 | 21 | sh_binary( 22 | name = "download_and_preprocess_mscoco", 23 | srcs = ["data/download_and_preprocess_mscoco.sh"], 24 | data = [ 25 | ":build_mscoco_data", 26 | ], 27 | ) 28 | 29 | py_library( 30 | name = "configuration", 31 | srcs = ["configuration.py"], 32 | srcs_version = "PY2AND3", 33 | ) 34 | 35 | py_library( 36 | name = "show_and_tell_model", 37 | srcs = ["show_and_tell_model.py"], 38 | srcs_version = "PY2AND3", 39 | deps = [ 40 | "//im2txt/ops:image_embedding", 41 | "//im2txt/ops:image_processing", 42 | "//im2txt/ops:inputs", 43 | ], 44 | ) 45 | 46 | py_test( 47 | name = 
"show_and_tell_model_test", 48 | size = "large", 49 | srcs = ["show_and_tell_model_test.py"], 50 | deps = [ 51 | ":configuration", 52 | ":show_and_tell_model", 53 | ], 54 | ) 55 | 56 | py_library( 57 | name = "inference_wrapper", 58 | srcs = ["inference_wrapper.py"], 59 | srcs_version = "PY2AND3", 60 | deps = [ 61 | ":show_and_tell_model", 62 | "//im2txt/inference_utils:inference_wrapper_base", 63 | ], 64 | ) 65 | 66 | py_binary( 67 | name = "train", 68 | srcs = ["train.py"], 69 | srcs_version = "PY2AND3", 70 | deps = [ 71 | ":configuration", 72 | ":show_and_tell_model", 73 | ], 74 | ) 75 | 76 | py_binary( 77 | name = "evaluate", 78 | srcs = ["evaluate.py"], 79 | srcs_version = "PY2AND3", 80 | deps = [ 81 | ":configuration", 82 | ":show_and_tell_model", 83 | ], 84 | ) 85 | 86 | py_binary( 87 | name = "run_inference", 88 | srcs = ["run_inference.py"], 89 | srcs_version = "PY2AND3", 90 | deps = [ 91 | ":configuration", 92 | ":inference_wrapper", 93 | "//im2txt/inference_utils:caption_generator", 94 | "//im2txt/inference_utils:vocabulary", 95 | ], 96 | ) 97 | -------------------------------------------------------------------------------- /course_8_image2txt/configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Image-to-text model and training configurations.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | class ModelConfig(object): 24 | """Wrapper class for model hyperparameters.""" 25 | 26 | def __init__(self): 27 | """Sets the default model hyperparameters.""" 28 | # File pattern of sharded TFRecord file containing SequenceExample protos. 29 | # Must be provided in training and evaluation modes. 30 | self.input_file_pattern = None 31 | 32 | # Image format ("jpeg" or "png"). 33 | self.image_format = "jpeg" 34 | 35 | # Approximate number of values per input shard. Used to ensure sufficient 36 | # mixing between shards in training. 37 | self.values_per_input_shard = 2300 38 | # Minimum number of shards to keep in the input queue. 39 | self.input_queue_capacity_factor = 2 40 | # Number of threads for prefetching SequenceExample protos. 41 | self.num_input_reader_threads = 1 42 | 43 | # Name of the SequenceExample context feature containing image data. 44 | self.image_feature_name = "image/data" 45 | # Name of the SequenceExample feature list containing integer captions. 46 | self.caption_feature_name = "image/caption_ids" 47 | 48 | # Number of unique words in the vocab (plus 1, for ). 49 | # The default value is larger than the expected actual vocab size to allow 50 | # for differences between tokenizer versions used in preprocessing. 
There is 51 | # no harm in using a value greater than the actual vocab size, but using a 52 | # value less than the actual vocab size will result in an error. 53 | self.vocab_size = 12000 54 | 55 | # Number of threads for image preprocessing. Should be a multiple of 2. 56 | self.num_preprocess_threads = 4 57 | 58 | # Batch size. 59 | self.batch_size = 32 60 | 61 | # File containing an Inception v3 checkpoint to initialize the variables 62 | # of the Inception model. Must be provided when starting training for the 63 | # first time. 64 | self.inception_checkpoint_file = None 65 | 66 | # Dimensions of Inception v3 input images. 67 | self.image_height = 299 68 | self.image_width = 299 69 | 70 | # Scale used to initialize model variables. 71 | self.initializer_scale = 0.08 72 | 73 | # LSTM input and output dimensionality, respectively. 74 | self.embedding_size = 512 75 | self.num_lstm_units = 512 76 | 77 | # If < 1.0, the dropout keep probability applied to LSTM variables. 78 | self.lstm_dropout_keep_prob = 0.7 79 | 80 | 81 | class TrainingConfig(object): 82 | """Wrapper class for training hyperparameters.""" 83 | 84 | def __init__(self): 85 | """Sets the default training hyperparameters.""" 86 | # Number of examples per epoch of training data. 87 | self.num_examples_per_epoch = 586363 88 | 89 | # Optimizer for training the model. 90 | self.optimizer = "SGD" 91 | 92 | # Learning rate for the initial phase of training. 93 | self.initial_learning_rate = 2.0 94 | self.learning_rate_decay_factor = 0.5 95 | self.num_epochs_per_decay = 8.0 96 | 97 | # Learning rate when fine tuning the Inception v3 parameters. 98 | self.train_inception_learning_rate = 0.0005 99 | 100 | # If not None, clip gradients to this value. 101 | self.clip_gradients = 5.0 102 | 103 | # How many model checkpoints to keep. 104 | self.max_checkpoints_to_keep = 5 105 | -------------------------------------------------------------------------------- /course_8_image2txt/data/download_and_preprocess_mscoco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | # Script to download and preprocess the MSCOCO data set. 18 | # 19 | # The outputs of this script are sharded TFRecord files containing serialized 20 | # SequenceExample protocol buffers. See build_mscoco_data.py for details of how 21 | # the SequenceExample protocol buffers are constructed. 22 | # 23 | # usage: 24 | # ./download_and_preprocess_mscoco.sh 25 | set -e 26 | 27 | if [ -z "$1" ]; then 28 | echo "usage download_and_preproces_mscoco.sh [data dir]" 29 | exit 30 | fi 31 | 32 | if [ "$(uname)" == "Darwin" ]; then 33 | UNZIP="tar -xf" 34 | else 35 | UNZIP="unzip -nq" 36 | fi 37 | 38 | # Create the output directories. 
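# "${1%/}" strips a single trailing slash from the data-directory argument, so
# e.g. "/data/mscoco/" and "/data/mscoco" (illustrative paths) resolve to the
# same OUTPUT_DIR. Raw zips are downloaded and unpacked under
# "${OUTPUT_DIR}/raw-data", while the final TFRecords and word_counts.txt are
# written to OUTPUT_DIR itself.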
39 | OUTPUT_DIR="${1%/}" 40 | SCRATCH_DIR="${OUTPUT_DIR}/raw-data" 41 | mkdir -p "${OUTPUT_DIR}" 42 | mkdir -p "${SCRATCH_DIR}" 43 | CURRENT_DIR=$(pwd) 44 | WORK_DIR="$0.runfiles/im2txt/im2txt" 45 | 46 | # Helper function to download and unpack a .zip file. 47 | function download_and_unzip() { 48 | local BASE_URL=${1} 49 | local FILENAME=${2} 50 | 51 | if [ ! -f ${FILENAME} ]; then 52 | echo "Downloading ${FILENAME} to $(pwd)" 53 | wget -nd -c "${BASE_URL}/${FILENAME}" 54 | else 55 | echo "Skipping download of ${FILENAME}" 56 | fi 57 | echo "Unzipping ${FILENAME}" 58 | ${UNZIP} ${FILENAME} 59 | } 60 | 61 | cd ${SCRATCH_DIR} 62 | 63 | # Download the images. 64 | BASE_IMAGE_URL="http://msvocds.blob.core.windows.net/coco2014" 65 | 66 | TRAIN_IMAGE_FILE="train2014.zip" 67 | download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE} 68 | TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2014" 69 | 70 | VAL_IMAGE_FILE="val2014.zip" 71 | download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE} 72 | VAL_IMAGE_DIR="${SCRATCH_DIR}/val2014" 73 | 74 | # Download the captions. 75 | BASE_CAPTIONS_URL="http://msvocds.blob.core.windows.net/annotations-1-0-3" 76 | CAPTIONS_FILE="captions_train-val2014.zip" 77 | download_and_unzip ${BASE_CAPTIONS_URL} ${CAPTIONS_FILE} 78 | TRAIN_CAPTIONS_FILE="${SCRATCH_DIR}/annotations/captions_train2014.json" 79 | VAL_CAPTIONS_FILE="${SCRATCH_DIR}/annotations/captions_val2014.json" 80 | 81 | # Build TFRecords of the image data. 82 | cd "${CURRENT_DIR}" 83 | BUILD_SCRIPT="${WORK_DIR}/build_mscoco_data" 84 | "${BUILD_SCRIPT}" \ 85 | --train_image_dir="${TRAIN_IMAGE_DIR}" \ 86 | --val_image_dir="${VAL_IMAGE_DIR}" \ 87 | --train_captions_file="${TRAIN_CAPTIONS_FILE}" \ 88 | --val_captions_file="${VAL_CAPTIONS_FILE}" \ 89 | --output_dir="${OUTPUT_DIR}" \ 90 | --word_counts_output_file="${OUTPUT_DIR}/word_counts.txt" \ 91 | -------------------------------------------------------------------------------- /course_8_image2txt/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Evaluate the model. 17 | 18 | This script should be run concurrently with training so that summaries show up 19 | in TensorBoard. 
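
An illustrative invocation (this assumes the im2txt package is importable and
a plain Python run rather than the Bazel "evaluate" py_binary defined in
BUILD; every path and shard pattern below is a placeholder):

  python evaluate.py --input_file_pattern="${DATA_DIR}/val-?????-of-00004" --checkpoint_dir="${MODEL_DIR}/train" --eval_dir="${MODEL_DIR}/eval"

The script then loops forever: every --eval_interval_secs seconds (600 by
default) it reloads the newest checkpoint from --checkpoint_dir, computes
perplexity-per-word over --num_eval_examples examples, and writes the result
as a summary to --eval_dir so it shows up alongside the training curves.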
20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import math 27 | import os.path 28 | import time 29 | 30 | 31 | import numpy as np 32 | import tensorflow as tf 33 | 34 | from im2txt import configuration 35 | from im2txt import show_and_tell_model 36 | 37 | FLAGS = tf.flags.FLAGS 38 | 39 | tf.flags.DEFINE_string("input_file_pattern", "", 40 | "File pattern of sharded TFRecord input files.") 41 | tf.flags.DEFINE_string("checkpoint_dir", "", 42 | "Directory containing model checkpoints.") 43 | tf.flags.DEFINE_string("eval_dir", "", "Directory to write event logs.") 44 | 45 | tf.flags.DEFINE_integer("eval_interval_secs", 600, 46 | "Interval between evaluation runs.") 47 | tf.flags.DEFINE_integer("num_eval_examples", 10132, 48 | "Number of examples for evaluation.") 49 | 50 | tf.flags.DEFINE_integer("min_global_step", 5000, 51 | "Minimum global step to run evaluation.") 52 | 53 | tf.logging.set_verbosity(tf.logging.INFO) 54 | 55 | 56 | def evaluate_model(sess, model, global_step, summary_writer, summary_op): 57 | """Computes perplexity-per-word over the evaluation dataset. 58 | 59 | Summaries and perplexity-per-word are written out to the eval directory. 60 | 61 | Args: 62 | sess: Session object. 63 | model: Instance of ShowAndTellModel; the model to evaluate. 64 | global_step: Integer; global step of the model checkpoint. 65 | summary_writer: Instance of FileWriter. 66 | summary_op: Op for generating model summaries. 67 | """ 68 | # Log model summaries on a single batch. 69 | summary_str = sess.run(summary_op) 70 | summary_writer.add_summary(summary_str, global_step) 71 | 72 | # Compute perplexity over the entire dataset. 73 | num_eval_batches = int( 74 | math.ceil(FLAGS.num_eval_examples / model.config.batch_size)) 75 | 76 | start_time = time.time() 77 | sum_losses = 0. 78 | sum_weights = 0. 79 | for i in xrange(num_eval_batches): 80 | cross_entropy_losses, weights = sess.run([ 81 | model.target_cross_entropy_losses, 82 | model.target_cross_entropy_loss_weights 83 | ]) 84 | sum_losses += np.sum(cross_entropy_losses * weights) 85 | sum_weights += np.sum(weights) 86 | if not i % 100: 87 | tf.logging.info("Computed losses for %d of %d batches.", i + 1, 88 | num_eval_batches) 89 | eval_time = time.time() - start_time 90 | 91 | perplexity = math.exp(sum_losses / sum_weights) 92 | tf.logging.info("Perplexity = %f (%.2g sec)", perplexity, eval_time) 93 | 94 | # Log perplexity to the FileWriter. 95 | summary = tf.Summary() 96 | value = summary.value.add() 97 | value.simple_value = perplexity 98 | value.tag = "Perplexity" 99 | summary_writer.add_summary(summary, global_step) 100 | 101 | # Write the Events file to the eval directory. 102 | summary_writer.flush() 103 | tf.logging.info("Finished processing evaluation at global step %d.", 104 | global_step) 105 | 106 | 107 | def run_once(model, saver, summary_writer, summary_op): 108 | """Evaluates the latest model checkpoint. 109 | 110 | Args: 111 | model: Instance of ShowAndTellModel; the model to evaluate. 112 | saver: Instance of tf.train.Saver for restoring model Variables. 113 | summary_writer: Instance of FileWriter. 114 | summary_op: Op for generating model summaries. 115 | """ 116 | model_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 117 | if not model_path: 118 | tf.logging.info("Skipping evaluation. 
No checkpoint found in: %s", 119 | FLAGS.checkpoint_dir) 120 | return 121 | 122 | with tf.Session() as sess: 123 | # Load model from checkpoint. 124 | tf.logging.info("Loading model from checkpoint: %s", model_path) 125 | saver.restore(sess, model_path) 126 | global_step = tf.train.global_step(sess, model.global_step.name) 127 | tf.logging.info("Successfully loaded %s at global step = %d.", 128 | os.path.basename(model_path), global_step) 129 | if global_step < FLAGS.min_global_step: 130 | tf.logging.info("Skipping evaluation. Global step = %d < %d", global_step, 131 | FLAGS.min_global_step) 132 | return 133 | 134 | # Start the queue runners. 135 | coord = tf.train.Coordinator() 136 | threads = tf.train.start_queue_runners(coord=coord) 137 | 138 | # Run evaluation on the latest checkpoint. 139 | try: 140 | evaluate_model( 141 | sess=sess, 142 | model=model, 143 | global_step=global_step, 144 | summary_writer=summary_writer, 145 | summary_op=summary_op) 146 | except Exception, e: # pylint: disable=broad-except 147 | tf.logging.error("Evaluation failed.") 148 | coord.request_stop(e) 149 | 150 | coord.request_stop() 151 | coord.join(threads, stop_grace_period_secs=10) 152 | 153 | 154 | def run(): 155 | """Runs evaluation in a loop, and logs summaries to TensorBoard.""" 156 | # Create the evaluation directory if it doesn't exist. 157 | eval_dir = FLAGS.eval_dir 158 | if not tf.gfile.IsDirectory(eval_dir): 159 | tf.logging.info("Creating eval directory: %s", eval_dir) 160 | tf.gfile.MakeDirs(eval_dir) 161 | 162 | g = tf.Graph() 163 | with g.as_default(): 164 | # Build the model for evaluation. 165 | model_config = configuration.ModelConfig() 166 | model_config.input_file_pattern = FLAGS.input_file_pattern 167 | model = show_and_tell_model.ShowAndTellModel(model_config, mode="eval") 168 | model.build() 169 | 170 | # Create the Saver to restore model Variables. 171 | saver = tf.train.Saver() 172 | 173 | # Create the summary operation and the summary writer. 174 | summary_op = tf.summary.merge_all() 175 | summary_writer = tf.summary.FileWriter(eval_dir) 176 | 177 | g.finalize() 178 | 179 | # Run a new evaluation run every eval_interval_secs. 
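  # Each pass records a start time, evaluates the latest checkpoint via
  # run_once(), and then sleeps for whatever remains of eval_interval_secs,
  # so successive evaluations stay roughly eval_interval_secs apart even if a
  # single evaluation takes several minutes.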
180 | while True: 181 | start = time.time() 182 | tf.logging.info("Starting evaluation at " + time.strftime( 183 | "%Y-%m-%d-%H:%M:%S", time.localtime())) 184 | run_once(model, saver, summary_writer, summary_op) 185 | time_to_next_eval = start + FLAGS.eval_interval_secs - time.time() 186 | if time_to_next_eval > 0: 187 | time.sleep(time_to_next_eval) 188 | 189 | 190 | def main(unused_argv): 191 | assert FLAGS.input_file_pattern, "--input_file_pattern is required" 192 | assert FLAGS.checkpoint_dir, "--checkpoint_dir is required" 193 | assert FLAGS.eval_dir, "--eval_dir is required" 194 | run() 195 | 196 | 197 | if __name__ == "__main__": 198 | tf.app.run() 199 | -------------------------------------------------------------------------------- /course_8_image2txt/inference_utils/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//im2txt:internal"]) 2 | 3 | licenses(["notice"]) # Apache 2.0 4 | 5 | exports_files(["LICENSE"]) 6 | 7 | py_library( 8 | name = "inference_wrapper_base", 9 | srcs = ["inference_wrapper_base.py"], 10 | srcs_version = "PY2AND3", 11 | ) 12 | 13 | py_library( 14 | name = "vocabulary", 15 | srcs = ["vocabulary.py"], 16 | srcs_version = "PY2AND3", 17 | ) 18 | 19 | py_library( 20 | name = "caption_generator", 21 | srcs = ["caption_generator.py"], 22 | srcs_version = "PY2AND3", 23 | ) 24 | 25 | py_test( 26 | name = "caption_generator_test", 27 | srcs = ["caption_generator_test.py"], 28 | deps = [ 29 | ":caption_generator", 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /course_8_image2txt/inference_utils/caption_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Class for generating captions from an image-to-text model.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import heapq 22 | import math 23 | 24 | 25 | import numpy as np 26 | 27 | 28 | class Caption(object): 29 | """Represents a complete or partial caption.""" 30 | 31 | def __init__(self, sentence, state, logprob, score, metadata=None): 32 | """Initializes the Caption. 33 | 34 | Args: 35 | sentence: List of word ids in the caption. 36 | state: Model state after generating the previous word. 37 | logprob: Log-probability of the caption. 38 | score: Score of the caption. 39 | metadata: Optional metadata associated with the partial sentence. If not 40 | None, a list of strings with the same length as 'sentence'. 
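
    A tiny illustration (the ids, state value and scores here are made up):

      cap = Caption(sentence=[0, 7, 1], state=lstm_state, logprob=-2.3,
                    score=-2.3, metadata=["", "a", ""])

    In beam search below, score is the accumulated logprob, optionally
    normalized by the caption length (see length_normalization_factor in
    CaptionGenerator).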
41 | """ 42 | self.sentence = sentence 43 | self.state = state 44 | self.logprob = logprob 45 | self.score = score 46 | self.metadata = metadata 47 | 48 | def __cmp__(self, other): 49 | """Compares Captions by score.""" 50 | assert isinstance(other, Caption) 51 | if self.score == other.score: 52 | return 0 53 | elif self.score < other.score: 54 | return -1 55 | else: 56 | return 1 57 | 58 | # For Python 3 compatibility (__cmp__ is deprecated). 59 | def __lt__(self, other): 60 | assert isinstance(other, Caption) 61 | return self.score < other.score 62 | 63 | # Also for Python 3 compatibility. 64 | def __eq__(self, other): 65 | assert isinstance(other, Caption) 66 | return self.score == other.score 67 | 68 | 69 | class TopN(object): 70 | """Maintains the top n elements of an incrementally provided set.""" 71 | 72 | def __init__(self, n): 73 | self._n = n 74 | self._data = [] 75 | 76 | def size(self): 77 | assert self._data is not None 78 | return len(self._data) 79 | 80 | def push(self, x): 81 | """Pushes a new element.""" 82 | assert self._data is not None 83 | if len(self._data) < self._n: 84 | heapq.heappush(self._data, x) 85 | else: 86 | heapq.heappushpop(self._data, x) 87 | 88 | def extract(self, sort=False): 89 | """Extracts all elements from the TopN. This is a destructive operation. 90 | 91 | The only method that can be called immediately after extract() is reset(). 92 | 93 | Args: 94 | sort: Whether to return the elements in descending sorted order. 95 | 96 | Returns: 97 | A list of data; the top n elements provided to the set. 98 | """ 99 | assert self._data is not None 100 | data = self._data 101 | self._data = None 102 | if sort: 103 | data.sort(reverse=True) 104 | return data 105 | 106 | def reset(self): 107 | """Returns the TopN to an empty state.""" 108 | self._data = [] 109 | 110 | 111 | class CaptionGenerator(object): 112 | """Class to generate captions from an image-to-text model.""" 113 | 114 | def __init__(self, 115 | model, 116 | vocab, 117 | beam_size=3, 118 | max_caption_length=20, 119 | length_normalization_factor=0.0): 120 | """Initializes the generator. 121 | 122 | Args: 123 | model: Object encapsulating a trained image-to-text model. Must have 124 | methods feed_image() and inference_step(). For example, an instance of 125 | InferenceWrapperBase. 126 | vocab: A Vocabulary object. 127 | beam_size: Beam size to use when generating captions. 128 | max_caption_length: The maximum caption length before stopping the search. 129 | length_normalization_factor: If != 0, a number x such that captions are 130 | scored by logprob/length^x, rather than logprob. This changes the 131 | relative scores of captions depending on their lengths. For example, if 132 | x > 0 then longer captions will be favored. 133 | """ 134 | self.vocab = vocab 135 | self.model = model 136 | 137 | self.beam_size = beam_size 138 | self.max_caption_length = max_caption_length 139 | self.length_normalization_factor = length_normalization_factor 140 | 141 | def beam_search(self, sess, encoded_image): 142 | """Runs beam search caption generation on a single image. 143 | 144 | Args: 145 | sess: TensorFlow Session object. 146 | encoded_image: An encoded image string. 147 | 148 | Returns: 149 | A list of Caption sorted by descending score. 150 | """ 151 | # Feed in the image to get the initial state. 
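    # Overview of the search below: partial_captions is a TopN heap of the
    # best beam_size in-progress sentences, and complete_captions collects
    # candidates that have produced vocab.end_id. Each step extends every
    # partial caption with its beam_size most probable next words and keeps
    # only the highest-scoring extensions.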
152 | initial_state = self.model.feed_image(sess, encoded_image) 153 | 154 | initial_beam = Caption( 155 | sentence=[self.vocab.start_id], 156 | state=initial_state[0], 157 | logprob=0.0, 158 | score=0.0, 159 | metadata=[""]) 160 | partial_captions = TopN(self.beam_size) 161 | partial_captions.push(initial_beam) 162 | complete_captions = TopN(self.beam_size) 163 | 164 | # Run beam search. 165 | for _ in range(self.max_caption_length - 1): 166 | partial_captions_list = partial_captions.extract() 167 | partial_captions.reset() 168 | input_feed = np.array([c.sentence[-1] for c in partial_captions_list]) 169 | state_feed = np.array([c.state for c in partial_captions_list]) 170 | 171 | softmax, new_states, metadata = self.model.inference_step(sess, 172 | input_feed, 173 | state_feed) 174 | 175 | for i, partial_caption in enumerate(partial_captions_list): 176 | word_probabilities = softmax[i] 177 | state = new_states[i] 178 | # For this partial caption, get the beam_size most probable next words. 179 | words_and_probs = list(enumerate(word_probabilities)) 180 | words_and_probs.sort(key=lambda x: -x[1]) 181 | words_and_probs = words_and_probs[0:self.beam_size] 182 | # Each next word gives a new partial caption. 183 | for w, p in words_and_probs: 184 | if p < 1e-12: 185 | continue # Avoid log(0). 186 | sentence = partial_caption.sentence + [w] 187 | logprob = partial_caption.logprob + math.log(p) 188 | score = logprob 189 | if metadata: 190 | metadata_list = partial_caption.metadata + [metadata[i]] 191 | else: 192 | metadata_list = None 193 | if w == self.vocab.end_id: 194 | if self.length_normalization_factor > 0: 195 | score /= len(sentence)**self.length_normalization_factor 196 | beam = Caption(sentence, state, logprob, score, metadata_list) 197 | complete_captions.push(beam) 198 | else: 199 | beam = Caption(sentence, state, logprob, score, metadata_list) 200 | partial_captions.push(beam) 201 | if partial_captions.size() == 0: 202 | # We have run out of partial candidates; happens when beam_size = 1. 203 | break 204 | 205 | # If we have no complete captions then fall back to the partial captions. 206 | # But never output a mixture of complete and partial captions because a 207 | # partial caption could have a higher score than all the complete captions. 208 | if not complete_captions.size(): 209 | complete_captions = partial_captions 210 | 211 | return complete_captions.extract(sort=True) 212 | -------------------------------------------------------------------------------- /course_8_image2txt/inference_utils/caption_generator_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Unit tests for CaptionGenerator.""" 16 | 17 | import math 18 | 19 | 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | 24 | from im2txt.inference_utils import caption_generator 25 | 26 | 27 | class FakeVocab(object): 28 | """Fake Vocabulary for testing purposes.""" 29 | 30 | def __init__(self): 31 | self.start_id = 0 # Word id denoting sentence start. 32 | self.end_id = 1 # Word id denoting sentence end. 33 | 34 | 35 | class FakeModel(object): 36 | """Fake model for testing purposes.""" 37 | 38 | def __init__(self): 39 | # Number of words in the vocab. 40 | self._vocab_size = 12 41 | 42 | # Dimensionality of the nominal model state. 43 | self._state_size = 1 44 | 45 | # Map of previous word to the probability distribution of the next word. 46 | self._probabilities = { 47 | 0: {1: 0.1, 48 | 2: 0.2, 49 | 3: 0.3, 50 | 4: 0.4}, 51 | 2: {5: 0.1, 52 | 6: 0.9}, 53 | 3: {1: 0.1, 54 | 7: 0.4, 55 | 8: 0.5}, 56 | 4: {1: 0.3, 57 | 9: 0.3, 58 | 10: 0.4}, 59 | 5: {1: 1.0}, 60 | 6: {1: 1.0}, 61 | 7: {1: 1.0}, 62 | 8: {1: 1.0}, 63 | 9: {1: 0.5, 64 | 11: 0.5}, 65 | 10: {1: 1.0}, 66 | 11: {1: 1.0}, 67 | } 68 | 69 | # pylint: disable=unused-argument 70 | 71 | def feed_image(self, sess, encoded_image): 72 | # Return a nominal model state. 73 | return np.zeros([1, self._state_size]) 74 | 75 | def inference_step(self, sess, input_feed, state_feed): 76 | # Compute the matrix of softmax distributions for the next batch of words. 77 | batch_size = input_feed.shape[0] 78 | softmax_output = np.zeros([batch_size, self._vocab_size]) 79 | for batch_index, word_id in enumerate(input_feed): 80 | for next_word, probability in self._probabilities[word_id].items(): 81 | softmax_output[batch_index, next_word] = probability 82 | 83 | # Nominal state and metadata. 84 | new_state = np.zeros([batch_size, self._state_size]) 85 | metadata = None 86 | 87 | return softmax_output, new_state, metadata 88 | 89 | # pylint: enable=unused-argument 90 | 91 | 92 | class CaptionGeneratorTest(tf.test.TestCase): 93 | 94 | def _assertExpectedCaptions(self, 95 | expected_captions, 96 | beam_size=3, 97 | max_caption_length=20, 98 | length_normalization_factor=0): 99 | """Tests that beam search generates the expected captions. 100 | 101 | Args: 102 | expected_captions: A sequence of pairs (sentence, probability), where 103 | sentence is a list of integer ids and probability is a float in [0, 1]. 104 | beam_size: Parameter passed to beam_search(). 105 | max_caption_length: Parameter passed to beam_search(). 106 | length_normalization_factor: Parameter passed to beam_search(). 107 | """ 108 | expected_sentences = [c[0] for c in expected_captions] 109 | expected_probabilities = [c[1] for c in expected_captions] 110 | 111 | # Generate captions. 112 | generator = caption_generator.CaptionGenerator( 113 | model=FakeModel(), 114 | vocab=FakeVocab(), 115 | beam_size=beam_size, 116 | max_caption_length=max_caption_length, 117 | length_normalization_factor=length_normalization_factor) 118 | actual_captions = generator.beam_search(sess=None, encoded_image=None) 119 | 120 | actual_sentences = [c.sentence for c in actual_captions] 121 | actual_probabilities = [math.exp(c.logprob) for c in actual_captions] 122 | 123 | self.assertEqual(expected_sentences, actual_sentences) 124 | self.assertAllClose(expected_probabilities, actual_probabilities) 125 | 126 | def testBeamSize(self): 127 | # Beam size = 1. 
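    # With beam_size=1 the search greedily follows the single most probable
    # next word in FakeModel's transition table: 0 -> 4 (p=0.4), then
    # 4 -> 10 (p=0.4), then 10 -> 1 (p=1.0, the end id), giving the caption
    # [0, 4, 10, 1] with probability 0.4 * 0.4 * 1.0 = 0.16.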
128 | expected = [([0, 4, 10, 1], 0.16)] 129 | self._assertExpectedCaptions(expected, beam_size=1) 130 | 131 | # Beam size = 2. 132 | expected = [([0, 4, 10, 1], 0.16), ([0, 3, 8, 1], 0.15)] 133 | self._assertExpectedCaptions(expected, beam_size=2) 134 | 135 | # Beam size = 3. 136 | expected = [ 137 | ([0, 2, 6, 1], 0.18), ([0, 4, 10, 1], 0.16), ([0, 3, 8, 1], 0.15) 138 | ] 139 | self._assertExpectedCaptions(expected, beam_size=3) 140 | 141 | def testMaxLength(self): 142 | # Max length = 1. 143 | expected = [([0], 1.0)] 144 | self._assertExpectedCaptions(expected, max_caption_length=1) 145 | 146 | # Max length = 2. 147 | # There are no complete sentences, so partial sentences are returned. 148 | expected = [([0, 4], 0.4), ([0, 3], 0.3), ([0, 2], 0.2)] 149 | self._assertExpectedCaptions(expected, max_caption_length=2) 150 | 151 | # Max length = 3. 152 | # There is at least one complete sentence, so only complete sentences are 153 | # returned. 154 | expected = [([0, 4, 1], 0.12), ([0, 3, 1], 0.03)] 155 | self._assertExpectedCaptions(expected, max_caption_length=3) 156 | 157 | # Max length = 4. 158 | expected = [ 159 | ([0, 2, 6, 1], 0.18), ([0, 4, 10, 1], 0.16), ([0, 3, 8, 1], 0.15) 160 | ] 161 | self._assertExpectedCaptions(expected, max_caption_length=4) 162 | 163 | def testLengthNormalization(self): 164 | # Length normalization factor = 3. 165 | # The longest caption is returned first, despite having low probability, 166 | # because it has the highest log(probability)/length**3. 167 | expected = [ 168 | ([0, 4, 9, 11, 1], 0.06), 169 | ([0, 2, 6, 1], 0.18), 170 | ([0, 4, 10, 1], 0.16), 171 | ([0, 3, 8, 1], 0.15), 172 | ] 173 | self._assertExpectedCaptions( 174 | expected, beam_size=4, length_normalization_factor=3) 175 | 176 | 177 | if __name__ == '__main__': 178 | tf.test.main() 179 | -------------------------------------------------------------------------------- /course_8_image2txt/inference_utils/inference_wrapper_base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Base wrapper class for performing inference with an image-to-text model. 16 | 17 | Subclasses must implement the following methods: 18 | 19 | build_model(): 20 | Builds the model for inference and returns the model object. 21 | 22 | feed_image(): 23 | Takes an encoded image and returns the initial model state, where "state" 24 | is a numpy array whose specifics are defined by the subclass, e.g. 25 | concatenated LSTM state. It's assumed that feed_image() will be called 26 | precisely once at the start of inference for each image. Subclasses may 27 | compute and/or save per-image internal context in this method. 28 | 29 | inference_step(): 30 | Takes a batch of inputs and states at a single time-step. 
Returns the 31 | softmax output corresponding to the inputs, and the new states of the batch. 32 | Optionally also returns metadata about the current inference step, e.g. a 33 | serialized numpy array containing activations from a particular model layer. 34 | 35 | Client usage: 36 | 1. Build the model inference graph via build_graph_from_config() or 37 | build_graph_from_proto(). 38 | 2. Call the resulting restore_fn to load the model checkpoint. 39 | 3. For each image in a batch of images: 40 | a) Call feed_image() once to get the initial state. 41 | b) For each step of caption generation, call inference_step(). 42 | """ 43 | 44 | from __future__ import absolute_import 45 | from __future__ import division 46 | from __future__ import print_function 47 | 48 | import os.path 49 | 50 | 51 | import tensorflow as tf 52 | 53 | # pylint: disable=unused-argument 54 | 55 | 56 | class InferenceWrapperBase(object): 57 | """Base wrapper class for performing inference with an image-to-text model.""" 58 | 59 | def __init__(self): 60 | pass 61 | 62 | def build_model(self, model_config): 63 | """Builds the model for inference. 64 | 65 | Args: 66 | model_config: Object containing configuration for building the model. 67 | 68 | Returns: 69 | model: The model object. 70 | """ 71 | tf.logging.fatal("Please implement build_model in subclass") 72 | 73 | def _create_restore_fn(self, checkpoint_path, saver): 74 | """Creates a function that restores a model from checkpoint. 75 | 76 | Args: 77 | checkpoint_path: Checkpoint file or a directory containing a checkpoint 78 | file. 79 | saver: Saver for restoring variables from the checkpoint file. 80 | 81 | Returns: 82 | restore_fn: A function such that restore_fn(sess) loads model variables 83 | from the checkpoint file. 84 | 85 | Raises: 86 | ValueError: If checkpoint_path does not refer to a checkpoint file or a 87 | directory containing a checkpoint file. 88 | """ 89 | if tf.gfile.IsDirectory(checkpoint_path): 90 | checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) 91 | if not checkpoint_path: 92 | raise ValueError("No checkpoint file found in: %s" % checkpoint_path) 93 | 94 | def _restore_fn(sess): 95 | tf.logging.info("Loading model from checkpoint: %s", checkpoint_path) 96 | saver.restore(sess, checkpoint_path) 97 | tf.logging.info("Successfully loaded checkpoint: %s", 98 | os.path.basename(checkpoint_path)) 99 | 100 | return _restore_fn 101 | 102 | def build_graph_from_config(self, model_config, checkpoint_path): 103 | """Builds the inference graph from a configuration object. 104 | 105 | Args: 106 | model_config: Object containing configuration for building the model. 107 | checkpoint_path: Checkpoint file or a directory containing a checkpoint 108 | file. 109 | 110 | Returns: 111 | restore_fn: A function such that restore_fn(sess) loads model variables 112 | from the checkpoint file. 113 | """ 114 | tf.logging.info("Building model.") 115 | self.build_model(model_config) 116 | saver = tf.train.Saver() 117 | 118 | return self._create_restore_fn(checkpoint_path, saver) 119 | 120 | def build_graph_from_proto(self, graph_def_file, saver_def_file, 121 | checkpoint_path): 122 | """Builds the inference graph from serialized GraphDef and SaverDef protos. 123 | 124 | Args: 125 | graph_def_file: File containing a serialized GraphDef proto. 126 | saver_def_file: File containing a serialized SaverDef proto. 127 | checkpoint_path: Checkpoint file or a directory containing a checkpoint 128 | file. 
129 | 130 | Returns: 131 | restore_fn: A function such that restore_fn(sess) loads model variables 132 | from the checkpoint file. 133 | """ 134 | # Load the Graph. 135 | tf.logging.info("Loading GraphDef from file: %s", graph_def_file) 136 | graph_def = tf.GraphDef() 137 | with tf.gfile.FastGFile(graph_def_file, "rb") as f: 138 | graph_def.ParseFromString(f.read()) 139 | tf.import_graph_def(graph_def, name="") 140 | 141 | # Load the Saver. 142 | tf.logging.info("Loading SaverDef from file: %s", saver_def_file) 143 | saver_def = tf.train.SaverDef() 144 | with tf.gfile.FastGFile(saver_def_file, "rb") as f: 145 | saver_def.ParseFromString(f.read()) 146 | saver = tf.train.Saver(saver_def=saver_def) 147 | 148 | return self._create_restore_fn(checkpoint_path, saver) 149 | 150 | def feed_image(self, sess, encoded_image): 151 | """Feeds an image and returns the initial model state. 152 | 153 | See comments at the top of file. 154 | 155 | Args: 156 | sess: TensorFlow Session object. 157 | encoded_image: An encoded image string. 158 | 159 | Returns: 160 | state: A numpy array of shape [1, state_size]. 161 | """ 162 | tf.logging.fatal("Please implement feed_image in subclass") 163 | 164 | def inference_step(self, sess, input_feed, state_feed): 165 | """Runs one step of inference. 166 | 167 | Args: 168 | sess: TensorFlow Session object. 169 | input_feed: A numpy array of shape [batch_size]. 170 | state_feed: A numpy array of shape [batch_size, state_size]. 171 | 172 | Returns: 173 | softmax_output: A numpy array of shape [batch_size, vocab_size]. 174 | new_state: A numpy array of shape [batch_size, state_size]. 175 | metadata: Optional. If not None, a string containing metadata about the 176 | current inference step (e.g. serialized numpy array containing 177 | activations from a particular model layer.). 178 | """ 179 | tf.logging.fatal("Please implement inference_step in subclass") 180 | 181 | # pylint: enable=unused-argument 182 | -------------------------------------------------------------------------------- /course_8_image2txt/inference_utils/vocabulary.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Vocabulary class for an image-to-text model.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | import tensorflow as tf 23 | 24 | 25 | class Vocabulary(object): 26 | """Vocabulary class for an image-to-text model.""" 27 | 28 | def __init__(self, 29 | vocab_file, 30 | start_word="", 31 | end_word="", 32 | unk_word=""): 33 | """Initializes the vocabulary. 
34 | 35 | Args: 36 | vocab_file: File containing the vocabulary, where the words are the first 37 | whitespace-separated token on each line (other tokens are ignored) and 38 | the word ids are the corresponding line numbers. 39 | start_word: Special word denoting sentence start. 40 | end_word: Special word denoting sentence end. 41 | unk_word: Special word denoting unknown words. 42 | """ 43 | if not tf.gfile.Exists(vocab_file): 44 | tf.logging.fatal("Vocab file %s not found.", vocab_file) 45 | tf.logging.info("Initializing vocabulary from file: %s", vocab_file) 46 | 47 | with tf.gfile.GFile(vocab_file, mode="r") as f: 48 | reverse_vocab = list(f.readlines()) 49 | reverse_vocab = [line.split()[0] for line in reverse_vocab] 50 | assert start_word in reverse_vocab 51 | assert end_word in reverse_vocab 52 | if unk_word not in reverse_vocab: 53 | reverse_vocab.append(unk_word) 54 | vocab = dict([(x, y) for (y, x) in enumerate(reverse_vocab)]) 55 | 56 | tf.logging.info("Created vocabulary with %d words" % len(vocab)) 57 | 58 | self.vocab = vocab # vocab[word] = id 59 | self.reverse_vocab = reverse_vocab # reverse_vocab[id] = word 60 | 61 | # Save special word ids. 62 | self.start_id = vocab[start_word] 63 | self.end_id = vocab[end_word] 64 | self.unk_id = vocab[unk_word] 65 | 66 | def word_to_id(self, word): 67 | """Returns the integer word id of a word string.""" 68 | if word in self.vocab: 69 | return self.vocab[word] 70 | else: 71 | return self.unk_id 72 | 73 | def id_to_word(self, word_id): 74 | """Returns the word string of an integer word id.""" 75 | if word_id >= len(self.reverse_vocab): 76 | return self.reverse_vocab[self.unk_id] 77 | else: 78 | return self.reverse_vocab[word_id] 79 | -------------------------------------------------------------------------------- /course_8_image2txt/inference_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Model wrapper class for performing inference with a ShowAndTellModel.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | 24 | from im2txt import show_and_tell_model 25 | from im2txt.inference_utils import inference_wrapper_base 26 | 27 | 28 | class InferenceWrapper(inference_wrapper_base.InferenceWrapperBase): 29 | """Model wrapper class for performing inference with a ShowAndTellModel.""" 30 | 31 | def __init__(self): 32 | super(InferenceWrapper, self).__init__() 33 | 34 | def build_model(self, model_config): 35 | model = show_and_tell_model.ShowAndTellModel(model_config, mode="inference") 36 | model.build() 37 | return model 38 | 39 | def feed_image(self, sess, encoded_image): 40 | initial_state = sess.run(fetches="lstm/initial_state:0", 41 | feed_dict={"image_feed:0": encoded_image}) 42 | return initial_state 43 | 44 | def inference_step(self, sess, input_feed, state_feed): 45 | softmax_output, state_output = sess.run( 46 | fetches=["softmax:0", "lstm/state:0"], 47 | feed_dict={ 48 | "input_feed:0": input_feed, 49 | "lstm/state_feed:0": state_feed, 50 | }) 51 | return softmax_output, state_output, None 52 | -------------------------------------------------------------------------------- /course_8_image2txt/ops/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//im2txt:internal"]) 2 | 3 | licenses(["notice"]) # Apache 2.0 4 | 5 | exports_files(["LICENSE"]) 6 | 7 | py_library( 8 | name = "image_processing", 9 | srcs = ["image_processing.py"], 10 | srcs_version = "PY2AND3", 11 | ) 12 | 13 | py_library( 14 | name = "image_embedding", 15 | srcs = ["image_embedding.py"], 16 | srcs_version = "PY2AND3", 17 | ) 18 | 19 | py_test( 20 | name = "image_embedding_test", 21 | size = "small", 22 | srcs = ["image_embedding_test.py"], 23 | deps = [ 24 | ":image_embedding", 25 | ], 26 | ) 27 | 28 | py_library( 29 | name = "inputs", 30 | srcs = ["inputs.py"], 31 | srcs_version = "PY2AND3", 32 | ) 33 | -------------------------------------------------------------------------------- /course_8_image2txt/ops/image_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Image embedding ops.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | from tensorflow.contrib.slim.python.slim.nets.inception_v3 import inception_v3_base 26 | 27 | slim = tf.contrib.slim 28 | 29 | 30 | def inception_v3(images, 31 | trainable=True, 32 | is_training=True, 33 | weight_decay=0.00004, 34 | stddev=0.1, 35 | dropout_keep_prob=0.8, 36 | use_batch_norm=True, 37 | batch_norm_params=None, 38 | add_summaries=True, 39 | scope="InceptionV3"): 40 | """Builds an Inception V3 subgraph for image embeddings. 41 | 42 | Args: 43 | images: A float32 Tensor of shape [batch, height, width, channels]. 44 | trainable: Whether the inception submodel should be trainable or not. 45 | is_training: Boolean indicating training mode or not. 46 | weight_decay: Coefficient for weight regularization. 47 | stddev: The standard deviation of the trunctated normal weight initializer. 48 | dropout_keep_prob: Dropout keep probability. 49 | use_batch_norm: Whether to use batch normalization. 50 | batch_norm_params: Parameters for batch normalization. See 51 | tf.contrib.layers.batch_norm for details. 52 | add_summaries: Whether to add activation summaries. 53 | scope: Optional Variable scope. 54 | 55 | Returns: 56 | end_points: A dictionary of activations from inception_v3 layers. 57 | """ 58 | # Only consider the inception model to be in training mode if it's trainable. 59 | is_inception_model_training = trainable and is_training 60 | 61 | if use_batch_norm: 62 | # Default parameters for batch normalization. 63 | if not batch_norm_params: 64 | batch_norm_params = { 65 | "is_training": is_inception_model_training, 66 | "trainable": trainable, 67 | # Decay for the moving averages. 68 | "decay": 0.9997, 69 | # Epsilon to prevent 0s in variance. 70 | "epsilon": 0.001, 71 | # Collection containing the moving mean and moving variance. 72 | "variables_collections": { 73 | "beta": None, 74 | "gamma": None, 75 | "moving_mean": ["moving_vars"], 76 | "moving_variance": ["moving_vars"], 77 | } 78 | } 79 | else: 80 | batch_norm_params = None 81 | 82 | if trainable: 83 | weights_regularizer = tf.contrib.layers.l2_regularizer(weight_decay) 84 | else: 85 | weights_regularizer = None 86 | 87 | with tf.variable_scope(scope, "InceptionV3", [images]) as scope: 88 | with slim.arg_scope( 89 | [slim.conv2d, slim.fully_connected], 90 | weights_regularizer=weights_regularizer, 91 | trainable=trainable): 92 | with slim.arg_scope( 93 | [slim.conv2d], 94 | weights_initializer=tf.truncated_normal_initializer(stddev=stddev), 95 | activation_fn=tf.nn.relu, 96 | normalizer_fn=slim.batch_norm, 97 | normalizer_params=batch_norm_params): 98 | net, end_points = inception_v3_base(images, scope=scope) 99 | with tf.variable_scope("logits"): 100 | shape = net.get_shape() 101 | net = slim.avg_pool2d(net, shape[1:3], padding="VALID", scope="pool") 102 | net = slim.dropout( 103 | net, 104 | keep_prob=dropout_keep_prob, 105 | is_training=is_inception_model_training, 106 | scope="dropout") 107 | net = slim.flatten(net, scope="flatten") 108 | 109 | # Add summaries. 
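  # When add_summaries is set, one activation summary is emitted per Inception
  # end point (e.g. "Mixed_7c"), which makes it easy to spot dead or saturated
  # layers in TensorBoard while the captioning model trains.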
110 | if add_summaries: 111 | for v in end_points.values(): 112 | tf.contrib.layers.summaries.summarize_activation(v) 113 | 114 | return net 115 | -------------------------------------------------------------------------------- /course_8_image2txt/ops/image_embedding_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for tensorflow_models.im2txt.ops.image_embedding.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | from im2txt.ops import image_embedding 26 | 27 | 28 | class InceptionV3Test(tf.test.TestCase): 29 | 30 | def setUp(self): 31 | super(InceptionV3Test, self).setUp() 32 | 33 | batch_size = 4 34 | height = 299 35 | width = 299 36 | num_channels = 3 37 | self._images = tf.placeholder(tf.float32, 38 | [batch_size, height, width, num_channels]) 39 | self._batch_size = batch_size 40 | 41 | def _countInceptionParameters(self): 42 | """Counts the number of parameters in the inception model at top scope.""" 43 | counter = {} 44 | for v in tf.global_variables(): 45 | name_tokens = v.op.name.split("/") 46 | if name_tokens[0] == "InceptionV3": 47 | name = "InceptionV3/" + name_tokens[1] 48 | num_params = v.get_shape().num_elements() 49 | assert num_params 50 | counter[name] = counter.get(name, 0) + num_params 51 | return counter 52 | 53 | def _verifyParameterCounts(self): 54 | """Verifies the number of parameters in the inception model.""" 55 | param_counts = self._countInceptionParameters() 56 | expected_param_counts = { 57 | "InceptionV3/Conv2d_1a_3x3": 960, 58 | "InceptionV3/Conv2d_2a_3x3": 9312, 59 | "InceptionV3/Conv2d_2b_3x3": 18624, 60 | "InceptionV3/Conv2d_3b_1x1": 5360, 61 | "InceptionV3/Conv2d_4a_3x3": 138816, 62 | "InceptionV3/Mixed_5b": 256368, 63 | "InceptionV3/Mixed_5c": 277968, 64 | "InceptionV3/Mixed_5d": 285648, 65 | "InceptionV3/Mixed_6a": 1153920, 66 | "InceptionV3/Mixed_6b": 1298944, 67 | "InceptionV3/Mixed_6c": 1692736, 68 | "InceptionV3/Mixed_6d": 1692736, 69 | "InceptionV3/Mixed_6e": 2143872, 70 | "InceptionV3/Mixed_7a": 1699584, 71 | "InceptionV3/Mixed_7b": 5047872, 72 | "InceptionV3/Mixed_7c": 6080064, 73 | } 74 | self.assertDictEqual(expected_param_counts, param_counts) 75 | 76 | def _assertCollectionSize(self, expected_size, collection): 77 | actual_size = len(tf.get_collection(collection)) 78 | if expected_size != actual_size: 79 | self.fail("Found %d items in collection %s (expected %d)." 
% 80 | (actual_size, collection, expected_size)) 81 | 82 | def testTrainableTrueIsTrainingTrue(self): 83 | embeddings = image_embedding.inception_v3( 84 | self._images, trainable=True, is_training=True) 85 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 86 | 87 | self._verifyParameterCounts() 88 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 89 | self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES) 90 | self._assertCollectionSize(188, tf.GraphKeys.UPDATE_OPS) 91 | self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES) 92 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 93 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 94 | 95 | def testTrainableTrueIsTrainingFalse(self): 96 | embeddings = image_embedding.inception_v3( 97 | self._images, trainable=True, is_training=False) 98 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 99 | 100 | self._verifyParameterCounts() 101 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 102 | self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES) 103 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS) 104 | self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES) 105 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 106 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 107 | 108 | def testTrainableFalseIsTrainingTrue(self): 109 | embeddings = image_embedding.inception_v3( 110 | self._images, trainable=False, is_training=True) 111 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 112 | 113 | self._verifyParameterCounts() 114 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 115 | self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES) 116 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS) 117 | self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES) 118 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 119 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 120 | 121 | def testTrainableFalseIsTrainingFalse(self): 122 | embeddings = image_embedding.inception_v3( 123 | self._images, trainable=False, is_training=False) 124 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 125 | 126 | self._verifyParameterCounts() 127 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 128 | self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES) 129 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS) 130 | self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES) 131 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 132 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 133 | 134 | 135 | if __name__ == "__main__": 136 | tf.test.main() 137 | -------------------------------------------------------------------------------- /course_8_image2txt/ops/image_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Helper functions for image preprocessing.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | 26 | def distort_image(image, thread_id): 27 | """Perform random distortions on an image. 28 | 29 | Args: 30 | image: A float32 Tensor of shape [height, width, 3] with values in [0, 1). 31 | thread_id: Preprocessing thread id used to select the ordering of color 32 | distortions. There should be a multiple of 2 preprocessing threads. 33 | 34 | Returns: 35 | distorted_image: A float32 Tensor of shape [height, width, 3] with values in 36 | [0, 1]. 37 | """ 38 | # Randomly flip horizontally. 39 | with tf.name_scope("flip_horizontal", values=[image]): 40 | image = tf.image.random_flip_left_right(image) 41 | 42 | # Randomly distort the colors based on thread id. 43 | color_ordering = thread_id % 2 44 | with tf.name_scope("distort_color", values=[image]): 45 | if color_ordering == 0: 46 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 47 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 48 | image = tf.image.random_hue(image, max_delta=0.032) 49 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 50 | elif color_ordering == 1: 51 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 52 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 53 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 54 | image = tf.image.random_hue(image, max_delta=0.032) 55 | 56 | # The random_* ops do not necessarily clamp. 57 | image = tf.clip_by_value(image, 0.0, 1.0) 58 | 59 | return image 60 | 61 | 62 | def process_image(encoded_image, 63 | is_training, 64 | height, 65 | width, 66 | resize_height=346, 67 | resize_width=346, 68 | thread_id=0, 69 | image_format="jpeg"): 70 | """Decode an image, resize and apply random distortions. 71 | 72 | In training, images are distorted slightly differently depending on thread_id. 73 | 74 | Args: 75 | encoded_image: String Tensor containing the image. 76 | is_training: Boolean; whether preprocessing for training or eval. 77 | height: Height of the output image. 78 | width: Width of the output image. 79 | resize_height: If > 0, resize height before crop to final dimensions. 80 | resize_width: If > 0, resize width before crop to final dimensions. 81 | thread_id: Preprocessing thread id used to select the ordering of color 82 | distortions. There should be a multiple of 2 preprocessing threads. 83 | image_format: "jpeg" or "png". 84 | 85 | Returns: 86 | A float32 Tensor of shape [height, width, 3] with values in [-1, 1]. 87 | 88 | Raises: 89 | ValueError: If image_format is invalid. 90 | """ 91 | # Helper function to log an image summary to the visualizer. Summaries are 92 | # only logged in thread 0. 
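  # Gating on thread_id below means only preprocessing thread 0 writes these
  # image summaries; otherwise every one of the parallel preprocessing threads
  # would log its own copy of the original/resized/final images.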
93 | def image_summary(name, image): 94 | if not thread_id: 95 | tf.summary.image(name, tf.expand_dims(image, 0)) 96 | 97 | # Decode image into a float32 Tensor of shape [?, ?, 3] with values in [0, 1). 98 | with tf.name_scope("decode", values=[encoded_image]): 99 | if image_format == "jpeg": 100 | image = tf.image.decode_jpeg(encoded_image, channels=3) 101 | elif image_format == "png": 102 | image = tf.image.decode_png(encoded_image, channels=3) 103 | else: 104 | raise ValueError("Invalid image format: %s" % image_format) 105 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 106 | image_summary("original_image", image) 107 | 108 | # Resize image. 109 | assert (resize_height > 0) == (resize_width > 0) 110 | if resize_height: 111 | image = tf.image.resize_images(image, 112 | size=[resize_height, resize_width], 113 | method=tf.image.ResizeMethod.BILINEAR) 114 | 115 | # Crop to final dimensions. 116 | if is_training: 117 | image = tf.random_crop(image, [height, width, 3]) 118 | else: 119 | # Central crop, assuming resize_height > height, resize_width > width. 120 | image = tf.image.resize_image_with_crop_or_pad(image, height, width) 121 | 122 | image_summary("resized_image", image) 123 | 124 | # Randomly distort the image. 125 | if is_training: 126 | image = distort_image(image, thread_id) 127 | 128 | image_summary("final_image", image) 129 | 130 | # Rescale to [-1,1] instead of [0, 1] 131 | image = tf.subtract(image, 0.5) 132 | image = tf.multiply(image, 2.0) 133 | return image 134 | -------------------------------------------------------------------------------- /course_8_image2txt/ops/inputs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Input ops.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | 26 | def parse_sequence_example(serialized, image_feature, caption_feature): 27 | """Parses a tensorflow.SequenceExample into an image and caption. 28 | 29 | Args: 30 | serialized: A scalar string Tensor; a single serialized SequenceExample. 31 | image_feature: Name of SequenceExample context feature containing image 32 | data. 33 | caption_feature: Name of SequenceExample feature list containing integer 34 | captions. 35 | 36 | Returns: 37 | encoded_image: A scalar string Tensor containing a JPEG encoded image. 38 | caption: A 1-D uint64 Tensor with dynamically specified length. 
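
  With the default feature names from configuration.ModelConfig this is
  typically called as (shown for illustration only):

    encoded_image, caption = parse_sequence_example(
        serialized,
        image_feature="image/data",
        caption_feature="image/caption_ids")

  where "image/data" holds the encoded image bytes and "image/caption_ids"
  holds the integer word ids written by the MSCOCO preprocessing step.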
39 | """ 40 | context, sequence = tf.parse_single_sequence_example( 41 | serialized, 42 | context_features={ 43 | image_feature: tf.FixedLenFeature([], dtype=tf.string) 44 | }, 45 | sequence_features={ 46 | caption_feature: tf.FixedLenSequenceFeature([], dtype=tf.int64), 47 | }) 48 | 49 | encoded_image = context[image_feature] 50 | caption = sequence[caption_feature] 51 | return encoded_image, caption 52 | 53 | 54 | def prefetch_input_data(reader, 55 | file_pattern, 56 | is_training, 57 | batch_size, 58 | values_per_shard, 59 | input_queue_capacity_factor=16, 60 | num_reader_threads=1, 61 | shard_queue_name="filename_queue", 62 | value_queue_name="input_queue"): 63 | """Prefetches string values from disk into an input queue. 64 | 65 | In training the capacity of the queue is important because a larger queue 66 | means better mixing of training examples between shards. The minimum number of 67 | values kept in the queue is values_per_shard * input_queue_capacity_factor, 68 | where input_queue_memory factor should be chosen to trade-off better mixing 69 | with memory usage. 70 | 71 | Args: 72 | reader: Instance of tf.ReaderBase. 73 | file_pattern: Comma-separated list of file patterns (e.g. 74 | /tmp/train_data-?????-of-00100). 75 | is_training: Boolean; whether prefetching for training or eval. 76 | batch_size: Model batch size used to determine queue capacity. 77 | values_per_shard: Approximate number of values per shard. 78 | input_queue_capacity_factor: Minimum number of values to keep in the queue 79 | in multiples of values_per_shard. See comments above. 80 | num_reader_threads: Number of reader threads to fill the queue. 81 | shard_queue_name: Name for the shards filename queue. 82 | value_queue_name: Name for the values input queue. 83 | 84 | Returns: 85 | A Queue containing prefetched string values. 86 | """ 87 | data_files = [] 88 | for pattern in file_pattern.split(","): 89 | data_files.extend(tf.gfile.Glob(pattern)) 90 | if not data_files: 91 | tf.logging.fatal("Found no input files matching %s", file_pattern) 92 | else: 93 | tf.logging.info("Prefetching values from %d files matching %s", 94 | len(data_files), file_pattern) 95 | 96 | if is_training: 97 | filename_queue = tf.train.string_input_producer( 98 | data_files, shuffle=True, capacity=16, name=shard_queue_name) 99 | min_queue_examples = values_per_shard * input_queue_capacity_factor 100 | capacity = min_queue_examples + 100 * batch_size 101 | values_queue = tf.RandomShuffleQueue( 102 | capacity=capacity, 103 | min_after_dequeue=min_queue_examples, 104 | dtypes=[tf.string], 105 | name="random_" + value_queue_name) 106 | else: 107 | filename_queue = tf.train.string_input_producer( 108 | data_files, shuffle=False, capacity=1, name=shard_queue_name) 109 | capacity = values_per_shard + 3 * batch_size 110 | values_queue = tf.FIFOQueue( 111 | capacity=capacity, dtypes=[tf.string], name="fifo_" + value_queue_name) 112 | 113 | enqueue_ops = [] 114 | for _ in range(num_reader_threads): 115 | _, value = reader.read(filename_queue) 116 | enqueue_ops.append(values_queue.enqueue([value])) 117 | tf.train.queue_runner.add_queue_runner(tf.train.queue_runner.QueueRunner( 118 | values_queue, enqueue_ops)) 119 | tf.summary.scalar( 120 | "queue/%s/fraction_of_%d_full" % (values_queue.name, capacity), 121 | tf.cast(values_queue.size(), tf.float32) * (1. 
/ capacity)) 122 | 123 | return values_queue 124 | 125 | 126 | def batch_with_dynamic_pad(images_and_captions, 127 | batch_size, 128 | queue_capacity, 129 | add_summaries=True): 130 | """Batches input images and captions. 131 | 132 | This function splits the caption into an input sequence and a target sequence, 133 | where the target sequence is the input sequence right-shifted by 1. Input and 134 | target sequences are batched and padded up to the maximum length of sequences 135 | in the batch. A mask is created to distinguish real words from padding words. 136 | 137 | Example: 138 | Actual captions in the batch ('-' denotes padded character): 139 | [ 140 | [ 1 2 5 4 5 ], 141 | [ 1 2 3 4 - ], 142 | [ 1 2 3 - - ], 143 | ] 144 | 145 | input_seqs: 146 | [ 147 | [ 1 2 3 4 ], 148 | [ 1 2 3 - ], 149 | [ 1 2 - - ], 150 | ] 151 | 152 | target_seqs: 153 | [ 154 | [ 2 3 4 5 ], 155 | [ 2 3 4 - ], 156 | [ 2 3 - - ], 157 | ] 158 | 159 | mask: 160 | [ 161 | [ 1 1 1 1 ], 162 | [ 1 1 1 0 ], 163 | [ 1 1 0 0 ], 164 | ] 165 | 166 | Args: 167 | images_and_captions: A list of pairs [image, caption], where image is a 168 | Tensor of shape [height, width, channels] and caption is a 1-D Tensor of 169 | any length. Each pair will be processed and added to the queue in a 170 | separate thread. 171 | batch_size: Batch size. 172 | queue_capacity: Queue capacity. 173 | add_summaries: If true, add caption length summaries. 174 | 175 | Returns: 176 | images: A Tensor of shape [batch_size, height, width, channels]. 177 | input_seqs: An int32 Tensor of shape [batch_size, padded_length]. 178 | target_seqs: An int32 Tensor of shape [batch_size, padded_length]. 179 | mask: An int32 0/1 Tensor of shape [batch_size, padded_length]. 180 | """ 181 | enqueue_list = [] 182 | for image, caption in images_and_captions: 183 | caption_length = tf.shape(caption)[0] 184 | input_length = tf.expand_dims(tf.subtract(caption_length, 1), 0) 185 | 186 | input_seq = tf.slice(caption, [0], input_length) 187 | target_seq = tf.slice(caption, [1], input_length) 188 | indicator = tf.ones(input_length, dtype=tf.int32) 189 | enqueue_list.append([image, input_seq, target_seq, indicator]) 190 | 191 | images, input_seqs, target_seqs, mask = tf.train.batch_join( 192 | enqueue_list, 193 | batch_size=batch_size, 194 | capacity=queue_capacity, 195 | dynamic_pad=True, 196 | name="batch_and_pad") 197 | 198 | if add_summaries: 199 | lengths = tf.add(tf.reduce_sum(mask, 1), 1) 200 | tf.summary.scalar("caption_length/batch_min", tf.reduce_min(lengths)) 201 | tf.summary.scalar("caption_length/batch_max", tf.reduce_max(lengths)) 202 | tf.summary.scalar("caption_length/batch_mean", tf.reduce_mean(lengths)) 203 | 204 | return images, input_seqs, target_seqs, mask 205 | -------------------------------------------------------------------------------- /course_8_image2txt/readme.md: -------------------------------------------------------------------------------- 1 | # Show and Tell: A Neural Image Caption Generator 2 | 3 | A TensorFlow implementation of the image-to-text model described in the paper: 4 | 5 | "Show and Tell: Lessons learned from the 2015 MSCOCO Image Captioning 6 | Challenge." 7 | 8 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan. 
9 | 10 | *IEEE transactions on pattern analysis and machine intelligence (2016).* 11 | 12 | Full text available at: http://arxiv.org/abs/1609.06647 13 | 14 | ## Contact 15 | ***Author:*** Chris Shallue 16 | 17 | ***Pull requests and issues:*** @cshallue 18 | 19 | ## Contents 20 | * [Model Overview](#model-overview) 21 | * [Introduction](#introduction) 22 | * [Architecture](#architecture) 23 | * [Getting Started](#getting-started) 24 | * [A Note on Hardware and Training Time](#a-note-on-hardware-and-training-time) 25 | * [Install Required Packages](#install-required-packages) 26 | * [Prepare the Training Data](#prepare-the-training-data) 27 | * [Download the Inception v3 Checkpoint](#download-the-inception-v3-checkpoint) 28 | * [Training a Model](#training-a-model) 29 | * [Initial Training](#initial-training) 30 | * [Fine Tune the Inception v3 Model](#fine-tune-the-inception-v3-model) 31 | * [Generating Captions](#generating-captions) 32 | 33 | ## Model Overview 34 | 35 | ### Introduction 36 | 37 | The *Show and Tell* model is a deep neural network that learns how to describe 38 | the content of images. For example: 39 | 40 | ![Example captions](../g3doc/example_captions.jpg) 41 | 42 | ### Architecture 43 | 44 | The *Show and Tell* model is an example of an *encoder-decoder* neural network. 45 | It works by first "encoding" an image into a fixed-length vector representation, 46 | and then "decoding" the representation into a natural language description. 47 | 48 | The image encoder is a deep convolutional neural network. This type of 49 | network is widely used for image tasks and is currently state-of-the-art for 50 | object recognition and detection. Our particular choice of network is the 51 | [*Inception v3*](http://arxiv.org/abs/1512.00567) image recognition model 52 | pretrained on the 53 | [ILSVRC-2012-CLS](http://www.image-net.org/challenges/LSVRC/2012/) image 54 | classification dataset. 55 | 56 | The decoder is a long short-term memory (LSTM) network. This type of network is 57 | commonly used for sequence modeling tasks such as language modeling and machine 58 | translation. In the *Show and Tell* model, the LSTM network is trained as a 59 | language model conditioned on the image encoding. 60 | 61 | Words in the captions are represented with an embedding model. Each word in the 62 | vocabulary is associated with a fixed-length vector representation that is 63 | learned during training. 64 | 65 | The following diagram illustrates the model architecture. 66 | 67 | ![Show and Tell Architecture](../g3doc/show_and_tell_architecture.png) 68 | 69 | In this diagram, \{*s*0, *s*1, ..., *s**N*-1\} 70 | are the words of the caption and \{*w**e**s*0, 71 | *w**e**s*1, ..., *w**e**s**N*-1\} 72 | are their corresponding word embedding vectors. The outputs \{*p*1, 73 | *p*2, ..., *p**N*\} of the LSTM are probability 74 | distributions generated by the model for the next word in the sentence. The 75 | terms \{log *p*1(*s*1), 76 | log *p*2(*s*2), ..., 77 | log *p**N*(*s**N*)\} are the log-likelihoods of the 78 | correct word at each step; the negated sum of these terms is the minimization 79 | objective of the model. 80 | 81 | During the first phase of training the parameters of the *Inception v3* model 82 | are kept fixed: it is simply a static image encoder function. A single trainable 83 | layer is added on top of the *Inception v3* model to transform the image 84 | embedding into the word embedding vector space. 
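In code, the minimization objective described above boils down to a negative log-likelihood summed over the caption words. The following is a minimal NumPy sketch with made-up probabilities, not the model's actual TensorFlow loss implementation:

```python
import numpy as np

# Hypothetical probabilities p_t(s_t) that the decoder assigns to the correct
# caption words s_1..s_N at each step (illustrative values only).
correct_word_probs = np.array([0.21, 0.08, 0.35, 0.12])

# The training objective is the negated sum of the log-likelihoods above.
caption_loss = -np.sum(np.log(correct_word_probs))
print(caption_loss)  # about 7.26
```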
The model is trained with 85 | respect to the parameters of the word embeddings, the parameters of the layer on 86 | top of *Inception v3* and the parameters of the LSTM. In the second phase of 87 | training, all parameters - including the parameters of *Inception v3* - are 88 | trained to jointly fine-tune the image encoder and the LSTM. 89 | 90 | Given a trained model and an image we use *beam search* to generate captions for 91 | that image. Captions are generated word-by-word, where at each step *t* we use 92 | the set of sentences already generated with length *t* - 1 to generate a new set 93 | of sentences with length *t*. We keep only the top *k* candidates at each step, 94 | where the hyperparameter *k* is called the *beam size*. We have found the best 95 | performance with *k* = 3. 96 | 97 | ## Getting Started 98 | 99 | ### A Note on Hardware and Training Time 100 | 101 | The time required to train the *Show and Tell* model depends on your specific 102 | hardware and computational capacity. In this guide we assume you will be running 103 | training on a single machine with a GPU. In our experience on an NVIDIA Tesla 104 | K20m GPU the initial training phase takes 1-2 weeks. The second training phase 105 | may take several additional weeks to achieve peak performance (but you can stop 106 | this phase early and still get reasonable results). 107 | 108 | It is possible to achieve a speed-up by implementing distributed training across 109 | a cluster of machines with GPUs, but that is not covered in this guide. 110 | 111 | Whilst it is possible to run this code on a CPU, beware that this may be 112 | approximately 10 times slower. 113 | 114 | ### Install Required Packages 115 | First ensure that you have installed the following required packages: 116 | 117 | * **Bazel** ([instructions](http://bazel.io/docs/install.html)) 118 | * **TensorFlow** 1.0 or greater ([instructions](https://www.tensorflow.org/install/)) 119 | * **NumPy** ([instructions](http://www.scipy.org/install.html)) 120 | * **Natural Language Toolkit (NLTK)**: 121 | * First install NLTK ([instructions](http://www.nltk.org/install.html)) 122 | * Then install the NLTK data ([instructions](http://www.nltk.org/data.html)) 123 | 124 | ### Prepare the Training Data 125 | 126 | To train the model you will need to provide training data in native TFRecord 127 | format. The TFRecord format consists of a set of sharded files containing 128 | serialized `tf.SequenceExample` protocol buffers. Each `tf.SequenceExample` 129 | proto contains an image (JPEG format), a caption and metadata such as the image 130 | id. 131 | 132 | Each caption is a list of words. During preprocessing, a dictionary is created 133 | that assigns each word in the vocabulary to an integer-valued id. Each caption 134 | is encoded as a list of integer word ids in the `tf.SequenceExample` protos. 135 | 136 | We have provided a script to download and preprocess the [MSCOCO](http://mscoco.org/) image captioning data set into this format. Downloading 137 | and preprocessing the data may take several hours depending on your network and 138 | computer speed. Please be patient. 139 | 140 | Before running the script, ensure that your hard disk has at least 150GB of 141 | available space for storing the downloaded and processed data. 142 | 143 | ```shell 144 | # Location to save the MSCOCO data. 145 | MSCOCO_DIR="${HOME}/im2txt/data/mscoco" 146 | 147 | # Build the preprocessing script. 
148 | bazel build im2txt/download_and_preprocess_mscoco 149 | 150 | # Run the preprocessing script. 151 | bazel-bin/im2txt/download_and_preprocess_mscoco "${MSCOCO_DIR}" 152 | ``` 153 | 154 | The final line of the output should read: 155 | 156 | ``` 157 | 2016-09-01 16:47:47.296630: Finished processing all 20267 image-caption pairs in data set 'test'. 158 | ``` 159 | 160 | When the script finishes you will find 256 training, 4 validation and 8 testing 161 | files in `DATA_DIR`. The files will match the patterns `train-?????-of-00256`, 162 | `val-?????-of-00004` and `test-?????-of-00008`, respectively. 163 | 164 | ### Download the Inception v3 Checkpoint 165 | 166 | The *Show and Tell* model requires a pretrained *Inception v3* checkpoint file 167 | to initialize the parameters of its image encoder submodel. 168 | 169 | This checkpoint file is provided by the 170 | [TensorFlow-Slim image classification library](https://github.com/tensorflow/models/tree/master/slim#tensorflow-slim-image-classification-library) 171 | which provides a suite of pre-trained image classification models. You can read 172 | more about the models provided by the library 173 | [here](https://github.com/tensorflow/models/tree/master/slim#pre-trained-models). 174 | 175 | 176 | Run the following commands to download the *Inception v3* checkpoint. 177 | 178 | ```shell 179 | # Location to save the Inception v3 checkpoint. 180 | INCEPTION_DIR="${HOME}/im2txt/data" 181 | mkdir -p ${INCEPTION_DIR} 182 | 183 | wget "http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz" 184 | tar -xvf "inception_v3_2016_08_28.tar.gz" -C ${INCEPTION_DIR} 185 | rm "inception_v3_2016_08_28.tar.gz" 186 | ``` 187 | 188 | Note that the *Inception v3* checkpoint will only be used for initializing the 189 | parameters of the *Show and Tell* model. Once the *Show and Tell* model starts 190 | training it will save its own checkpoint files containing the values of all its 191 | parameters (including copies of the *Inception v3* parameters). If training is 192 | stopped and restarted, the parameter values will be restored from the latest 193 | *Show and Tell* checkpoint and the *Inception v3* checkpoint will be ignored. In 194 | other words, the *Inception v3* checkpoint is only used in the 0-th global step 195 | (initialization) of training the *Show and Tell* model. 196 | 197 | ## Training a Model 198 | 199 | ### Initial Training 200 | 201 | Run the training script. 202 | 203 | ```shell 204 | # Directory containing preprocessed MSCOCO data. 205 | MSCOCO_DIR="${HOME}/im2txt/data/mscoco" 206 | 207 | # Inception v3 checkpoint file. 208 | INCEPTION_CHECKPOINT="${HOME}/im2txt/data/inception_v3.ckpt" 209 | 210 | # Directory to save the model. 211 | MODEL_DIR="${HOME}/im2txt/model" 212 | 213 | # Build the model. 214 | bazel build -c opt im2txt/... 215 | 216 | # Run the training script. 217 | bazel-bin/im2txt/train \ 218 | --input_file_pattern="${MSCOCO_DIR}/train-?????-of-00256" \ 219 | --inception_checkpoint_file="${INCEPTION_CHECKPOINT}" \ 220 | --train_dir="${MODEL_DIR}/train" \ 221 | --train_inception=false \ 222 | --number_of_steps=1000000 223 | ``` 224 | 225 | Run the evaluation script in a separate process. This will log evaluation 226 | metrics to TensorBoard which allows training progress to be monitored in 227 | real-time. 228 | 229 | Note that you may run out of memory if you run the evaluation script on the same 230 | GPU as the training script. 
You can run the command 231 | `export CUDA_VISIBLE_DEVICES=""` to force the evaluation script to run on CPU. 232 | If evaluation runs too slowly on CPU, you can decrease the value of 233 | `--num_eval_examples`. 234 | 235 | ```shell 236 | MSCOCO_DIR="${HOME}/im2txt/data/mscoco" 237 | MODEL_DIR="${HOME}/im2txt/model" 238 | 239 | # Ignore GPU devices (only necessary if your GPU is currently memory 240 | # constrained, for example, by running the training script). 241 | export CUDA_VISIBLE_DEVICES="" 242 | 243 | # Run the evaluation script. This will run in a loop, periodically loading the 244 | # latest model checkpoint file and computing evaluation metrics. 245 | bazel-bin/im2txt/evaluate \ 246 | --input_file_pattern="${MSCOCO_DIR}/val-?????-of-00004" \ 247 | --checkpoint_dir="${MODEL_DIR}/train" \ 248 | --eval_dir="${MODEL_DIR}/eval" 249 | ``` 250 | 251 | Run a TensorBoard server in a separate process for real-time monitoring of 252 | training progress and evaluation metrics. 253 | 254 | ```shell 255 | MODEL_DIR="${HOME}/im2txt/model" 256 | 257 | # Run a TensorBoard server. 258 | tensorboard --logdir="${MODEL_DIR}" 259 | ``` 260 | 261 | ### Fine Tune the Inception v3 Model 262 | 263 | Your model will already be able to generate reasonable captions after the first 264 | phase of training. Try it out! (See [Generating Captions](#generating-captions)). 265 | 266 | You can further improve the performance of the model by running a 267 | second training phase to jointly fine-tune the parameters of the *Inception v3* 268 | image submodel and the LSTM. 269 | 270 | ```shell 271 | # Restart the training script with --train_inception=true. 272 | bazel-bin/im2txt/train \ 273 | --input_file_pattern="${MSCOCO_DIR}/train-?????-of-00256" \ 274 | --train_dir="${MODEL_DIR}/train" \ 275 | --train_inception=true \ 276 | --number_of_steps=3000000 # Additional 2M steps (assuming 1M in initial training). 277 | ``` 278 | 279 | Note that training will proceed much slower now, and the model will continue to 280 | improve by a small amount for a long time. We have found that it will improve 281 | slowly for an additional 2-2.5 million steps before it begins to overfit. This 282 | may take several weeks on a single GPU. If you don't care about absolutely 283 | optimal performance then feel free to halt training sooner by stopping the 284 | training script or passing a smaller value to the flag `--number_of_steps`. Your 285 | model will still work reasonably well. 286 | 287 | ## Generating Captions 288 | 289 | Your trained *Show and Tell* model can generate captions for any JPEG image! The 290 | following command line will generate captions for an image from the test set. 291 | 292 | ```shell 293 | # Path to checkpoint file or a directory containing checkpoint files. Passing 294 | # a directory will only work if there is also a file named 'checkpoint' which 295 | # lists the available checkpoints in the directory. It will not work if you 296 | # point to a directory with just a copy of a model checkpoint: in that case, 297 | # you will need to pass the checkpoint path explicitly. 298 | CHECKPOINT_PATH="${HOME}/im2txt/model/train" 299 | 300 | # Vocabulary file generated by the preprocessing script. 301 | VOCAB_FILE="${HOME}/im2txt/data/mscoco/word_counts.txt" 302 | 303 | # JPEG image file to caption. 304 | IMAGE_FILE="${HOME}/im2txt/data/mscoco/raw-data/val2014/COCO_val2014_000000224477.jpg" 305 | 306 | # Build the inference binary. 
307 | bazel build -c opt im2txt/run_inference 308 | 309 | # Ignore GPU devices (only necessary if your GPU is currently memory 310 | # constrained, for example, by running the training script). 311 | export CUDA_VISIBLE_DEVICES="" 312 | 313 | # Run inference to generate captions. 314 | bazel-bin/im2txt/run_inference \ 315 | --checkpoint_path=${CHECKPOINT_PATH} \ 316 | --vocab_file=${VOCAB_FILE} \ 317 | --input_files=${IMAGE_FILE} 318 | ``` 319 | 320 | Example output: 321 | 322 | ```shell 323 | Captions for image COCO_val2014_000000224477.jpg: 324 | 0) a man riding a wave on top of a surfboard . (p=0.040413) 325 | 1) a person riding a surf board on a wave (p=0.017452) 326 | 2) a man riding a wave on a surfboard in the ocean . (p=0.005743) 327 | ``` 328 | 329 | Note: you may get different results. Some variation between different models is 330 | expected. 331 | 332 | Here is the image: 333 | 334 | ![Surfer](../g3doc/COCO_val2014_000000224477.jpg) 335 | -------------------------------------------------------------------------------- /course_8_image2txt/run_inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | r"""Generate captions for images using default beam search parameters.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import os 23 | 24 | 25 | import tensorflow as tf 26 | 27 | from im2txt import configuration 28 | from im2txt import inference_wrapper 29 | from im2txt.inference_utils import caption_generator 30 | from im2txt.inference_utils import vocabulary 31 | 32 | FLAGS = tf.flags.FLAGS 33 | 34 | tf.flags.DEFINE_string("checkpoint_path", "", 35 | "Model checkpoint file or directory containing a " 36 | "model checkpoint file.") 37 | tf.flags.DEFINE_string("vocab_file", "", "Text file containing the vocabulary.") 38 | tf.flags.DEFINE_string("input_files", "", 39 | "File pattern or comma-separated list of file patterns " 40 | "of image files.") 41 | 42 | tf.logging.set_verbosity(tf.logging.INFO) 43 | 44 | 45 | def main(_): 46 | # Build the inference graph. 47 | g = tf.Graph() 48 | with g.as_default(): 49 | model = inference_wrapper.InferenceWrapper() 50 | restore_fn = model.build_graph_from_config(configuration.ModelConfig(), 51 | FLAGS.checkpoint_path) 52 | g.finalize() 53 | 54 | # Create the vocabulary. 55 | vocab = vocabulary.Vocabulary(FLAGS.vocab_file) 56 | 57 | filenames = [] 58 | for file_pattern in FLAGS.input_files.split(","): 59 | filenames.extend(tf.gfile.Glob(file_pattern)) 60 | tf.logging.info("Running caption generation on %d files matching %s", 61 | len(filenames), FLAGS.input_files) 62 | 63 | with tf.Session(graph=g) as sess: 64 | # Load the model from checkpoint. 
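# restore_fn was returned by build_graph_from_config above; it loads the trained
# weights from --checkpoint_path (a checkpoint file, or the latest checkpoint
# listed in that directory) into this session.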
65 | restore_fn(sess) 66 | 67 | # Prepare the caption generator. Here we are implicitly using the default 68 | # beam search parameters. See caption_generator.py for a description of the 69 | # available beam search parameters. 70 | generator = caption_generator.CaptionGenerator(model, vocab) 71 | 72 | for filename in filenames: 73 | with tf.gfile.GFile(filename, "r") as f: 74 | image = f.read() 75 | captions = generator.beam_search(sess, image) 76 | print("Captions for image %s:" % os.path.basename(filename)) 77 | for i, caption in enumerate(captions): 78 | # Ignore begin and end words. 79 | sentence = [vocab.id_to_word(w) for w in caption.sentence[1:-1]] 80 | sentence = " ".join(sentence) 81 | print(" %d) %s (p=%f)" % (i, sentence, math.exp(caption.logprob))) 82 | 83 | 84 | if __name__ == "__main__": 85 | tf.app.run() 86 | -------------------------------------------------------------------------------- /course_8_image2txt/show_and_tell_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Image-to-text implementation based on http://arxiv.org/abs/1411.4555. 17 | 18 | "Show and Tell: A Neural Image Caption Generator" 19 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | 27 | import tensorflow as tf 28 | 29 | from im2txt.ops import image_embedding 30 | from im2txt.ops import image_processing 31 | from im2txt.ops import inputs as input_ops 32 | 33 | 34 | class ShowAndTellModel(object): 35 | """Image-to-text implementation based on http://arxiv.org/abs/1411.4555. 36 | 37 | "Show and Tell: A Neural Image Caption Generator" 38 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan 39 | """ 40 | 41 | def __init__(self, config, mode, train_inception=False): 42 | """Basic setup. 43 | 44 | Args: 45 | config: Object containing configuration parameters. 46 | mode: "train", "eval" or "inference". 47 | train_inception: Whether the inception submodel variables are trainable. 48 | """ 49 | assert mode in ["train", "eval", "inference"] 50 | self.config = config 51 | self.mode = mode 52 | self.train_inception = train_inception 53 | 54 | # Reader for the input data. 55 | self.reader = tf.TFRecordReader() 56 | 57 | # To match the "Show and Tell" paper we initialize all variables with a 58 | # random uniform initializer. 59 | self.initializer = tf.random_uniform_initializer( 60 | minval=-self.config.initializer_scale, 61 | maxval=self.config.initializer_scale) 62 | 63 | # A float32 Tensor with shape [batch_size, height, width, channels]. 64 | self.images = None 65 | 66 | # An int32 Tensor with shape [batch_size, padded_length]. 
67 | self.input_seqs = None 68 | 69 | # An int32 Tensor with shape [batch_size, padded_length]. 70 | self.target_seqs = None 71 | 72 | # An int32 0/1 Tensor with shape [batch_size, padded_length]. 73 | self.input_mask = None 74 | 75 | # A float32 Tensor with shape [batch_size, embedding_size]. 76 | self.image_embeddings = None 77 | 78 | # A float32 Tensor with shape [batch_size, padded_length, embedding_size]. 79 | self.seq_embeddings = None 80 | 81 | # A float32 scalar Tensor; the total loss for the trainer to optimize. 82 | self.total_loss = None 83 | 84 | # A float32 Tensor with shape [batch_size * padded_length]. 85 | self.target_cross_entropy_losses = None 86 | 87 | # A float32 Tensor with shape [batch_size * padded_length]. 88 | self.target_cross_entropy_loss_weights = None 89 | 90 | # Collection of variables from the inception submodel. 91 | self.inception_variables = [] 92 | 93 | # Function to restore the inception submodel from checkpoint. 94 | self.init_fn = None 95 | 96 | # Global step Tensor. 97 | self.global_step = None 98 | 99 | def is_training(self): 100 | """Returns true if the model is built for training mode.""" 101 | return self.mode == "train" 102 | 103 | def process_image(self, encoded_image, thread_id=0): 104 | """Decodes and processes an image string. 105 | 106 | Args: 107 | encoded_image: A scalar string Tensor; the encoded image. 108 | thread_id: Preprocessing thread id used to select the ordering of color 109 | distortions. 110 | 111 | Returns: 112 | A float32 Tensor of shape [height, width, 3]; the processed image. 113 | """ 114 | return image_processing.process_image(encoded_image, 115 | is_training=self.is_training(), 116 | height=self.config.image_height, 117 | width=self.config.image_width, 118 | thread_id=thread_id, 119 | image_format=self.config.image_format) 120 | 121 | def build_inputs(self): 122 | """Input prefetching, preprocessing and batching. 123 | 124 | Outputs: 125 | self.images 126 | self.input_seqs 127 | self.target_seqs (training and eval only) 128 | self.input_mask (training and eval only) 129 | """ 130 | if self.mode == "inference": 131 | # In inference mode, images and inputs are fed via placeholders. 132 | image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed") 133 | input_feed = tf.placeholder(dtype=tf.int64, 134 | shape=[None], # batch_size 135 | name="input_feed") 136 | 137 | # Process image and insert batch dimensions. 138 | images = tf.expand_dims(self.process_image(image_feed), 0) 139 | input_seqs = tf.expand_dims(input_feed, 1) 140 | 141 | # No target sequences or input mask in inference mode. 142 | target_seqs = None 143 | input_mask = None 144 | else: 145 | # Prefetch serialized SequenceExample protos. 146 | input_queue = input_ops.prefetch_input_data( 147 | self.reader, 148 | self.config.input_file_pattern, 149 | is_training=self.is_training(), 150 | batch_size=self.config.batch_size, 151 | values_per_shard=self.config.values_per_input_shard, 152 | input_queue_capacity_factor=self.config.input_queue_capacity_factor, 153 | num_reader_threads=self.config.num_input_reader_threads) 154 | 155 | # Image processing and random distortion. Split across multiple threads 156 | # with each thread applying a slightly different distortion. 
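# distort_image alternates between two color-distortion orderings based on
# thread_id % 2, so an even number of preprocessing threads gives both
# orderings equal coverage.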
157 | assert self.config.num_preprocess_threads % 2 == 0 158 | images_and_captions = [] 159 | for thread_id in range(self.config.num_preprocess_threads): 160 | serialized_sequence_example = input_queue.dequeue() 161 | encoded_image, caption = input_ops.parse_sequence_example( 162 | serialized_sequence_example, 163 | image_feature=self.config.image_feature_name, 164 | caption_feature=self.config.caption_feature_name) 165 | image = self.process_image(encoded_image, thread_id=thread_id) 166 | images_and_captions.append([image, caption]) 167 | 168 | # Batch inputs. 169 | queue_capacity = (2 * self.config.num_preprocess_threads * 170 | self.config.batch_size) 171 | images, input_seqs, target_seqs, input_mask = ( 172 | input_ops.batch_with_dynamic_pad(images_and_captions, 173 | batch_size=self.config.batch_size, 174 | queue_capacity=queue_capacity)) 175 | 176 | self.images = images 177 | self.input_seqs = input_seqs 178 | self.target_seqs = target_seqs 179 | self.input_mask = input_mask 180 | 181 | def build_image_embeddings(self): 182 | """Builds the image model subgraph and generates image embeddings. 183 | 184 | Inputs: 185 | self.images 186 | 187 | Outputs: 188 | self.image_embeddings 189 | """ 190 | inception_output = image_embedding.inception_v3( 191 | self.images, 192 | trainable=self.train_inception, 193 | is_training=self.is_training()) 194 | self.inception_variables = tf.get_collection( 195 | tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3") 196 | 197 | # Map inception output into embedding space. 198 | with tf.variable_scope("image_embedding") as scope: 199 | image_embeddings = tf.contrib.layers.fully_connected( 200 | inputs=inception_output, 201 | num_outputs=self.config.embedding_size, 202 | activation_fn=None, 203 | weights_initializer=self.initializer, 204 | biases_initializer=None, 205 | scope=scope) 206 | 207 | # Save the embedding size in the graph. 208 | tf.constant(self.config.embedding_size, name="embedding_size") 209 | 210 | self.image_embeddings = image_embeddings 211 | 212 | def build_seq_embeddings(self): 213 | """Builds the input sequence embeddings. 214 | 215 | Inputs: 216 | self.input_seqs 217 | 218 | Outputs: 219 | self.seq_embeddings 220 | """ 221 | with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"): 222 | embedding_map = tf.get_variable( 223 | name="map", 224 | shape=[self.config.vocab_size, self.config.embedding_size], 225 | initializer=self.initializer) 226 | seq_embeddings = tf.nn.embedding_lookup(embedding_map, self.input_seqs) 227 | 228 | self.seq_embeddings = seq_embeddings 229 | 230 | def build_model(self): 231 | """Builds the model. 232 | 233 | Inputs: 234 | self.image_embeddings 235 | self.seq_embeddings 236 | self.target_seqs (training and eval only) 237 | self.input_mask (training and eval only) 238 | 239 | Outputs: 240 | self.total_loss (training and eval only) 241 | self.target_cross_entropy_losses (training and eval only) 242 | self.target_cross_entropy_loss_weights (training and eval only) 243 | """ 244 | # This LSTM cell has biases and outputs tanh(new_c) * sigmoid(o), but the 245 | # modified LSTM in the "Show and Tell" paper has no biases and outputs 246 | # new_c * sigmoid(o). 
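# state_is_tuple=True keeps the LSTM state as a (c, h) pair of
# [batch, num_lstm_units] tensors; inference mode below concatenates the pair
# along axis 1 into a single [batch, 2 * num_lstm_units] vector for feeding.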
247 | lstm_cell = tf.contrib.rnn.BasicLSTMCell( 248 | num_units=self.config.num_lstm_units, state_is_tuple=True) 249 | if self.mode == "train": 250 | lstm_cell = tf.contrib.rnn.DropoutWrapper( 251 | lstm_cell, 252 | input_keep_prob=self.config.lstm_dropout_keep_prob, 253 | output_keep_prob=self.config.lstm_dropout_keep_prob) 254 | 255 | with tf.variable_scope("lstm", initializer=self.initializer) as lstm_scope: 256 | # Feed the image embeddings to set the initial LSTM state. 257 | zero_state = lstm_cell.zero_state( 258 | batch_size=self.image_embeddings.get_shape()[0], dtype=tf.float32) 259 | _, initial_state = lstm_cell(self.image_embeddings, zero_state) 260 | 261 | # Allow the LSTM variables to be reused. 262 | lstm_scope.reuse_variables() 263 | 264 | if self.mode == "inference": 265 | # In inference mode, use concatenated states for convenient feeding and 266 | # fetching. 267 | tf.concat(axis=1, values=initial_state, name="initial_state") 268 | 269 | # Placeholder for feeding a batch of concatenated states. 270 | state_feed = tf.placeholder(dtype=tf.float32, 271 | shape=[None, sum(lstm_cell.state_size)], 272 | name="state_feed") 273 | state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1) 274 | 275 | # Run a single LSTM step. 276 | lstm_outputs, state_tuple = lstm_cell( 277 | inputs=tf.squeeze(self.seq_embeddings, axis=[1]), 278 | state=state_tuple) 279 | 280 | # Concatentate the resulting state. 281 | tf.concat(axis=1, values=state_tuple, name="state") 282 | else: 283 | # Run the batch of sequence embeddings through the LSTM. 284 | sequence_length = tf.reduce_sum(self.input_mask, 1) 285 | lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell, 286 | inputs=self.seq_embeddings, 287 | sequence_length=sequence_length, 288 | initial_state=initial_state, 289 | dtype=tf.float32, 290 | scope=lstm_scope) 291 | 292 | # Stack batches vertically. 293 | lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size]) 294 | 295 | with tf.variable_scope("logits") as logits_scope: 296 | logits = tf.contrib.layers.fully_connected( 297 | inputs=lstm_outputs, 298 | num_outputs=self.config.vocab_size, 299 | activation_fn=None, 300 | weights_initializer=self.initializer, 301 | scope=logits_scope) 302 | 303 | if self.mode == "inference": 304 | tf.nn.softmax(logits, name="softmax") 305 | else: 306 | targets = tf.reshape(self.target_seqs, [-1]) 307 | weights = tf.to_float(tf.reshape(self.input_mask, [-1])) 308 | 309 | # Compute losses. 310 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, 311 | logits=logits) 312 | batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)), 313 | tf.reduce_sum(weights), 314 | name="batch_loss") 315 | tf.losses.add_loss(batch_loss) 316 | total_loss = tf.losses.get_total_loss() 317 | 318 | # Add summaries. 319 | tf.summary.scalar("losses/batch_loss", batch_loss) 320 | tf.summary.scalar("losses/total_loss", total_loss) 321 | for var in tf.trainable_variables(): 322 | tf.summary.histogram("parameters/" + var.op.name, var) 323 | 324 | self.total_loss = total_loss 325 | self.target_cross_entropy_losses = losses # Used in evaluation. 326 | self.target_cross_entropy_loss_weights = weights # Used in evaluation. 327 | 328 | def setup_inception_initializer(self): 329 | """Sets up the function to restore inception variables from checkpoint.""" 330 | if self.mode != "inference": 331 | # Restore inception variables only. 
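# Building the Saver from inception_variables only means restore_fn touches
# just the InceptionV3 weights; every other variable keeps its random
# initialization until training updates it.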
332 | saver = tf.train.Saver(self.inception_variables) 333 | 334 | def restore_fn(sess): 335 | tf.logging.info("Restoring Inception variables from checkpoint file %s", 336 | self.config.inception_checkpoint_file) 337 | saver.restore(sess, self.config.inception_checkpoint_file) 338 | 339 | self.init_fn = restore_fn 340 | 341 | def setup_global_step(self): 342 | """Sets up the global step Tensor.""" 343 | global_step = tf.Variable( 344 | initial_value=0, 345 | name="global_step", 346 | trainable=False, 347 | collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) 348 | 349 | self.global_step = global_step 350 | 351 | def build(self): 352 | """Creates all ops for training and evaluation.""" 353 | self.build_inputs() 354 | self.build_image_embeddings() 355 | self.build_seq_embeddings() 356 | self.build_model() 357 | self.setup_inception_initializer() 358 | self.setup_global_step() 359 | -------------------------------------------------------------------------------- /course_8_image2txt/show_and_tell_model_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for tensorflow_models.im2txt.show_and_tell_model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import numpy as np 24 | import tensorflow as tf 25 | 26 | from im2txt import configuration 27 | from im2txt import show_and_tell_model 28 | 29 | 30 | class ShowAndTellModel(show_and_tell_model.ShowAndTellModel): 31 | """Subclass of ShowAndTellModel without the disk I/O.""" 32 | 33 | def build_inputs(self): 34 | if self.mode == "inference": 35 | # Inference mode doesn't read from disk, so defer to parent. 36 | return super(ShowAndTellModel, self).build_inputs() 37 | else: 38 | # Replace disk I/O with random Tensors. 
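# The random tensors below mimic what the real input pipeline produces:
# images already scaled to [-1, 1] and int64 word-id sequences of a fixed
# dummy length (15).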
39 | self.images = tf.random_uniform( 40 | shape=[self.config.batch_size, self.config.image_height, 41 | self.config.image_width, 3], 42 | minval=-1, 43 | maxval=1) 44 | self.input_seqs = tf.random_uniform( 45 | [self.config.batch_size, 15], 46 | minval=0, 47 | maxval=self.config.vocab_size, 48 | dtype=tf.int64) 49 | self.target_seqs = tf.random_uniform( 50 | [self.config.batch_size, 15], 51 | minval=0, 52 | maxval=self.config.vocab_size, 53 | dtype=tf.int64) 54 | self.input_mask = tf.ones_like(self.input_seqs) 55 | 56 | 57 | class ShowAndTellModelTest(tf.test.TestCase): 58 | 59 | def setUp(self): 60 | super(ShowAndTellModelTest, self).setUp() 61 | self._model_config = configuration.ModelConfig() 62 | 63 | def _countModelParameters(self): 64 | """Counts the number of parameters in the model at top level scope.""" 65 | counter = {} 66 | for v in tf.global_variables(): 67 | name = v.op.name.split("/")[0] 68 | num_params = v.get_shape().num_elements() 69 | assert num_params 70 | counter[name] = counter.get(name, 0) + num_params 71 | return counter 72 | 73 | def _checkModelParameters(self): 74 | """Verifies the number of parameters in the model.""" 75 | param_counts = self._countModelParameters() 76 | expected_param_counts = { 77 | "InceptionV3": 21802784, 78 | # inception_output_size * embedding_size 79 | "image_embedding": 1048576, 80 | # vocab_size * embedding_size 81 | "seq_embedding": 6144000, 82 | # (embedding_size + num_lstm_units + 1) * 4 * num_lstm_units 83 | "lstm": 2099200, 84 | # (num_lstm_units + 1) * vocab_size 85 | "logits": 6156000, 86 | "global_step": 1, 87 | } 88 | self.assertDictEqual(expected_param_counts, param_counts) 89 | 90 | def _checkOutputs(self, expected_shapes, feed_dict=None): 91 | """Verifies that the model produces expected outputs. 92 | 93 | Args: 94 | expected_shapes: A dict mapping Tensor or Tensor name to expected output 95 | shape. 96 | feed_dict: Values of Tensors to feed into Session.run(). 97 | """ 98 | fetches = expected_shapes.keys() 99 | 100 | with self.test_session() as sess: 101 | sess.run(tf.global_variables_initializer()) 102 | outputs = sess.run(fetches, feed_dict) 103 | 104 | for index, output in enumerate(outputs): 105 | tensor = fetches[index] 106 | expected = expected_shapes[tensor] 107 | actual = output.shape 108 | if expected != actual: 109 | self.fail("Tensor %s has shape %s (expected %s)." 
% 110 | (tensor, actual, expected)) 111 | 112 | def testBuildForTraining(self): 113 | model = ShowAndTellModel(self._model_config, mode="train") 114 | model.build() 115 | 116 | self._checkModelParameters() 117 | 118 | expected_shapes = { 119 | # [batch_size, image_height, image_width, 3] 120 | model.images: (32, 299, 299, 3), 121 | # [batch_size, sequence_length] 122 | model.input_seqs: (32, 15), 123 | # [batch_size, sequence_length] 124 | model.target_seqs: (32, 15), 125 | # [batch_size, sequence_length] 126 | model.input_mask: (32, 15), 127 | # [batch_size, embedding_size] 128 | model.image_embeddings: (32, 512), 129 | # [batch_size, sequence_length, embedding_size] 130 | model.seq_embeddings: (32, 15, 512), 131 | # Scalar 132 | model.total_loss: (), 133 | # [batch_size * sequence_length] 134 | model.target_cross_entropy_losses: (480,), 135 | # [batch_size * sequence_length] 136 | model.target_cross_entropy_loss_weights: (480,), 137 | } 138 | self._checkOutputs(expected_shapes) 139 | 140 | def testBuildForEval(self): 141 | model = ShowAndTellModel(self._model_config, mode="eval") 142 | model.build() 143 | 144 | self._checkModelParameters() 145 | 146 | expected_shapes = { 147 | # [batch_size, image_height, image_width, 3] 148 | model.images: (32, 299, 299, 3), 149 | # [batch_size, sequence_length] 150 | model.input_seqs: (32, 15), 151 | # [batch_size, sequence_length] 152 | model.target_seqs: (32, 15), 153 | # [batch_size, sequence_length] 154 | model.input_mask: (32, 15), 155 | # [batch_size, embedding_size] 156 | model.image_embeddings: (32, 512), 157 | # [batch_size, sequence_length, embedding_size] 158 | model.seq_embeddings: (32, 15, 512), 159 | # Scalar 160 | model.total_loss: (), 161 | # [batch_size * sequence_length] 162 | model.target_cross_entropy_losses: (480,), 163 | # [batch_size * sequence_length] 164 | model.target_cross_entropy_loss_weights: (480,), 165 | } 166 | self._checkOutputs(expected_shapes) 167 | 168 | def testBuildForInference(self): 169 | model = ShowAndTellModel(self._model_config, mode="inference") 170 | model.build() 171 | 172 | self._checkModelParameters() 173 | 174 | # Test feeding an image to get the initial LSTM state. 175 | images_feed = np.random.rand(1, 299, 299, 3) 176 | feed_dict = {model.images: images_feed} 177 | expected_shapes = { 178 | # [batch_size, embedding_size] 179 | model.image_embeddings: (1, 512), 180 | # [batch_size, 2 * num_lstm_units] 181 | "lstm/initial_state:0": (1, 1024), 182 | } 183 | self._checkOutputs(expected_shapes, feed_dict) 184 | 185 | # Test feeding a batch of inputs and LSTM states to get softmax output and 186 | # LSTM states. 187 | input_feed = np.random.randint(0, 10, size=3) 188 | state_feed = np.random.rand(3, 1024) 189 | feed_dict = {"input_feed:0": input_feed, "lstm/state_feed:0": state_feed} 190 | expected_shapes = { 191 | # [batch_size, 2 * num_lstm_units] 192 | "lstm/state:0": (3, 1024), 193 | # [batch_size, vocab_size] 194 | "softmax:0": (3, 12000), 195 | } 196 | self._checkOutputs(expected_shapes, feed_dict) 197 | 198 | 199 | if __name__ == "__main__": 200 | tf.test.main() 201 | -------------------------------------------------------------------------------- /course_8_image2txt/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Train the model.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | import tensorflow as tf 23 | 24 | from im2txt import configuration 25 | from im2txt import show_and_tell_model 26 | 27 | FLAGS = tf.app.flags.FLAGS 28 | 29 | tf.flags.DEFINE_string("input_file_pattern", "", 30 | "File pattern of sharded TFRecord input files.") 31 | tf.flags.DEFINE_string("inception_checkpoint_file", "", 32 | "Path to a pretrained inception_v3 model.") 33 | tf.flags.DEFINE_string("train_dir", "", 34 | "Directory for saving and loading model checkpoints.") 35 | tf.flags.DEFINE_boolean("train_inception", False, 36 | "Whether to train inception submodel variables.") 37 | tf.flags.DEFINE_integer("number_of_steps", 1000000, "Number of training steps.") 38 | tf.flags.DEFINE_integer("log_every_n_steps", 1, 39 | "Frequency at which loss and global step are logged.") 40 | 41 | tf.logging.set_verbosity(tf.logging.INFO) 42 | 43 | 44 | def main(unused_argv): 45 | assert FLAGS.input_file_pattern, "--input_file_pattern is required" 46 | assert FLAGS.train_dir, "--train_dir is required" 47 | 48 | model_config = configuration.ModelConfig() 49 | model_config.input_file_pattern = FLAGS.input_file_pattern 50 | model_config.inception_checkpoint_file = FLAGS.inception_checkpoint_file 51 | training_config = configuration.TrainingConfig() 52 | 53 | # Create training directory. 54 | train_dir = FLAGS.train_dir 55 | if not tf.gfile.IsDirectory(train_dir): 56 | tf.logging.info("Creating training directory: %s", train_dir) 57 | tf.gfile.MakeDirs(train_dir) 58 | 59 | # Build the TensorFlow graph. 60 | g = tf.Graph() 61 | with g.as_default(): 62 | # Build the model. 63 | model = show_and_tell_model.ShowAndTellModel( 64 | model_config, mode="train", train_inception=FLAGS.train_inception) 65 | model.build() 66 | 67 | # Set up the learning rate. 68 | learning_rate_decay_fn = None 69 | if FLAGS.train_inception: 70 | learning_rate = tf.constant(training_config.train_inception_learning_rate) 71 | else: 72 | learning_rate = tf.constant(training_config.initial_learning_rate) 73 | if training_config.learning_rate_decay_factor > 0: 74 | num_batches_per_epoch = (training_config.num_examples_per_epoch / 75 | model_config.batch_size) 76 | decay_steps = int(num_batches_per_epoch * 77 | training_config.num_epochs_per_decay) 78 | 79 | def _learning_rate_decay_fn(learning_rate, global_step): 80 | return tf.train.exponential_decay( 81 | learning_rate, 82 | global_step, 83 | decay_steps=decay_steps, 84 | decay_rate=training_config.learning_rate_decay_factor, 85 | staircase=True) 86 | 87 | learning_rate_decay_fn = _learning_rate_decay_fn 88 | 89 | # Set up the training ops. 
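# optimize_loss wires together the optimizer named in training_config,
# gradient clipping, and the learning-rate decay schedule defined above (when
# one is set), and increments the global step on each training iteration.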
90 | train_op = tf.contrib.layers.optimize_loss( 91 | loss=model.total_loss, 92 | global_step=model.global_step, 93 | learning_rate=learning_rate, 94 | optimizer=training_config.optimizer, 95 | clip_gradients=training_config.clip_gradients, 96 | learning_rate_decay_fn=learning_rate_decay_fn) 97 | 98 | # Set up the Saver for saving and restoring model checkpoints. 99 | saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep) 100 | 101 | # Run training. 102 | tf.contrib.slim.learning.train( 103 | train_op, 104 | train_dir, 105 | log_every_n_steps=FLAGS.log_every_n_steps, 106 | graph=g, 107 | global_step=model.global_step, 108 | number_of_steps=FLAGS.number_of_steps, 109 | init_fn=model.init_fn, 110 | saver=saver) 111 | 112 | 113 | if __name__ == "__main__": 114 | tf.app.run() 115 | -------------------------------------------------------------------------------- /course_9_pix2pix_file.md: -------------------------------------------------------------------------------- 1 | https://github.com/wiibrew/pix2pix-tensorflow-1 2 | -------------------------------------------------------------------------------- /course_example_vgg/.gitignore: -------------------------------------------------------------------------------- 1 | vgg16.npy 2 | -------------------------------------------------------------------------------- /course_example_vgg/course_4_vgg16_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | # import matplotlib.pyplot as plt 4 | import matplotlib.image as mpimg 5 | import skimage 6 | import vgg16 7 | import utils 8 | 9 | 10 | img1 = utils.load_image("./test_data/dog.png") 11 | 12 | print img1.shape 13 | 14 | 15 | batch = img1.reshape((1, 224, 224, 3)) 16 | 17 | #plot the image 18 | 19 | # imgshow1=plt.imshow(img1) 20 | 21 | # with tf.Session(config=tf.ConfigProto(gpu_options=(tf.GPUOptions(per_process_gpu_memory_fraction=0.7)))) as sess: 22 | with tf.device('/cpu:0'): 23 | with tf.Session() as sess: 24 | images = tf.placeholder("float", [1, 224, 224, 3]) 25 | feed_dict = {images: batch} 26 | 27 | vgg = vgg16.Vgg16() 28 | with tf.name_scope("content_vgg"): 29 | vgg.build(images) 30 | 31 | prob = sess.run(vgg.prob, feed_dict=feed_dict) 32 | top5 = np.argsort(prob[0])[-1:-6:-1] 33 | for n, label in enumerate(top5): 34 | print label 35 | pool1 = sess.run(vgg.pool1, feed_dict=feed_dict) 36 | print pool1.shape 37 | conv3_3=sess.run(vgg.conv3_3, feed_dict=feed_dict) 38 | print conv3_3.shape 39 | #now let's plot the model filters 40 | vgg = vgg16.Vgg16() 41 | 42 | #get the saved parameter dict keys 43 | print vgg.data_dict.keys() 44 | 45 | #show the first conv layer 46 | filter_conv1=vgg.get_conv_filter("conv1_1") 47 | print 'filter_conv1', filter_conv1.shape 48 | 49 | tf.Print(filter_conv1[:,:,:,:5],[filter_conv1[:,:,:,:5]]) 50 | 51 | filter_conv3=vgg.get_conv_filter("conv3_3") 52 | print 'filter_conv3', filter_conv3.shape 53 | 54 | tf.Print(filter_conv3[:,:,:3,:5],[filter_conv3[:,:,:3,:5]]) -------------------------------------------------------------------------------- /course_example_vgg/np_plot.py: -------------------------------------------------------------------------------- 1 | #the script is for geneate a figure for deep learning model 2 | #datablob and filter parameter visualization 3 | import numpy as np 4 | import skimage 5 | 6 | def plot_array(arr4d,size,stride=5): 7 | # 8 | _,h,w,d=arr4d.shape 9 | N_blocks=size/(h+stride) 10 | step=h+stride 11 | 
Im_arr=np.zeros((size,size)) 12 | cnt=0 13 | for i in range(N_blocks): 14 | for j in range(N_blocks): 15 | Im_arr[i*step:(i+1)*step,j*step:(j+1)*step]=arr4d[0,:,:,cnt] 16 | cnt+=1 17 | return Im_arr 18 | def plot_filter(arr4d,filter_num=6, stride=1): 19 | h=3 20 | N_blocks=filter_num 21 | step=4 22 | Im_arr=np.zeros((filter_num*step,filter_num*step)) 23 | cnt=0 24 | for i in range(N_blocks): 25 | for j in range(N_blocks): 26 | Im_arr[i*step:(i+1)*step,j*step:(j+1)*step]=arr4d[:,:,0,cnt] 27 | cnt+=1 28 | # optional: upsample the small mosaic for display, e.g. Im_arr = skimage.transform.resize(Im_arr, (200, 200)) after importing skimage.transform 29 | return Im_arr -------------------------------------------------------------------------------- /course_example_vgg/np_plot.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/course_example_vgg/np_plot.pyc -------------------------------------------------------------------------------- /course_example_vgg/test_data/dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/course_example_vgg/test_data/dog.png -------------------------------------------------------------------------------- /course_example_vgg/test_data/puzzle.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/course_example_vgg/test_data/puzzle.jpeg -------------------------------------------------------------------------------- /course_example_vgg/test_data/tiger.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/course_example_vgg/test_data/tiger.jpeg -------------------------------------------------------------------------------- /course_example_vgg/utils.py: -------------------------------------------------------------------------------- 1 | import skimage 2 | import skimage.io 3 | import skimage.transform 4 | import numpy as np 5 | 6 | 7 | # synset = [l.strip() for l in open('synset.txt').readlines()] 8 | 9 | 10 | # returns image of shape [224, 224, 3] 11 | # [height, width, depth] 12 | def load_image(path): 13 | # load image 14 | img = skimage.io.imread(path) 15 | img = img / 255.0 16 | assert (0 <= img).all() and (img <= 1.0).all() 17 | # print "Original Image Shape: ", img.shape 18 | # we crop image from center 19 | short_edge = min(img.shape[:2]) 20 | yy = int((img.shape[0] - short_edge) / 2) 21 | xx = int((img.shape[1] - short_edge) / 2) 22 | crop_img = img[yy: yy + short_edge, xx: xx + short_edge] 23 | # resize to 224, 224 24 | resized_img = skimage.transform.resize(crop_img, (224, 224)) 25 | return resized_img 26 | 27 | 28 | # returns the top1 string 29 | def print_prob(prob, file_path): 30 | synset = [l.strip() for l in open(file_path).readlines()] 31 | 32 | # print prob 33 | pred = np.argsort(prob)[::-1] 34 | 35 | # Get top1 label 36 | top1 = synset[pred[0]] 37 | print(("Top1: ", top1, prob[pred[0]])) 38 | # Get top5 label 39 | top5 = [(synset[pred[i]], prob[pred[i]]) for i in range(5)] 40 | print(("Top5: ", top5)) 41 | return top1 42 | 43 | 44 | def load_image2(path, height=None, width=None): 45 | # load image 46 | img = skimage.io.imread(path) 47 | img = img / 255.0 48 | if height is not None and width is not
None: 49 | ny = height 50 | nx = width 51 | elif height is not None: 52 | ny = height 53 | nx = img.shape[1] * ny / img.shape[0] 54 | elif width is not None: 55 | nx = width 56 | ny = img.shape[0] * nx / img.shape[1] 57 | else: 58 | ny = img.shape[0] 59 | nx = img.shape[1] 60 | return skimage.transform.resize(img, (ny, nx)) 61 | 62 | 63 | def test(): 64 | img = skimage.io.imread("./test_data/starry_night.jpg") 65 | ny = 300 66 | nx = img.shape[1] * ny / img.shape[0] 67 | img = skimage.transform.resize(img, (ny, nx)) 68 | skimage.io.imsave("./test_data/test/output.jpg", img) 69 | 70 | 71 | if __name__ == "__main__": 72 | test() 73 | -------------------------------------------------------------------------------- /course_example_vgg/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/course_example_vgg/utils.pyc -------------------------------------------------------------------------------- /course_example_vgg/vgg-model-download-link: -------------------------------------------------------------------------------- 1 | vgg16 model download link: 2 | https://mega.nz/#!YU1FWJrA!O1ywiCS2IiOlUCtCpI6HTJOMrneN-Qdv3ywQP5poecM 3 | vgg19 model download link: -------------------------------------------------------------------------------- /course_example_vgg/vgg-model-download-link.txt: -------------------------------------------------------------------------------- 1 | vgg16 model download link: 2 | https://mega.nz/#!YU1FWJrA!O1ywiCS2IiOlUCtCpI6HTJOMrneN-Qdv3ywQP5poecM 3 | vgg19 model download link: 4 | https://mega.nz/#!xZ8glS6J!MAnE91ND_WyfZ_8mvkuSa2YcA7q-1ehfSm-Q1fxOvvs -------------------------------------------------------------------------------- /course_example_vgg/vgg16.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | import time 7 | 8 | VGG_MEAN = [103.939, 116.779, 123.68] 9 | 10 | 11 | class Vgg16: 12 | def __init__(self, vgg16_npy_path=None): 13 | if vgg16_npy_path is None: 14 | path = inspect.getfile(Vgg16) 15 | path = os.path.abspath(os.path.join(path, os.pardir)) 16 | path = os.path.join(path, "vgg16.npy") 17 | vgg16_npy_path = path 18 | print(path) 19 | 20 | self.data_dict = np.load(vgg16_npy_path, encoding='latin1').item() 21 | print("npy file loaded") 22 | 23 | def build(self, rgb): 24 | """ 25 | load variable from npy to build the VGG 26 | :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1] 27 | """ 28 | 29 | start_time = time.time() 30 | print("build model started") 31 | rgb_scaled = rgb * 255.0 32 | 33 | # Convert RGB to BGR 34 | red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb_scaled) 35 | assert red.get_shape().as_list()[1:] == [224, 224, 1] 36 | assert green.get_shape().as_list()[1:] == [224, 224, 1] 37 | assert blue.get_shape().as_list()[1:] == [224, 224, 1] 38 | bgr = tf.concat(axis=3, values=[ 39 | blue - VGG_MEAN[0], 40 | green - VGG_MEAN[1], 41 | red - VGG_MEAN[2], 42 | ]) 43 | assert bgr.get_shape().as_list()[1:] == [224, 224, 3] 44 | 45 | self.conv1_1 = self.conv_layer(bgr, "conv1_1") 46 | self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2") 47 | self.pool1 = self.max_pool(self.conv1_2, 'pool1') 48 | 49 | self.conv2_1 = self.conv_layer(self.pool1, "conv2_1") 50 | self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2") 51 | self.pool2 = 
self.max_pool(self.conv2_2, 'pool2') 52 | 53 | self.conv3_1 = self.conv_layer(self.pool2, "conv3_1") 54 | self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2") 55 | self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3") 56 | self.pool3 = self.max_pool(self.conv3_3, 'pool3') 57 | 58 | self.conv4_1 = self.conv_layer(self.pool3, "conv4_1") 59 | self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2") 60 | self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3") 61 | self.pool4 = self.max_pool(self.conv4_3, 'pool4') 62 | 63 | self.conv5_1 = self.conv_layer(self.pool4, "conv5_1") 64 | self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2") 65 | self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3") 66 | self.pool5 = self.max_pool(self.conv5_3, 'pool5') 67 | 68 | self.fc6 = self.fc_layer(self.pool5, "fc6") 69 | assert self.fc6.get_shape().as_list()[1:] == [4096] 70 | self.relu6 = tf.nn.relu(self.fc6) 71 | 72 | self.fc7 = self.fc_layer(self.relu6, "fc7") 73 | self.relu7 = tf.nn.relu(self.fc7) 74 | 75 | self.fc8 = self.fc_layer(self.relu7, "fc8") 76 | 77 | self.prob = tf.nn.softmax(self.fc8, name="prob") 78 | 79 | self.data_dict = None 80 | print(("build model finished: %ds" % (time.time() - start_time))) 81 | 82 | def avg_pool(self, bottom, name): 83 | return tf.nn.avg_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 84 | 85 | def max_pool(self, bottom, name): 86 | return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 87 | 88 | def conv_layer(self, bottom, name): 89 | with tf.variable_scope(name): 90 | filt = self.get_conv_filter(name) 91 | 92 | conv = tf.nn.conv2d(bottom, filt, [1, 1, 1, 1], padding='SAME') 93 | 94 | conv_biases = self.get_bias(name) 95 | bias = tf.nn.bias_add(conv, conv_biases) 96 | 97 | relu = tf.nn.relu(bias) 98 | return relu 99 | 100 | def fc_layer(self, bottom, name): 101 | with tf.variable_scope(name): 102 | shape = bottom.get_shape().as_list() 103 | dim = 1 104 | for d in shape[1:]: 105 | dim *= d 106 | x = tf.reshape(bottom, [-1, dim]) 107 | 108 | weights = self.get_fc_weight(name) 109 | biases = self.get_bias(name) 110 | 111 | # Fully connected layer. Note that the '+' operation automatically 112 | # broadcasts the biases. 
113 | fc = tf.nn.bias_add(tf.matmul(x, weights), biases) 114 | 115 | return fc 116 | 117 | def get_conv_filter(self, name): 118 | return tf.constant(self.data_dict[name][0], name="filter") 119 | 120 | def get_bias(self, name): 121 | return tf.constant(self.data_dict[name][1], name="biases") 122 | 123 | def get_fc_weight(self, name): 124 | return tf.constant(self.data_dict[name][0], name="weights") -------------------------------------------------------------------------------- /course_example_vgg/vgg16_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | import vgg16 5 | import utils 6 | from imagenet1000_clsid_to_human import labels 7 | 8 | img1 = utils.load_image("./test_data/tiger.jpeg") 9 | img2 = utils.load_image("./test_data/puzzle.jpeg") 10 | 11 | batch1 = img1.reshape((1, 224, 224, 3)) 12 | batch2 = img2.reshape((1, 224, 224, 3)) 13 | 14 | batch = np.concatenate((batch1, batch2), 0) 15 | 16 | def percent(v): 17 | return '%.2f%%' % (v * 100) 18 | 19 | # with tf.Session(config=tf.ConfigProto(gpu_options=(tf.GPUOptions(per_process_gpu_memory_fraction=0.7)))) as sess: 20 | with tf.device('/cpu:0'): 21 | with tf.Session() as sess: 22 | images = tf.placeholder("float", [2, 224, 224, 3]) 23 | feed_dict = {images: batch} 24 | 25 | vgg = vgg16.Vgg16() 26 | with tf.name_scope("content_vgg"): 27 | vgg.build(images) 28 | 29 | prob = sess.run(vgg.prob, feed_dict=feed_dict) 30 | for i, p in enumerate(prob): 31 | v = sess.run(tf.nn.top_k(p, 5)) 32 | print('-'*4) 33 | for j, k in enumerate(v.indices): 34 | print(labels[k], ':', percent(v.values[j])) 35 | -------------------------------------------------------------------------------- /g3doc/COCO_val2014_000000224477.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/g3doc/COCO_val2014_000000224477.jpg -------------------------------------------------------------------------------- /g3doc/example_captions.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/g3doc/example_captions.jpg -------------------------------------------------------------------------------- /g3doc/show_and_tell_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/g3doc/show_and_tell_architecture.png -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libs/activations.py: -------------------------------------------------------------------------------- 1 | """Activations for TensorFlow. 2 | Parag K. Mital, Jan 2016.""" 3 | import tensorflow as tf 4 | 5 | 6 | def lrelu(x, leak=0.2, name="lrelu"): 7 | """Leaky rectifier. 8 | 9 | Parameters 10 | ---------- 11 | x : Tensor 12 | The tensor to apply the nonlinearity to. 13 | leak : float, optional 14 | Leakage parameter. 
15 | name : str, optional 16 | Variable scope to use. 17 | 18 | Returns 19 | ------- 20 | x : Tensor 21 | Output of the nonlinearity. 22 | """ 23 | with tf.variable_scope(name): 24 | f1 = 0.5 * (1 + leak) 25 | f2 = 0.5 * (1 - leak) 26 | return f1 * x + f2 * abs(x) 27 | -------------------------------------------------------------------------------- /libs/batch_norm.py: -------------------------------------------------------------------------------- 1 | """Batch Normalization for TensorFlow. 2 | Parag K. Mital, Jan 2016. 3 | """ 4 | 5 | import tensorflow as tf 6 | 7 | 8 | def batch_norm(x, phase_train, scope='bn', affine=True): 9 | """ 10 | Batch normalization on convolutional maps. 11 | 12 | from: https://stackoverflow.com/questions/33949786/how-could-i- 13 | use-batch-normalization-in-tensorflow 14 | 15 | Only modified to infer shape from input tensor x. 16 | 17 | Parameters 18 | ---------- 19 | x 20 | Tensor, 4D BHWD input maps 21 | phase_train 22 | boolean tf.Variable, true indicates training phase 23 | scope 24 | string, variable scope 25 | affine 26 | whether to affine-transform outputs 27 | 28 | Return 29 | ------ 30 | normed 31 | batch-normalized maps 32 | """ 33 | with tf.variable_scope(scope): 34 | shape = x.get_shape().as_list() 35 | 36 | beta = tf.Variable(tf.constant(0.0, shape=[shape[-1]]), 37 | name='beta', trainable=True) 38 | gamma = tf.Variable(tf.constant(1.0, shape=[shape[-1]]), 39 | name='gamma', trainable=affine) 40 | 41 | batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments') 42 | ema = tf.train.ExponentialMovingAverage(decay=0.9) 43 | ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var) 44 | 45 | def mean_var_with_update(): 46 | """Summary 47 | 48 | Returns 49 | ------- 50 | name : TYPE 51 | Description 52 | """ 53 | ema_apply_op = ema.apply([batch_mean, batch_var]) 54 | with tf.control_dependencies([ema_apply_op]): 55 | return tf.identity(batch_mean), tf.identity(batch_var) 56 | mean, var = tf.cond(phase_train, 57 | mean_var_with_update, 58 | lambda: (ema_mean, ema_var)) 59 | 60 | normed = tf.nn.batch_norm_with_global_normalization( 61 | x, mean, var, beta, gamma, 1e-3, affine) 62 | return normed 63 | -------------------------------------------------------------------------------- /libs/connections.py: -------------------------------------------------------------------------------- 1 | """APL 2.0 code from github.com/pkmital/tensorflow_tutorials w/ permission 2 | from Parag K. Mital. 3 | """ 4 | import math 5 | import tensorflow as tf 6 | 7 | 8 | def batch_norm(x, phase_train, scope='bn', affine=True): 9 | """ 10 | Batch normalization on convolutional maps. 11 | from: https://stackoverflow.com/questions/33949786/how-could-i- 12 | use-batch-normalization-in-tensorflow 13 | Only modified to infer shape from input tensor x. 
14 | Parameters 15 | ---------- 16 | x 17 | Tensor, 4D BHWD input maps 18 | phase_train 19 | boolean tf.Variable, true indicates training phase 20 | scope 21 | string, variable scope 22 | affine 23 | whether to affine-transform outputs 24 | Return 25 | ------ 26 | normed 27 | batch-normalized maps 28 | """ 29 | with tf.variable_scope(scope): 30 | og_shape = x.get_shape().as_list() 31 | if len(og_shape) == 2: 32 | x = tf.reshape(x, [-1, 1, 1, og_shape[1]]) 33 | shape = x.get_shape().as_list() 34 | beta = tf.Variable(tf.constant(0.0, shape=[shape[-1]]), 35 | name='beta', trainable=True) 36 | gamma = tf.Variable(tf.constant(1.0, shape=[shape[-1]]), 37 | name='gamma', trainable=affine) 38 | 39 | batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments') 40 | ema = tf.train.ExponentialMovingAverage(decay=0.9) 41 | ema_apply_op = ema.apply([batch_mean, batch_var]) 42 | ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var) 43 | 44 | def mean_var_with_update(): 45 | """Summary 46 | Returns 47 | ------- 48 | name : TYPE 49 | Description 50 | """ 51 | with tf.control_dependencies([ema_apply_op]): 52 | return tf.identity(batch_mean), tf.identity(batch_var) 53 | mean, var = tf.cond(phase_train, 54 | mean_var_with_update, 55 | lambda: (ema_mean, ema_var)) 56 | 57 | normed = tf.nn.batch_norm_with_global_normalization( 58 | x, mean, var, beta, gamma, 1e-3, affine) 59 | if len(og_shape) == 2: 60 | normed = tf.reshape(normed, [-1, og_shape[-1]]) 61 | return normed 62 | 63 | 64 | def lrelu(x, leak=0.2, name="lrelu"): 65 | """Leaky rectifier. 66 | Parameters 67 | ---------- 68 | x : Tensor 69 | The tensor to apply the nonlinearity to. 70 | leak : float, optional 71 | Leakage parameter. 72 | name : str, optional 73 | Variable scope to use. 74 | Returns 75 | ------- 76 | x : Tensor 77 | Output of the nonlinearity. 78 | """ 79 | with tf.variable_scope(name): 80 | f1 = 0.5 * (1 + leak) 81 | f2 = 0.5 * (1 - leak) 82 | return f1 * x + f2 * abs(x) 83 | 84 | 85 | def linear(x, n_units, scope=None, stddev=0.02, 86 | activation=lambda x: x): 87 | """Fully-connected network. 88 | Parameters 89 | ---------- 90 | x : Tensor 91 | Input tensor to the network. 92 | n_units : int 93 | Number of units to connect to. 94 | scope : str, optional 95 | Variable scope to use. 96 | stddev : float, optional 97 | Initialization's standard deviation. 98 | activation : arguments, optional 99 | Function which applies a nonlinearity 100 | Returns 101 | ------- 102 | x : Tensor 103 | Fully-connected output. 104 | """ 105 | shape = x.get_shape().as_list() 106 | 107 | with tf.variable_scope(scope or "Linear"): 108 | matrix = tf.get_variable("Matrix", [shape[1], n_units], tf.float32, 109 | tf.random_normal_initializer(stddev=stddev)) 110 | return activation(tf.matmul(x, matrix)) 111 | 112 | 113 | def conv2d(x, n_filters, 114 | k_h=5, k_w=5, 115 | stride_h=2, stride_w=2, 116 | stddev=0.02, 117 | activation=None, 118 | bias=True, 119 | padding='SAME', 120 | name="Conv2D"): 121 | """2D Convolution with options for kernel size, stride, and init deviation. 122 | 123 | Parameters 124 | ---------- 125 | x : Tensor 126 | Input tensor to convolve. 127 | n_filters : int 128 | Number of filters to apply. 129 | k_h : int, optional 130 | Kernel height. 131 | k_w : int, optional 132 | Kernel width. 133 | stride_h : int, optional 134 | Stride in rows. 135 | stride_w : int, optional 136 | Stride in cols. 137 | stddev : float, optional 138 | Initialization's standard deviation. 
139 | activation : arguments, optional 140 | Function which applies a nonlinearity 141 | padding : str, optional 142 | 'SAME' or 'VALID' 143 | name : str, optional 144 | Variable scope to use. 145 | 146 | Returns 147 | ------- 148 | x : Tensor 149 | Convolved input. 150 | """ 151 | with tf.variable_scope(name): 152 | w = tf.get_variable( 153 | 'w', [k_h, k_w, x.get_shape()[-1], n_filters], 154 | initializer=tf.truncated_normal_initializer(stddev=stddev)) 155 | conv = tf.nn.conv2d( 156 | x, w, strides=[1, stride_h, stride_w, 1], padding=padding) 157 | if bias: 158 | b = tf.get_variable( 159 | 'b', [n_filters], 160 | initializer=tf.truncated_normal_initializer(stddev=stddev)) 161 | conv = tf.nn.bias_add(conv, b) 162 | if activation: 163 | conv = activation(conv) 164 | return conv 165 | -------------------------------------------------------------------------------- /libs/dataset_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | 5 | 6 | def cifar10_download(dst='cifar10'): 7 | from six.moves import urllib 8 | import tarfile 9 | if not os.path.exists(dst): 10 | os.makedirs(dst) 11 | path = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' 12 | filepath, _ = urllib.request.urlretrieve(path, './') 13 | tarfile.open(filepath, 'r:gz').extractall(dst) 14 | 15 | 16 | def cifar10_load(dst='cifar10'): 17 | if not os.path.exists(dst): 18 | cifar10_download(dst) 19 | Xs = None 20 | ys = None 21 | for f in range(1, 6): 22 | cf = pickle.load(open( 23 | '%s/data_batch_%d' % (dst, f), 'rb'), 24 | encoding='LATIN') 25 | if Xs is not None: 26 | Xs = np.r_[Xs, cf['data']] 27 | ys = np.r_[ys, np.array(cf['labels'])] 28 | else: 29 | Xs = cf['data'] 30 | ys = cf['labels'] 31 | return Xs, ys 32 | 33 | 34 | def dense_to_one_hot(labels, n_classes=2): 35 | """Convert class labels from scalars to one-hot vectors.""" 36 | labels = np.array(labels) 37 | n_labels = labels.shape[0] 38 | index_offset = np.arange(n_labels) * n_classes 39 | labels_one_hot = np.zeros((n_labels, n_classes), dtype=np.float32) 40 | labels_one_hot.flat[index_offset + labels.ravel()] = 1 41 | return labels_one_hot 42 | 43 | 44 | class DatasetSplit(object): 45 | def __init__(self, images, labels): 46 | self.images = np.array(images).astype(np.float32) 47 | self.labels = np.array(labels).astype(np.int32) 48 | self.n_labels = len(np.unique(labels)) 49 | self.num_examples = len(self.images) 50 | 51 | def next_batch(self, batch_size=100): 52 | # Shuffle each epoch 53 | current_permutation = np.random.permutation(range(len(self.images))) 54 | epoch_images = self.images[current_permutation, ...] 
55 | epoch_labels = dense_to_one_hot( 56 | self.labels[current_permutation, ...], self.n_labels) 57 | 58 | # Then iterate over the epoch 59 | self.current_batch_idx = 0 60 | while self.current_batch_idx < len(self.images): 61 | end_idx = min( 62 | self.current_batch_idx + batch_size, len(self.images)) 63 | this_batch = { 64 | 'images': epoch_images[self.current_batch_idx:end_idx], 65 | 'labels': epoch_labels[self.current_batch_idx:end_idx] 66 | } 67 | self.current_batch_idx += batch_size 68 | yield this_batch['images'], this_batch['labels'] 69 | 70 | 71 | class Dataset(object): 72 | def __init__(self, Xs, ys, split=[0.8, 0.1, 0.1]): 73 | 74 | self.all_idxs = [] 75 | self.all_labels = [] 76 | self.all_inputs = [] 77 | self.train_idxs = [] 78 | self.valid_idxs = [] 79 | self.test_idxs = [] 80 | self.n_labels = 0 81 | self.split = split 82 | 83 | # Now mix all the labels that are currently stored as blocks 84 | self.all_inputs = Xs 85 | self.all_labels = ys 86 | n_idxs = len(self.all_inputs) 87 | idxs = range(n_idxs) 88 | rand_idxs = np.random.permutation(idxs) 89 | self.all_inputs = self.all_inputs[rand_idxs, ...] 90 | self.all_labels = self.all_labels[rand_idxs, ...] 91 | 92 | # Get splits 93 | self.train_idxs = idxs[:round(split[0] * n_idxs)] 94 | self.valid_idxs = idxs[len(self.train_idxs): 95 | len(self.train_idxs) + round(split[1] * n_idxs)] 96 | self.test_idxs = idxs[len(self.valid_idxs): 97 | len(self.valid_idxs) + round(split[2] * n_idxs)] 98 | 99 | @property 100 | def train(self): 101 | inputs = self.all_inputs[self.train_idxs, ...] 102 | labels = self.all_labels[self.train_idxs, ...] 103 | return DatasetSplit(inputs, labels) 104 | 105 | @property 106 | def valid(self): 107 | inputs = self.all_inputs[self.valid_idxs, ...] 108 | labels = self.all_labels[self.valid_idxs, ...] 109 | return DatasetSplit(inputs, labels) 110 | 111 | @property 112 | def test(self): 113 | inputs = self.all_inputs[self.test_idxs, ...] 114 | labels = self.all_labels[self.test_idxs, ...] 115 | return DatasetSplit(inputs, labels) 116 | 117 | def mean(self): 118 | return np.mean(self.all_inputs, axis=0) 119 | 120 | def std(self): 121 | return np.std(self.all_inputs, axis=0) 122 | -------------------------------------------------------------------------------- /libs/datasets.py: -------------------------------------------------------------------------------- 1 | """Loading datasets. 2 | 3 | Parag K. Mital, Jan. 2016 4 | """ 5 | import tensorflow.examples.tutorials.mnist.input_data as input_data 6 | from .dataset_utils import * 7 | 8 | 9 | def MNIST(one_hot=True): 10 | """Returns the MNIST dataset. 11 | 12 | Returns 13 | ------- 14 | mnist : DataSet 15 | DataSet object w/ convenienve props for accessing 16 | train/validation/test sets and batches. 17 | """ 18 | return input_data.read_data_sets('MNIST_data/', one_hot=one_hot) 19 | 20 | 21 | def CIFAR10(): 22 | # plt.imshow(np.transpose(np.reshape(cifar.train.images[10], (3, 32, 32)), [1, 2, 0])) 23 | Xs, ys = cifar10_load() 24 | return Dataset(Xs, ys) 25 | -------------------------------------------------------------------------------- /libs/utils.py: -------------------------------------------------------------------------------- 1 | """Some useful utilities when dealing with neural nets w/ tensorflow. 2 | 3 | Parag K. Mital, Jan. 2016 4 | """ 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | def montage_batch(images): 10 | """Draws all filters (n_input * n_output filters) as a 11 | montage image separated by 1 pixel borders. 
12 | 13 | Parameters 14 | ---------- 15 | batch : numpy.ndarray 16 | Input array to create montage of. 17 | 18 | Returns 19 | ------- 20 | m : numpy.ndarray 21 | Montage image. 22 | """ 23 | img_h = images.shape[1] 24 | img_w = images.shape[2] 25 | n_plots = int(np.ceil(np.sqrt(images.shape[0]))) 26 | m = np.ones( 27 | (images.shape[1] * n_plots + n_plots + 1, 28 | images.shape[2] * n_plots + n_plots + 1, 3)) * 0.5 29 | 30 | for i in range(n_plots): 31 | for j in range(n_plots): 32 | this_filter = i * n_plots + j 33 | if this_filter < images.shape[0]: 34 | this_img = images[this_filter, ...] 35 | m[1 + i + i * img_h:1 + i + (i + 1) * img_h, 36 | 1 + j + j * img_w:1 + j + (j + 1) * img_w, :] = this_img 37 | return m 38 | 39 | 40 | # %% 41 | def montage(W): 42 | """Draws all filters (n_input * n_output filters) as a 43 | montage image separated by 1 pixel borders. 44 | 45 | Parameters 46 | ---------- 47 | W : numpy.ndarray 48 | Input array to create montage of. 49 | 50 | Returns 51 | ------- 52 | m : numpy.ndarray 53 | Montage image. 54 | """ 55 | W = np.reshape(W, [W.shape[0], W.shape[1], 1, W.shape[2] * W.shape[3]]) 56 | n_plots = int(np.ceil(np.sqrt(W.shape[-1]))) 57 | m = np.ones( 58 | (W.shape[0] * n_plots + n_plots + 1, 59 | W.shape[1] * n_plots + n_plots + 1)) * 0.5 60 | for i in range(n_plots): 61 | for j in range(n_plots): 62 | this_filter = i * n_plots + j 63 | if this_filter < W.shape[-1]: 64 | m[1 + i + i * W.shape[0]:1 + i + (i + 1) * W.shape[0], 65 | 1 + j + j * W.shape[1]:1 + j + (j + 1) * W.shape[1]] = ( 66 | np.squeeze(W[:, :, :, this_filter])) 67 | return m 68 | 69 | 70 | 71 | 72 | # %% 73 | def corrupt(x): 74 | """Take an input tensor and add uniform masking. 75 | 76 | Parameters 77 | ---------- 78 | x : Tensor/Placeholder 79 | Input to corrupt. 80 | 81 | Returns 82 | ------- 83 | x_corrupted : Tensor 84 | 50 pct of values corrupted. 85 | """ 86 | return tf.multiply(x, tf.cast(tf.random_uniform(shape=tf.shape(x), 87 | minval=0, 88 | maxval=2, 89 | dtype=tf.int32), tf.float32)) 90 | 91 | 92 | # %% 93 | def weight_variable(shape): 94 | '''Helper function to create a weight variable initialized with 95 | a normal distribution 96 | 97 | Parameters 98 | ---------- 99 | shape : list 100 | Size of weight variable 101 | ''' 102 | initial = tf.random_normal(shape, mean=0.0, stddev=0.01) 103 | return tf.Variable(initial) 104 | 105 | 106 | # %% 107 | def bias_variable(shape): 108 | '''Helper function to create a bias variable initialized with 109 | a constant value. 
110 | 111 | Parameters 112 | ---------- 113 | shape : list 114 | Size of weight variable 115 | ''' 116 | initial = tf.random_normal(shape, mean=0.0, stddev=0.01) 117 | return tf.Variable(initial) 118 | -------------------------------------------------------------------------------- /mnist/mnist.pkl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/mnist/mnist.pkl.gz -------------------------------------------------------------------------------- /mnist/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/mnist/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /mnist/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/mnist/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /mnist/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/mnist/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /mnist/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wiibrew/DeepLearningCourseCodes/6b20c12415893f270b30c3cba640732c090b49ba/mnist/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /tf_1_try.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#\n", 12 | "import tensorflow as tf" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "5\n", 27 | "6\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "# direct sum with constand value\n", 33 | "a = tf.constant(2)\n", 34 | "b = tf.constant(3)\n", 35 | "c=a+b\n", 36 | "d=a*b\n", 37 | "\n", 38 | "sess=tf.Session()\n", 39 | "print sess.run(c)\n", 40 | "print sess.run(d)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "5\n", 55 | "6\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "# \n", 61 | "a = tf.placeholder(tf.int16)\n", 62 | "b = tf.placeholder(tf.int16)\n", 63 | "\n", 64 | "# \n", 65 | "add = tf.add(a, b)\n", 66 | "mul = tf.multiply(a, b)\n", 67 | "print sess.run(add, feed_dict={a: 2, b: 3})\n", 68 | "print sess.run(mul, feed_dict={a: 2, b: 3})" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 6, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "[[ 6. 
6.]\n", 83 | " [ 6. 6.]]\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "#\n", 89 | "matrix1 = tf.constant([[3., 3.]])\n", 90 | "matrix2 = tf.constant([[2.],[2.]])\n", 91 | "product = tf.matmul(matrix2, matrix1)\n", 92 | "print sess.run(product)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "[[ 6. 8. 10.]\n", 107 | " [ 7. 6. 5.]\n", 108 | " [ 5. 10. 15.]]\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "mat1=tf.Variable(tf.random_normal([3,2]))\n", 114 | "mat2=tf.Variable(tf.random_normal([2,3]))\n", 115 | "product=tf.matmul(mat1,mat2)\n", 116 | "\n", 117 | "m1=[[1,3],[2,1],[0,5]]\n", 118 | "m2=[[3,2,1],[1,2,3]]\n", 119 | "\n", 120 | "print sess.run(product,feed_dict={mat1:m1,mat2:m2})" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 2", 127 | "language": "python", 128 | "name": "python2" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 2 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython2", 140 | "version": "2.7.12" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | --------------------------------------------------------------------------------
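A closing usage note: the helpers in libs/connections.py (conv2d, batch_norm, lrelu, linear) appear above only as definitions. The following is a minimal sketch of how they compose into a single conv block under the TF 1.x API used throughout this repo; the input size, filter count, and layer names are illustrative assumptions, not taken from any course script.

import tensorflow as tf

from libs.connections import conv2d, batch_norm, lrelu, linear

# Placeholders: a batch of 32x32 RGB images and the usual train/eval switch.
x = tf.placeholder(tf.float32, [None, 32, 32, 3], name='x')
phase_train = tf.placeholder(tf.bool, name='phase_train')

# conv (stride 2 by default) -> batch norm -> leaky ReLU -> linear read-out.
h = conv2d(x, n_filters=16, k_h=3, k_w=3, name='conv1')   # -> [None, 16, 16, 16]
h = batch_norm(h, phase_train, scope='bn1')
h = lrelu(h, name='lrelu1')
h = tf.reshape(h, [-1, 16 * 16 * 16])                     # flatten for the linear layer
logits = linear(h, n_units=10, scope='readout')

At run time, phase_train would be fed True during training (so batch_norm updates its moving averages through the EMA op) and False for evaluation.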
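Similarly, libs/dataset_utils.py defines Dataset and DatasetSplit, but no course file above drives them directly. Here is a minimal, self-contained sketch using small random arrays in place of CIFAR-10 (so the download helper is not needed); the array shapes, class count, and batch size are illustrative assumptions.

import numpy as np

from libs.dataset_utils import Dataset

# Stand-in data: 100 tiny 8x8 RGB "images" over 4 classes.
Xs = np.random.rand(100, 8, 8, 3).astype(np.float32)
ys = np.tile(np.arange(4), 25)

# Shuffles internally and splits 80/10/10 into train/valid/test.
ds = Dataset(Xs, ys, split=[0.8, 0.1, 0.1])
print('train/valid/test sizes:',
      ds.train.num_examples, ds.valid.num_examples, ds.test.num_examples)

# One epoch of shuffled mini-batches; labels come back one-hot encoded.
for batch_images, batch_labels in ds.train.next_batch(batch_size=16):
    print(batch_images.shape, batch_labels.shape)   # (16, 8, 8, 3) (16, 4)

Note that each access to ds.train builds a fresh DatasetSplit, so it is worth assigning it to a variable before looping over several epochs.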