├── README.md
├── cifar100_curriculum_alexnet.py
├── cifar100_curriculum_resnet.py
└── cifar100_kmeans.py

/README.md:
--------------------------------------------------------------------------------
# curriculum-learning-for-deep-learning

What is curriculum learning?
* [Curriculum Learning](https://ronan.collobert.com/pub/matos/2009_curriculum_icml.pdf)
* [Automated Curriculum Learning for Neural Networks](http://proceedings.mlr.press/v70/graves17a/graves17a.pdf)


cifar100_curriculum_alexnet.py: curriculum learning for AlexNet.

cifar100_curriculum_resnet.py: curriculum learning for ResNet.

cifar100_kmeans.py: clusters the training data with k-means and writes `cluster.p`.


Clustering the data hierarchically with k-means is quite simple. You can also use [deep learning](https://github.com/elieJalbout/Clustering-with-Deep-learning) for the clustering step.
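The intended workflow, roughly: run `cifar100_kmeans.py` once to write `cluster.p`, then start either training script. Below is a minimal sketch of how the training scripts consume `cluster.p` (the stage index is only illustrative, and the loop mirrors `cifar100_curriculum_alexnet.py`; the ResNet script indexes the list from the other end):

```python
import pickle

# cluster.p, written by cifar100_kmeans.py, is a list of
# (cluster_id, [training-set indices]) pairs, most compact cluster first.
cluster_density_sorted = pickle.load(open("cluster.p", "rb"))

# Curriculum stage i trains on the union of the first i + 1 clusters.
stage = 2  # illustrative stage index
ids = []
for j in range(stage + 1):
    ids += cluster_density_sorted[j][1]
print(len(ids))  # number of training images available at this stage
```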
--------------------------------------------------------------------------------
/cifar100_curriculum_alexnet.py:
--------------------------------------------------------------------------------
import scipy.io
import tensorflow as tf
import os
from pylab import *
import numpy as np
import pickle
from numpy import *

def unpickle(file):
    import cPickle
    fo = open(file, 'rb')
    dict = cPickle.load(fo)
    fo.close()
    if 'data' in dict:
        dict['data'] = dict['data'].reshape((-1, 3, 32, 32)).swapaxes(1, 3).swapaxes(1, 2).reshape(-1, 32*32*3) / 256.
    return dict

def load_data_one(f):
    batch = unpickle(f)
    data = batch['data']
    labels = batch['fine_labels']
    print "Loading %s: %d" % (f, len(data))
    return data, labels

def load_data(files, data_dir, label_count):
    data, labels = load_data_one(data_dir + '/' + files[0])
    for f in files[1:]:
        data_n, labels_n = load_data_one(data_dir + '/' + f)
        data = np.append(data, data_n, axis=0)
        labels = np.append(labels, labels_n, axis=0)
    labels = np.array([ [ float(i == label) for i in xrange(label_count) ] for label in labels ])
    return data, labels

TRAINING_ITERATIONS = 200000
WEIGHT_DECAY = 0.0001
batch_size = 32
learning_rate = 0.01

data_dir = '/home/binhdt/cifar100'
image_size = 32
image_dim = image_size * image_size * 3
meta = unpickle(data_dir + '/meta')
label_names = meta['fine_label_names']
label_count = len(label_names)

train_data, train_labels = load_data(['train'], data_dir, label_count)
test_data, test_labels = load_data(['test'], data_dir, label_count)
print "Train:", np.shape(train_data), np.shape(train_labels)
print "Test:", np.shape(test_data), np.shape(test_labels)
data = {'train_data': train_data, 'train_labels': train_labels, 'test_data': test_data, 'test_labels': test_labels}
cluster_density_sorted = pickle.load(open("cluster.p", "rb"))
nb_cluster = len(cluster_density_sorted)
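
# cluster.p is produced by cifar100_kmeans.py. It is a list of
# (cluster_id, sample_indices) pairs over the training set, sorted by the
# mean distance of a cluster's points to its centroid (most compact first).
# The curriculum loop below trains stage i on the union of clusters 0..i,
# treating the most compact clusters as the "easy" examples.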

def print_activations(t):
    print(t.op.name, ' ', t.get_shape().as_list())

def dense_to_one_hot(labels_dense, num_classes):
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot

def read_images_from_disk(input_queue):
    label = input_queue[1]
    file_contents = tf.read_file(input_queue[0])
    example = tf.image.decode_jpeg(file_contents, channels=3)
    return example, label

def weight_variable(shape, name):
    initial = tf.truncated_normal(shape, stddev=0.01, name=name)
    return tf.Variable(initial)

def bias_variable(shape, name):
    initial = tf.constant(0.0, shape=shape, name=name)
    return tf.Variable(initial)

def conv(input, kernel, biases, k_h, k_w, c_o, s_h, s_w, padding="VALID", group=1):
    c_i = input.get_shape()[-1]
    assert c_i % group == 0
    assert c_o % group == 0
    convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)

    if group == 1:
        conv = convolve(input, kernel)
    else:
        # TF 1.x argument order: tf.split(value, num_splits, axis) / tf.concat(values, axis)
        input_groups = tf.split(input, group, axis=3)
        kernel_groups = tf.split(kernel, group, axis=3)
        output_groups = [convolve(i, k) for i, k in zip(input_groups, kernel_groups)]
        conv = tf.concat(output_groups, axis=3)
    return tf.reshape(tf.nn.bias_add(conv, biases), [-1] + conv.get_shape().as_list()[1:])

def conv2d(x, W, stride_h, stride_w, padding='SAME'):
    return tf.nn.conv2d(x, W, strides=[1, stride_h, stride_w, 1], padding=padding)

graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder('float', shape=[None, image_dim])
    y_ = tf.placeholder('float', shape=[None, label_count])
    lr = tf.placeholder("float", shape=[])

    conv1W = weight_variable([3, 3, 3, 64], 'conv1W')
    conv1b = bias_variable([64], 'conv1b')
    conv2W = weight_variable([5, 5, 64, 192], 'conv2W')
    conv2b = bias_variable([192], 'conv2b')
    conv3W = weight_variable([3, 3, 192, 256], 'conv3W')
    conv3b = bias_variable([256], 'conv3b')
    fc8W = weight_variable([1 * 1 * 256, label_count], 'fc8W')
    fc8b = bias_variable([label_count], 'fc8b')
    keep_prob = tf.placeholder('float')

    def model(x):
        k_h = 3; k_w = 3; c_o = 64; s_h = 4; s_w = 4; group = 1
        conv1_in = conv(x, conv1W, conv1b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
        conv1 = tf.nn.relu(conv1_in)
        radius = 5; alpha = 0.0001; beta = 0.75; bias = 1.0
        lrn1 = tf.nn.local_response_normalization(conv1, depth_radius=radius, alpha=alpha, beta=beta, bias=bias)
        maxpool1 = tf.nn.max_pool(lrn1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
        k_h = 5; k_w = 5; c_o = 192; s_h = 1; s_w = 1; group = 1
        conv2_in = conv(maxpool1, conv2W, conv2b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
        conv2 = tf.nn.relu(conv2_in)
        maxpool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
        k_h = 3; k_w = 3; c_o = 256; s_h = 1; s_w = 1; group = 1
        conv3_in = conv(maxpool2, conv3W, conv3b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
        conv3 = tf.nn.relu(conv3_in)
        maxpool3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
        fc7_drop = tf.nn.dropout(maxpool3, keep_prob)
        print_activations(fc7_drop)
        fc8 = tf.nn.xw_plus_b(tf.reshape(fc7_drop, [-1, int(prod(fc7_drop.get_shape()[1:]))]), fc8W, fc8b)
        return fc8

    logits = model(tf.reshape(x, [-1, 32, 32, 3]))
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
    regularizers = tf.nn.l2_loss(conv1W) + tf.nn.l2_loss(conv1b) +\
                   tf.nn.l2_loss(conv2W) + tf.nn.l2_loss(conv2b) +\
                   tf.nn.l2_loss(conv3W) + tf.nn.l2_loss(conv3b) +\
                   tf.nn.l2_loss(fc8W) + tf.nn.l2_loss(fc8b)
    loss = tf.reduce_mean(cross_entropy + WEIGHT_DECAY * regularizers)

    train_step = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True).minimize(loss)
    correct_prediction = tf.equal(tf.argmax(tf.nn.softmax(logits), 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
    saver = tf.train.Saver()
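
# Curriculum training: stage i takes the i + 1 most compact clusters from
# cluster_density_sorted, shuffles that subset, and (for i > 0) warm-starts
# from the checkpoint saved at the end of stage i - 1. The learning rate is
# dropped to 0.001 and 0.0001 at 50% and 75% of TRAINING_ITERATIONS; note
# that learning_rate is a module-level variable, so it is not reset to 0.01
# between stages.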

with tf.Session(graph=graph) as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for i in range(nb_cluster):
        id = []
        for j in range(i + 1):
            id = id + cluster_density_sorted[j][1]
        xtrain = train_data[id]
        ytrain = train_labels[id]

        pi = np.random.permutation(len(xtrain))
        xtrain, ytrain = xtrain[pi], ytrain[pi]

        if i > 0:
            saver.restore(sess, './curriculum_alexnet_cluster' + str(i - 1) + '.ckpt')

        for it in range(TRAINING_ITERATIONS):
            if it == TRAINING_ITERATIONS * 50/100: learning_rate = 0.001
            if it == TRAINING_ITERATIONS * 75/100: learning_rate = 0.0001
            if it * batch_size % len(xtrain) + batch_size <= len(xtrain):
                start = it * batch_size % len(xtrain)
                end = start + batch_size
            else:
                start = it * batch_size % len(xtrain)
                end = len(xtrain)

            _, train_accuracy, cost = sess.run([train_step, accuracy, cross_entropy],
                feed_dict={x: xtrain[start:end], y_: ytrain[start:end], keep_prob: 0.5, lr: learning_rate})

            if it % 200 == 0:
                print i, it, train_accuracy, cost, accuracy.eval(feed_dict={x: test_data, y_: test_labels, keep_prob: 1.0})

        saver.save(sess, './curriculum_alexnet_cluster' + str(i) + '.ckpt')

    sess.close()
--------------------------------------------------------------------------------
/cifar100_curriculum_resnet.py:
--------------------------------------------------------------------------------
import scipy.io
import tensorflow as tf
import os
from pylab import *
import numpy as np
import pickle
from numpy import *
import math

def unpickle(file):
    import cPickle
    fo = open(file, 'rb')
    dict = cPickle.load(fo)
    fo.close()
    if 'data' in dict:
        dict['data'] = dict['data'].reshape((-1, 3, 32, 32)).swapaxes(1, 3).swapaxes(1, 2).reshape(-1, 32*32*3) / 256.
    return dict

def load_data_one(f):
    batch = unpickle(f)
    data = batch['data']
    labels = batch['fine_labels']
    print "Loading %s: %d" % (f, len(data))
    return data, labels

def load_data(files, data_dir, label_count):
    data, labels = load_data_one(data_dir + '/' + files[0])
    for f in files[1:]:
        data_n, labels_n = load_data_one(data_dir + '/' + f)
        data = np.append(data, data_n, axis=0)
        labels = np.append(labels, labels_n, axis=0)
    labels = np.array([ [ float(i == label) for i in xrange(label_count) ] for label in labels ])
    return data, labels

TRAINING_ITERATIONS = 200000
WEIGHT_DECAY = 0.0001
batch_size = 64

data_dir = '/home/binhdt/cifar100'
image_size = 32
image_dim = image_size * image_size * 3
meta = unpickle(data_dir + '/meta')
label_names = meta['fine_label_names']
label_count = len(label_names)

train_data, train_labels = load_data(['train'], data_dir, label_count)
test_data, test_labels = load_data(['test'], data_dir, label_count)
print "Train:", np.shape(train_data), np.shape(train_labels)
print "Test:", np.shape(test_data), np.shape(test_labels)
data = {'train_data': train_data, 'train_labels': train_labels, 'test_data': test_data, 'test_labels': test_labels}
cluster_density_sorted = pickle.load(open("cluster.p", "rb"))
nb_cluster = len(cluster_density_sorted)

def print_activations(t):
    print(t.op.name, ' ', t.get_shape().as_list())

def dense_to_one_hot(labels_dense, num_classes):
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot

def read_images_from_disk(input_queue):
    label = input_queue[1]
    file_contents = tf.read_file(input_queue[0])
    example = tf.image.decode_jpeg(file_contents, channels=3)
    return example, label


def run_in_batch_avg(session, tensors, batch_placeholders, feed_dict={}, batch_size=200):
    res = [ 0 ] * len(tensors)
    batch_tensors = [ (placeholder, feed_dict[ placeholder ]) for placeholder in batch_placeholders ]
    total_size = len(batch_tensors[0][1])
    batch_count = (total_size + batch_size - 1) / batch_size
    for batch_idx in xrange(batch_count):
        current_batch_size = None
        for (placeholder, tensor) in batch_tensors:
            batch_tensor = tensor[ batch_idx*batch_size : (batch_idx+1)*batch_size ]
            current_batch_size = len(batch_tensor)
            feed_dict[placeholder] = tensor[ batch_idx*batch_size : (batch_idx+1)*batch_size ]
        tmp = session.run(tensors, feed_dict=feed_dict)
        res = [ r + t * current_batch_size for (r, t) in zip(res, tmp) ]
    return [ r / float(total_size) for r in res ]

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial)

def conv2d(input, in_features, out_features, kernel_size, stride):
    W = weight_variable([ kernel_size, kernel_size, in_features, out_features ])
    return tf.nn.conv2d(input, W, [ 1, stride, stride, 1 ], padding='SAME')

def basic_block(input, in_features, out_features, stride, is_training, keep_prob):
    if stride == 1:
        shortcut = input
    else:
        shortcut = tf.nn.avg_pool(input, [ 1, stride, stride, 1 ], [ 1, stride, stride, 1 ], 'VALID')
        shortcut = tf.pad(shortcut, [[0, 0], [0, 0], [0, 0],
            [(out_features-in_features)//2, (out_features-in_features)//2]])
    current = conv2d(input, in_features, out_features, 3, stride)
    current = tf.nn.dropout(current, keep_prob)
    current = tf.contrib.layers.batch_norm(current, scale=True, is_training=is_training, updates_collections=None)
    current = tf.nn.relu(current)
    current = conv2d(current, out_features, out_features, 3, 1)
    current = tf.nn.dropout(current, keep_prob)
    current = tf.contrib.layers.batch_norm(current, scale=True, is_training=is_training, updates_collections=None)
    # No final relu as per http://torch.ch/blog/2016/02/04/resnets.html
    return current + shortcut

def block_stack(input, in_features, out_features, stride, depth, is_training, keep_prob):
    current = basic_block(input, in_features, out_features, stride, is_training, keep_prob)
    for _d in xrange(depth - 1):
        current = basic_block(current, out_features, out_features, 1, is_training, keep_prob)
    return current
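
# The three stacks built below follow the 6n+2 CIFAR ResNet layout with n = 18
# (two 3x3 convolutions per basic block, widths 16/32/64), i.e. roughly a
# 110-layer network. Strided blocks downsample the shortcut with average
# pooling and zero-pad the extra channels ("option A"), so the skip path adds
# no parameters.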

graph = tf.Graph()
with graph.as_default():
    xs = tf.placeholder("float", shape=[None, image_dim])
    ys = tf.placeholder("float", shape=[None, label_count])
    lr = tf.placeholder("float", shape=[])
    keep_prob = tf.placeholder(tf.float32)
    is_training = tf.placeholder("bool", shape=[])

    current = tf.reshape(xs, [ -1, 32, 32, 3 ])
    current = conv2d(current, 3, 16, 3, 1)
    current = tf.nn.relu(current)

    # dimension is 32x32x16
    current = block_stack(current, 16, 16, 1, 18, is_training, keep_prob)
    current = block_stack(current, 16, 32, 2, 18, is_training, keep_prob)
    # dimension is 16x16x32
    current = block_stack(current, 32, 64, 2, 18, is_training, keep_prob)
    # dimension is 8x8x64

    current = tf.reduce_mean(current, reduction_indices=[1, 2], name="avg_pool")
    final_dim = 64
    current = tf.reshape(current, [ -1, final_dim ])
    Wfc = weight_variable([ final_dim, label_count ])
    bfc = bias_variable([ label_count ])
    ys_ = tf.nn.softmax( tf.matmul(current, Wfc) + bfc )

    cross_entropy = -tf.reduce_mean(ys * tf.log(ys_ + 1e-12))
    train_step = tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(ys_, 1), tf.argmax(ys, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
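
# Curriculum training for the ResNet: stage i is built from the tail of
# cluster_density_sorted (index nb_cluster - j - 1), i.e. the opposite end of
# the ordering used by the AlexNet script. Every stage except the last runs
# 9 epochs; the final stage, which covers the whole training set, runs 300
# epochs with the learning rate dropped at 1/3 and 2/3 of the schedule.
# Checkpoints reuse the 'curriculum_alexnet_cluster' prefix from the AlexNet
# script.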

with tf.Session(graph=graph) as session:
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    for i in xrange(0, nb_cluster):
        id = []
        for j in range(i + 1):
            id = id + cluster_density_sorted[nb_cluster-j-1][1]
        xtrain = train_data[id]
        ytrain = train_labels[id]

        pi = np.random.permutation(len(xtrain))
        xtrain, ytrain = xtrain[pi], ytrain[pi]

        if i > 0:
            saver.restore(session, './curriculum_alexnet_cluster' + str(i - 1) + '.ckpt')

        batch_count = len(xtrain) / batch_size
        batches_data = np.split(xtrain[:batch_count*batch_size], batch_count)
        batches_labels = np.split(ytrain[:batch_count*batch_size], batch_count)
        learning_rate = 0.1

        if i < nb_cluster - 1:
            nb_epoch = 9
        else:
            nb_epoch = 300

        for epoch in xrange(1, 1+nb_epoch):
            if epoch == math.floor(nb_epoch/3): learning_rate = 0.01
            if epoch == math.floor(2*nb_epoch/3): learning_rate = 0.001
            for batch_idx in xrange(batch_count):
                batch_data = batches_data[batch_idx]
                batch_labels = batches_labels[batch_idx]

                batch_res = session.run([ train_step, cross_entropy, accuracy ],
                    feed_dict = { xs: batch_data, ys: batch_labels, lr: learning_rate, is_training: True, keep_prob: 0.8 })

            test_results = run_in_batch_avg(session, [ cross_entropy, accuracy ], [ xs, ys ],
                feed_dict = { xs: data['test_data'], ys: data['test_labels'], is_training: False, keep_prob: 1. })
            print epoch, batch_res[1:], test_results

        saver.save(session, './curriculum_alexnet_cluster' + str(i) + '.ckpt')
--------------------------------------------------------------------------------
/cifar100_kmeans.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.cluster import KMeans
import operator
from collections import defaultdict
import pickle
from sklearn.decomposition import PCA

def unpickle(file):
    import cPickle
    fo = open(file, 'rb')
    dict = cPickle.load(fo)
    fo.close()
    return dict

def load_data_one(f):
    batch = unpickle(f)
    print batch.keys()
    data = batch['data']
    labels = batch['fine_labels']
    print "Loading %s: %d" % (f, len(data))
    return data, labels

def load_data(files, data_dir, label_count):
    data, labels = load_data_one(data_dir + '/' + files[0])
    for f in files[1:]:
        data_n, labels_n = load_data_one(data_dir + '/' + f)
        data = np.append(data, data_n, axis=0)
        labels = np.append(labels, labels_n, axis=0)
    labels = np.array([ [ float(i == label) for i in xrange(label_count) ] for label in labels ])
    return data, labels

def grayscale(a):
    return a.reshape(a.shape[0], 3, 32, 32).mean(1).reshape(a.shape[0], -1) / 256.
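
# run() below clusters the training images as follows: each image is reduced
# to a 1024-dimensional grayscale vector (mean over the three channels,
# scaled to [0, 1)), projected onto its first 2 principal components, and
# grouped into 100 k-means clusters. Each cluster is scored by the mean
# distance of its points to the cluster centre, and the
# (cluster_id, member_indices) pairs are written to cluster.p sorted by that
# score, most compact cluster first. Because only 2 PCA components are kept,
# cluster membership reflects only very coarse image statistics.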

def run():
    data_dir = '../../cifar100'
    image_size = 32
    image_dim = image_size * image_size * 3
    meta = unpickle(data_dir + '/meta')
    label_names = meta['fine_label_names']
    label_count = len(label_names)

    train_files = [ 'train' ]
    train_data, train_labels = load_data(train_files, data_dir, label_count)
    train_data = grayscale(train_data)
    test_data, test_labels = load_data([ 'test' ], data_dir, label_count)
    test_data = grayscale(test_data)
    print "Train:", np.shape(train_data), np.shape(train_labels)
    print "Test:", np.shape(test_data), np.shape(test_labels)
    data = { 'train_data': train_data,
             'train_labels': train_labels,
             'test_data': test_data,
             'test_labels': test_labels }

    reduced_data = PCA(n_components=2).fit_transform(train_data)
    kmeans = KMeans(n_clusters=100, random_state=0, precompute_distances=True, max_iter=1000, n_init=20).fit(reduced_data)

    cluster_density = dict()
    cluster = defaultdict(list)

    for i in range(len(kmeans.labels_)):
        cluster[kmeans.labels_[i]].append(i)
        if kmeans.labels_[i] in cluster_density:
            cluster_density[kmeans.labels_[i]] = cluster_density[kmeans.labels_[i]] + np.linalg.norm(reduced_data[i] - kmeans.cluster_centers_[kmeans.labels_[i]])
        else:
            cluster_density[kmeans.labels_[i]] = np.linalg.norm(reduced_data[i] - kmeans.cluster_centers_[kmeans.labels_[i]])

    for i in set(kmeans.labels_):
        cluster_density[i] = cluster_density[i]/len(cluster[i])

    curriculum_cluster = sorted(cluster_density.items(), key=operator.itemgetter(1))
    print curriculum_cluster
    cluster_density_sorted = list()
    for tup in curriculum_cluster:
        cluster_density_sorted.append((tup[0], cluster[tup[0]]))
    pickle.dump(cluster_density_sorted, open( "cluster.p", "wb" ))

run()
--------------------------------------------------------------------------------