├── README.md
├── convertscript.sh
├── ensemble.sh
├── kaggle_dataset_decaf.py
├── kaggle_test.py
├── kaggle_test_multi.py
├── kaggle_train.py
├── kaggle_train_full.py
└── predict_no_batches.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kaggle-cifar10
==============

Code for the Kaggle Dogs vs. Cats competition
--------------------------------------------------------------------------------
/convertscript.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Requires ImageMagick: sudo apt-get install imagemagick
# The \! forces an exact 221x221 size, ignoring aspect ratio.
for i in *.jpg; do convert "$i" -resize 221x221\! "${i%.jpg}.png"; done
--------------------------------------------------------------------------------
/ensemble.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Train three classifiers for the majority-vote ensemble (see
# kaggle_test_multi.py, which requires an odd number of voters).
rm -f log.log tmp.log
mkdir -p ensemble_clf
for i in $(seq 1 3); do
    ./kaggle_train_full.py 2>&1 | tee -a log.log
    mv saved_clf.pkl ensemble_clf/clf$i.pkl
    echo "FINISHED RUN $i" >> tmp.log
done
--------------------------------------------------------------------------------
/kaggle_dataset_decaf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from decaf.util import transform
from decaf.scripts import imagenet
import logging
import numpy as np
from glob import glob
import matplotlib.image as mpimg
from random import shuffle
import pickle


def load_and_preprocess(net, imagepath, center_only=False,
                        scale=True, center_size=256):
    image = mpimg.imread(imagepath)
    # First, extract the center_size x center_size center crop.
    image = transform.scale_and_extract(transform.as_rgb(image), center_size)
    # Convert to float32 in [0, 255]; mpimg loads PNGs as floats in [0, 1].
    if scale:
        image = image.astype(np.float32) * 255.
    # Flip the image vertically to match decaf's expected orientation.
    image = image[::-1, :].copy()
    # Subtract the ImageNet mean.
    image -= net._data_mean
    # Oversample (here only the center crop, see activate below).
    images = net.oversample(image, center_only)
    return images


def activate(net, im):
    image = load_and_preprocess(net, im, center_only=True)
    # classify_direct must run so the layer activations get populated.
    net.classify_direct(image)
    # Grab the activations of fc6, the first fully connected layer.
    feat = net.feature('fc6_cudanet_out')[0]
    return feat


def png_to_np(basedir, fetch_target=False):
    logging.getLogger().setLevel(logging.INFO)
    data_root = '/home/kkastner/decaf_models/'
    net = imagenet.DecafNet(data_root + 'imagenet.decafnet.epoch90',
                            data_root + 'imagenet.decafnet.meta')
    files = glob(basedir + '*.png')
    if fetch_target:
        shuffle(files)
        # Labels are derived from the filenames, so they stay aligned with
        # the (shuffled) file list.
        target = np.array([1. if 'dog' in f.split("/")[-1] else 0.
                           for f in files],
                          dtype='float32')
    else:
        # Test files must be sorted numerically so predictions line up with
        # the submission ids!
        files = sorted(files,
                       key=lambda x: int(x.split("/")[-1].split(".")[-2]))
    feature_info = activate(net, files[0])
    feature_count = feature_info.shape[0]
    feature_dtype = feature_info.dtype
    data = np.zeros((len(files), feature_count), dtype=feature_dtype)
    for n, im in enumerate(files):
        data[n, :] = activate(net, im)
        if n % 1000 == 0:
            print 'Reading in image', n
    if fetch_target:
        return data, target
    else:
        return data

x, y = png_to_np(
    '/home/kkastner/kaggle_data/kaggle-dogs-vs-cats/train/', fetch_target=True)
tst = png_to_np('/home/kkastner/kaggle_data/kaggle-dogs-vs-cats/test1/')
pickle.dump(x, open('saved_x.pkl', 'wb'))
pickle.dump(y, open('saved_y.pkl', 'wb'))
pickle.dump(tst, open('saved_tst.pkl', 'wb'))
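The three pickles written above feed every other script in the repo. A quick
sanity check of the saved features (a minimal sketch; it assumes only the
files this script writes):

    import pickle
    import numpy as np

    x = pickle.load(open('saved_x.pkl', 'rb'))
    y = pickle.load(open('saved_y.pkl', 'rb'))
    tst = pickle.load(open('saved_tst.pkl', 'rb'))
    assert x.shape[0] == y.shape[0]    # one label per training image
    assert x.shape[1] == tst.shape[1]  # train/test feature dims must agree
    print 'train:', x.shape, 'test:', tst.shape, 'dogs:', int(y.sum())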
--------------------------------------------------------------------------------
/kaggle_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from pylearn2.datasets import DenseDesignMatrix
from pylearn2.utils import serial
from theano import tensor as T
from theano import function
import pickle
import numpy as np
import csv


def process(mdl, ds, batch_size=100):
    # batch_size must evenly divide the total number of samples!
    mdl.set_batch_size(batch_size)
    X = mdl.get_input_space().make_batch_theano()
    Y = mdl.fprop(X)
    y = T.argmax(Y, axis=1)
    f = function([X], y)
    yhat = []
    for i in xrange(ds.X.shape[0] / batch_size):
        x_arg = ds.X[i * batch_size:(i + 1) * batch_size, :]
        yhat.append(f(x_arg.astype(X.dtype)))
    return np.array(yhat)

tst = pickle.load(open('saved_tst.pkl', 'rb'))
ds = DenseDesignMatrix(X=tst)
mdl = serial.load('saved_clf.pkl')

fname = 'results.csv'
yhat = process(mdl, ds).ravel()

converted_results = [['id', 'label']] + [[n + 1, int(x)]
                                         for n, x in enumerate(yhat)]
with open(fname, 'w') as f:
    csv_f = csv.writer(f, delimiter=',', quoting=csv.QUOTE_NONE)
    csv_f.writerows(converted_results)
--------------------------------------------------------------------------------
/kaggle_test_multi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from pylearn2.datasets import DenseDesignMatrix
from pylearn2.utils import serial
from theano import tensor as T
from theano import function
from glob import glob
import pickle
import numpy as np
import csv
import gc


def process(mdl, ds, batch_size=100):
    # batch_size must evenly divide the total number of samples!
    mdl.set_batch_size(batch_size)
    X = mdl.get_input_space().make_batch_theano()
    Y = mdl.fprop(X)
    y = T.argmax(Y, axis=1)
    f = function([X], y)
    yhat = []
    for i in xrange(ds.X.shape[0] / batch_size):
        x_arg = ds.X[i * batch_size:(i + 1) * batch_size, :]
        yhat.append(f(x_arg.astype(X.dtype)))
    return np.array(yhat).ravel()

tst = pickle.load(open('saved_tst.pkl', 'rb'))
ds = DenseDesignMatrix(X=tst)
clfs = glob('ensemble_clf/*.pkl')
if (len(clfs) % 2) == 0:
    raise ValueError('Use an odd number of voters to avoid ties!')
mdls = (serial.load(f) for f in clfs)

fname = 'results.csv'
test_size = ds.X.shape[0]
res = np.zeros((len(clfs), test_size), dtype='float32')
for n, mdl in enumerate(mdls):
    res[n, :] = process(mdl, ds, batch_size=500)
    print "Processed model", n
    # Free each model before loading the next to avoid exhausting GPU memory.
    del mdl
    gc.collect()

yhat = np.round(np.mean(res, axis=0))
converted_results = [['id', 'label']] + [[n + 1, int(x)]
                                         for n, x in enumerate(yhat)]
with open(fname, 'w') as f:
    csv_f = csv.writer(f, delimiter=',', quoting=csv.QUOTE_NONE)
    csv_f.writerows(converted_results)
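Why kaggle_test_multi.py insists on an odd number of voters: np.round uses
round-half-to-even, so an even ensemble that splits 50/50 yields a mean of
0.5, which rounds to 0 and silently votes 'cat'. With an odd voter count the
mean can never land exactly on 0.5. A small worked example:

    import numpy as np

    votes = np.array([[1., 0., 1.],
                      [1., 1., 0.],
                      [0., 1., 1.]])     # 3 models x 3 test images
    print np.round(votes.mean(axis=0))   # [ 1.  1.  1.] -- each a 2-of-3 majority
    print np.round(0.5)                  # 0.0 -- an even-ensemble tie is lost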
--------------------------------------------------------------------------------
/kaggle_train.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from kaggle_train_full import *
from sklearn.cross_validation import train_test_split

# Hold out 20% of the training data for validation monitoring.
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=.2,
                                                    random_state=42)
trn = DenseDesignMatrix(X=X_train, y=y_train)
tst = DenseDesignMatrix(X=X_test, y=y_test)
trainer.monitoring_dataset = {'valid': tst,
                              'train': trn}
experiment.main_loop()
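One caveat: the star import also pulls in watcher, velocity and decay from
kaggle_train_full (listed next), and watcher still saves on the
'train_y_misclass' channel. pylearn2 prefixes monitor channels with the
monitoring_dataset keys, so a validation-based save would look roughly like
this (a hypothetical variant, not in the repo):

    # Assumes the kaggle_train_full names are in scope via the star import.
    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',  # 'valid' comes from the dict key above
        save_path='saved_clf.pkl')
    experiment.extensions = [watcher, velocity, decay]  # set before main_loop()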
--------------------------------------------------------------------------------
/kaggle_train_full.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from pylearn2.models import mlp
from pylearn2.costs.mlp.dropout import Dropout
from pylearn2.training_algorithms import sgd, learning_rule
from pylearn2.termination_criteria import EpochCounter
from pylearn2.datasets import DenseDesignMatrix
from pylearn2.train import Train
from pylearn2.train_extensions import best_params
from pylearn2.space import VectorSpace
import pickle
import numpy as np


def to_one_hot(l):
    # Convert a vector of integer labels to a one-hot matrix.
    out = np.zeros((len(l), len(set(l))))
    for n, i in enumerate(l):
        out[n, i] = 1.
    return out

x = pickle.load(open('saved_x.pkl', 'rb'))
y = pickle.load(open('saved_y.pkl', 'rb'))
y = to_one_hot(y)
in_space = VectorSpace(dim=x.shape[1])
full = DenseDesignMatrix(X=x, y=y)

l1 = mlp.RectifiedLinear(layer_name='l1',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

l2 = mlp.RectifiedLinear(layer_name='l2',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

l3 = mlp.RectifiedLinear(layer_name='l3',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

# NB: l4 is defined but never added to the layer stack below.
l4 = mlp.RectifiedLinear(layer_name='l4',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

output = mlp.HingeLoss(layer_name='y',
                       n_classes=2,
                       irange=.0001)

#output = mlp.Softmax(layer_name='y',
#                     n_classes=2,
#                     irange=.005)

layers = [l1, l2, l3, output]

mdl = mlp.MLP(layers,
              input_space=in_space)

lr = .0001
epochs = 100
trainer = sgd.SGD(learning_rate=lr,
                  batch_size=128,
                  learning_rule=learning_rule.Momentum(.5),
                  # Remember, the default dropout probability is .5
                  cost=Dropout(input_include_probs={'l1': .8},
                               input_scales={'l1': 1.}),
                  termination_criterion=EpochCounter(epochs),
                  monitoring_dataset={'train': full})

watcher = best_params.MonitorBasedSaveBest(
    channel_name='train_y_misclass',
    save_path='saved_clf.pkl')

velocity = learning_rule.MomentumAdjustor(final_momentum=.6,
                                          start=1,
                                          saturate=250)

decay = sgd.LinearDecayOverEpoch(start=1,
                                 saturate=250,
                                 decay_factor=lr * .05)

experiment = Train(dataset=full,
                   model=mdl,
                   algorithm=trainer,
                   extensions=[watcher, velocity, decay])

if __name__ == "__main__":
    experiment.main_loop()
--------------------------------------------------------------------------------
/predict_no_batches.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

"""
prediction code without batches
see http://fastml.com/how-to-get-predictions-from-pylearn2/
author: Zygmunt Zając
"""

import sys
import cPickle as pickle

from pylearn2.utils import serial
from theano import tensor as T
from theano import function

try:
    model_path = sys.argv[1]
    test_path = sys.argv[2]
    out_path = sys.argv[3]
except IndexError:
    print "Usage: predict.py <model file> <test file> <output file>"
    print "e.g.   predict.py saved_clf.pkl saved_tst.pkl results.csv\n"
    quit()

print "loading model..."

try:
    model = serial.load(model_path)
except Exception, e:
    print model_path + " doesn't seem to be a valid model path, got this error when trying to load it:"
    print e
    quit()

print "setting up symbolic expressions..."

X = model.get_input_space().make_theano_batch()
Y = model.fprop(X)
Y = T.argmax(Y, axis=1)

f = function([X], Y)

print "loading data and predicting..."

x = pickle.load(open(test_path, 'rb'))
y = f(x)

print "writing predictions..."

out = open(out_path, 'w')
out.write('id,label\n')

for i in xrange(y.shape[0]):
    p = y[i]
    out.write('{},{}\n'.format(i + 1, p))

out.close()
--------------------------------------------------------------------------------
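If soft predictions are wanted instead of hard labels, skip the argmax — a
sketch reusing the names from the script above (untested; this only makes
sense with the Softmax output layer, since fprop through the HingeLoss layer
yields margins rather than probabilities):

    X = model.get_input_space().make_theano_batch()
    Y = model.fprop(X)    # class probabilities, shape (n_samples, 2)
    f = function([X], Y)
    p = f(x)[:, 1]        # column 1 ~ P(dog) under the 0/1 labeling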