├── README.md
├── convertscript.sh
├── ensemble.sh
├── kaggle_dataset_decaf.py
├── kaggle_test.py
├── kaggle_test_multi.py
├── kaggle_train.py
├── kaggle_train_full.py
└── predict_no_batches.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kaggle-cifar10
==============

Code for the Kaggle Dogs vs. Cats competition
--------------------------------------------------------------------------------
/convertscript.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Requires ImageMagick: sudo apt-get install imagemagick
# The \! forces an exact 221x221 size, ignoring aspect ratio.
for i in *.jpg; do convert "$i" -resize 221x221\! "${i%.jpg}.png"; done
--------------------------------------------------------------------------------
/ensemble.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Train three classifiers for the majority-vote ensemble (see
# kaggle_test_multi.py, which requires an odd number of voters).
rm -f log.log tmp.log
mkdir -p ensemble_clf
for i in $(seq 1 3); do
    ./kaggle_train_full.py 2>&1 | tee -a log.log
    mv saved_clf.pkl ensemble_clf/clf$i.pkl
    echo "FINISHED RUN $i" >> tmp.log
done
--------------------------------------------------------------------------------
/kaggle_dataset_decaf.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from decaf.util import transform
from decaf.scripts import imagenet
import logging
import numpy as np
from glob import glob
import matplotlib.image as mpimg
from random import shuffle
import pickle


def load_and_preprocess(net, imagepath, center_only=False,
                        scale=True, center_size=256):
    image = mpimg.imread(imagepath)
    # First, extract the center_size x center_size center crop.
    image = transform.scale_and_extract(transform.as_rgb(image), center_size)
    # Convert to float32 in [0, 255]; mpimg loads PNGs as floats in [0, 1].
    if scale:
        image = image.astype(np.float32) * 255.
    # Flip the image vertically to match decaf's expected orientation.
    image = image[::-1, :].copy()
    # Subtract the ImageNet mean.
    image -= net._data_mean
    # Oversample (here only the center crop, see activate below).
    images = net.oversample(image, center_only)
    return images


def activate(net, im):
    image = load_and_preprocess(net, im, center_only=True)
    # classify_direct must run so the layer activations get populated.
    net.classify_direct(image)
    # Grab the activations of fc6, the first fully connected layer.
    feat = net.feature('fc6_cudanet_out')[0]
    return feat


def png_to_np(basedir, fetch_target=False):
    logging.getLogger().setLevel(logging.INFO)
    data_root = '/home/kkastner/decaf_models/'
    net = imagenet.DecafNet(data_root + 'imagenet.decafnet.epoch90',
                            data_root + 'imagenet.decafnet.meta')
    files = glob(basedir + '*.png')
    if fetch_target:
        shuffle(files)
        # Labels are derived from the filenames, so they stay aligned with
        # the (shuffled) file list.
        target = np.array([1. if 'dog' in f.split("/")[-1] else 0.
                           for f in files],
                          dtype='float32')
    else:
        # Test files must be sorted numerically so predictions line up with
        # the submission ids!
        files = sorted(files,
                       key=lambda x: int(x.split("/")[-1].split(".")[-2]))
    feature_info = activate(net, files[0])
    feature_count = feature_info.shape[0]
    feature_dtype = feature_info.dtype
    data = np.zeros((len(files), feature_count), dtype=feature_dtype)
    for n, im in enumerate(files):
        data[n, :] = activate(net, im)
        if n % 1000 == 0:
            print 'Reading in image', n
    if fetch_target:
        return data, target
    else:
        return data

x, y = png_to_np(
    '/home/kkastner/kaggle_data/kaggle-dogs-vs-cats/train/', fetch_target=True)
tst = png_to_np('/home/kkastner/kaggle_data/kaggle-dogs-vs-cats/test1/')
pickle.dump(x, open('saved_x.pkl', 'wb'))
pickle.dump(y, open('saved_y.pkl', 'wb'))
pickle.dump(tst, open('saved_tst.pkl', 'wb'))
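The three pickles written above feed every other script in the repo. A quick
sanity check of the saved features (a minimal sketch; it assumes only the
files this script writes):

    import pickle
    import numpy as np

    x = pickle.load(open('saved_x.pkl', 'rb'))
    y = pickle.load(open('saved_y.pkl', 'rb'))
    tst = pickle.load(open('saved_tst.pkl', 'rb'))
    assert x.shape[0] == y.shape[0]    # one label per training image
    assert x.shape[1] == tst.shape[1]  # train/test feature dims must agree
    print 'train:', x.shape, 'test:', tst.shape, 'dogs:', int(y.sum())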
--------------------------------------------------------------------------------
/kaggle_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from pylearn2.datasets import DenseDesignMatrix
from pylearn2.utils import serial
from theano import tensor as T
from theano import function
import pickle
import numpy as np
import csv


def process(mdl, ds, batch_size=100):
    # batch_size must evenly divide the total number of samples!
    mdl.set_batch_size(batch_size)
    X = mdl.get_input_space().make_batch_theano()
    Y = mdl.fprop(X)
    y = T.argmax(Y, axis=1)
    f = function([X], y)
    yhat = []
    for i in xrange(ds.X.shape[0] / batch_size):
        x_arg = ds.X[i * batch_size:(i + 1) * batch_size, :]
        yhat.append(f(x_arg.astype(X.dtype)))
    return np.array(yhat)

tst = pickle.load(open('saved_tst.pkl', 'rb'))
ds = DenseDesignMatrix(X=tst)
mdl = serial.load('saved_clf.pkl')

fname = 'results.csv'
yhat = process(mdl, ds).ravel()

converted_results = [['id', 'label']] + [[n + 1, int(x)]
                                         for n, x in enumerate(yhat)]
with open(fname, 'w') as f:
    csv_f = csv.writer(f, delimiter=',', quoting=csv.QUOTE_NONE)
    csv_f.writerows(converted_results)
--------------------------------------------------------------------------------
/kaggle_test_multi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from pylearn2.datasets import DenseDesignMatrix
from pylearn2.utils import serial
from theano import tensor as T
from theano import function
from glob import glob
import pickle
import numpy as np
import csv
import gc


def process(mdl, ds, batch_size=100):
    # batch_size must evenly divide the total number of samples!
    mdl.set_batch_size(batch_size)
    X = mdl.get_input_space().make_batch_theano()
    Y = mdl.fprop(X)
    y = T.argmax(Y, axis=1)
    f = function([X], y)
    yhat = []
    for i in xrange(ds.X.shape[0] / batch_size):
        x_arg = ds.X[i * batch_size:(i + 1) * batch_size, :]
        yhat.append(f(x_arg.astype(X.dtype)))
    return np.array(yhat).ravel()

tst = pickle.load(open('saved_tst.pkl', 'rb'))
ds = DenseDesignMatrix(X=tst)
clfs = glob('ensemble_clf/*.pkl')
if (len(clfs) % 2) == 0:
    raise ValueError('Use an odd number of voters to avoid ties!')
mdls = (serial.load(f) for f in clfs)

fname = 'results.csv'
test_size = ds.X.shape[0]
res = np.zeros((len(clfs), test_size), dtype='float32')
for n, mdl in enumerate(mdls):
    res[n, :] = process(mdl, ds, batch_size=500)
    print "Processed model", n
    # Free each model before loading the next to avoid exhausting GPU memory.
    del mdl
    gc.collect()

yhat = np.round(np.mean(res, axis=0))
converted_results = [['id', 'label']] + [[n + 1, int(x)]
                                         for n, x in enumerate(yhat)]
with open(fname, 'w') as f:
    csv_f = csv.writer(f, delimiter=',', quoting=csv.QUOTE_NONE)
    csv_f.writerows(converted_results)
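Why kaggle_test_multi.py insists on an odd number of voters: np.round uses
round-half-to-even, so an even ensemble that splits 50/50 yields a mean of
0.5, which rounds to 0 and silently votes 'cat'. With an odd voter count the
mean can never land exactly on 0.5. A small worked example:

    import numpy as np

    votes = np.array([[1., 0., 1.],
                      [1., 1., 0.],
                      [0., 1., 1.]])     # 3 models x 3 test images
    print np.round(votes.mean(axis=0))   # [ 1.  1.  1.] -- each a 2-of-3 majority
    print np.round(0.5)                  # 0.0 -- an even-ensemble tie is lost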
--------------------------------------------------------------------------------
/kaggle_train.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from kaggle_train_full import *
from sklearn.cross_validation import train_test_split

# Hold out 20% of the training data for validation monitoring.
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=.2,
                                                    random_state=42)
trn = DenseDesignMatrix(X=X_train, y=y_train)
tst = DenseDesignMatrix(X=X_test, y=y_test)
trainer.monitoring_dataset = {'valid': tst,
                              'train': trn}
experiment.main_loop()
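One caveat: the star import also pulls in watcher, velocity and decay from
kaggle_train_full (listed next), and watcher still saves on the
'train_y_misclass' channel. pylearn2 prefixes monitor channels with the
monitoring_dataset keys, so a validation-based save would look roughly like
this (a hypothetical variant, not in the repo):

    # Assumes the kaggle_train_full names are in scope via the star import.
    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',  # 'valid' comes from the dict key above
        save_path='saved_clf.pkl')
    experiment.extensions = [watcher, velocity, decay]  # set before main_loop()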
--------------------------------------------------------------------------------
/kaggle_train_full.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from pylearn2.models import mlp
from pylearn2.costs.mlp.dropout import Dropout
from pylearn2.training_algorithms import sgd, learning_rule
from pylearn2.termination_criteria import EpochCounter
from pylearn2.datasets import DenseDesignMatrix
from pylearn2.train import Train
from pylearn2.train_extensions import best_params
from pylearn2.space import VectorSpace
import pickle
import numpy as np


def to_one_hot(l):
    # Convert a vector of integer labels to a one-hot matrix.
    out = np.zeros((len(l), len(set(l))))
    for n, i in enumerate(l):
        out[n, i] = 1.
    return out

x = pickle.load(open('saved_x.pkl', 'rb'))
y = pickle.load(open('saved_y.pkl', 'rb'))
y = to_one_hot(y)
in_space = VectorSpace(dim=x.shape[1])
full = DenseDesignMatrix(X=x, y=y)

l1 = mlp.RectifiedLinear(layer_name='l1',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

l2 = mlp.RectifiedLinear(layer_name='l2',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

l3 = mlp.RectifiedLinear(layer_name='l3',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

# NB: l4 is defined but never added to the layer stack below.
l4 = mlp.RectifiedLinear(layer_name='l4',
                         sparse_init=12,
                         dim=5000,
                         max_col_norm=1.)

output = mlp.HingeLoss(layer_name='y',
                       n_classes=2,
                       irange=.0001)

#output = mlp.Softmax(layer_name='y',
#                     n_classes=2,
#                     irange=.005)

layers = [l1, l2, l3, output]

mdl = mlp.MLP(layers,
              input_space=in_space)

lr = .0001
epochs = 100
trainer = sgd.SGD(learning_rate=lr,
                  batch_size=128,
                  learning_rule=learning_rule.Momentum(.5),
                  # Remember, the default dropout probability is .5
                  cost=Dropout(input_include_probs={'l1': .8},
                               input_scales={'l1': 1.}),
                  termination_criterion=EpochCounter(epochs),
                  monitoring_dataset={'train': full})

watcher = best_params.MonitorBasedSaveBest(
    channel_name='train_y_misclass',
    save_path='saved_clf.pkl')

velocity = learning_rule.MomentumAdjustor(final_momentum=.6,
                                          start=1,
                                          saturate=250)

decay = sgd.LinearDecayOverEpoch(start=1,
                                 saturate=250,
                                 decay_factor=lr * .05)

experiment = Train(dataset=full,
                   model=mdl,
                   algorithm=trainer,
                   extensions=[watcher, velocity, decay])

if __name__ == "__main__":
    experiment.main_loop()
--------------------------------------------------------------------------------
/predict_no_batches.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

"""
prediction code without batches
see http://fastml.com/how-to-get-predictions-from-pylearn2/
author: Zygmunt Zając
"""

import sys
import cPickle as pickle

from pylearn2.utils import serial
from theano import tensor as T
from theano import function

try:
    model_path = sys.argv[1]
    test_path = sys.argv[2]
    out_path = sys.argv[3]
except IndexError:
    print "Usage: predict.py <model file> <test file> <output file>"
    print "e.g.   predict.py saved_clf.pkl saved_tst.pkl results.csv\n"
    quit()

print "loading model..."

try:
    model = serial.load(model_path)
except Exception, e:
    print model_path + " doesn't seem to be a valid model path, got this error when trying to load it:"
    print e
    quit()

print "setting up symbolic expressions..."

X = model.get_input_space().make_theano_batch()
Y = model.fprop(X)
Y = T.argmax(Y, axis=1)

f = function([X], Y)

print "loading data and predicting..."

x = pickle.load(open(test_path, 'rb'))
y = f(x)

print "writing predictions..."

out = open(out_path, 'w')
out.write('id,label\n')

for i in xrange(y.shape[0]):
    p = y[i]
    out.write('{},{}\n'.format(i + 1, p))

out.close()
--------------------------------------------------------------------------------
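If soft predictions are wanted instead of hard labels, skip the argmax — a
sketch reusing the names from the script above (untested; this only makes
sense with the Softmax output layer, since fprop through the HingeLoss layer
yields margins rather than probabilities):

    X = model.get_input_space().make_theano_batch()
    Y = model.fprop(X)    # class probabilities, shape (n_samples, 2)
    f = function([X], Y)
    p = f(x)[:, 1]        # column 1 ~ P(dog) under the 0/1 labeling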