├── log └── README.md ├── saved_models └── README.md ├── data ├── README.md ├── expt_42.jpg ├── SWTPF_counts.csv ├── original │ └── README.md └── SWTPF-leaderboard-scores.csv ├── submissions └── README.md ├── requirements.txt ├── requirements_dev_sklearn.txt ├── LICENSE ├── .gitignore ├── images.py ├── launch-processes.bash ├── Makefile ├── utils.py ├── convert-binary-data.py ├── expand-np-arrays.py ├── README.md ├── notes.md └── models.py /log/README.md: -------------------------------------------------------------------------------- 1 | this is where we'll write logs 2 | -------------------------------------------------------------------------------- /saved_models/README.md: -------------------------------------------------------------------------------- 1 | this is where we'll save our models 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Data and intermediate files go here, but aren't committed. 2 | -------------------------------------------------------------------------------- /submissions/README.md: -------------------------------------------------------------------------------- 1 | This is where we'll write out all submission files. 2 | -------------------------------------------------------------------------------- /data/expt_42.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrmontag/mnist-sklearn/HEAD/data/expt_42.jpg -------------------------------------------------------------------------------- /data/SWTPF_counts.csv: -------------------------------------------------------------------------------- 1 | 997 0 2 | 1135 1 3 | 1039 2 4 | 1011 3 5 | 980 4 6 | 880 5 7 | 961 6 8 | 1022 7 9 | 969 8 10 | 1006 9 11 | -------------------------------------------------------------------------------- /data/original/README.md: -------------------------------------------------------------------------------- 1 | Put the provided data files here.
They should be named: ``test-images.gz, train-images.gz, train-labels.gz`` 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.10.2 2 | scipy==0.16.1 3 | jupyter==1.0.0 4 | scikit-learn==0.17 5 | matplotlib==1.5.0 6 | seaborn==0.6.0 7 | -------------------------------------------------------------------------------- /data/SWTPF-leaderboard-scores.csv: -------------------------------------------------------------------------------- 1 | 0.99081633 0 2 | 0.99030837 1 3 | 0.9689922481 2 4 | 0.9653465347 3 5 | 0.9735234216 4 6 | 0.9674887892 5 7 | 0.9791231733 6 8 | 0.96692607 7 9 | 0.9599589322 8 10 | 0.9554013875 9 11 | -------------------------------------------------------------------------------- /requirements_dev_sklearn.txt: -------------------------------------------------------------------------------- 1 | numpy==1.10.2 2 | scipy==0.16.1 3 | jupyter==1.0.0 4 | -e git@github.com:scikit-learn/scikit-learn.git@7cfa55452609c717c96b4c267466c80cc4038845 5 | matplotlib==1.5.0 6 | seaborn==0.6 7 | cython==0.23 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Josh Montague 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python 2 | 3 | ### data + misc ### 4 | *.gz 5 | *.swp 6 | *.npy 7 | *.npy.z 8 | *.submission 9 | *.pdf 10 | *.pkl 11 | *.log 12 | *.out 13 | 14 | 15 | ### Python ### 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # IPython checkpoints 22 | .ipynb_checkpoints 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | tmp-venv/ 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *,cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | -------------------------------------------------------------------------------- /images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import fileinput 3 | import struct 4 | 5 | # read the labels from binary file 6 | label_names = ["train-labels.gz"] 7 | g = fileinput.FileInput(label_names, openhook=fileinput.hook_compressed) 8 | # grab the first chunk of data for header info 9 | x = g.next() 10 | head = [] 11 | for i in range(2): 12 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 13 | magic, n_labels = head 14 | print "magic={}\nlabels={}".format(*head) 15 | # start reading the lables 16 | # unsigned binary ints - 1 byte each 17 | labels = [] 18 | j = 8 # byte index on current chunk 19 | while len(labels) < n_labels: 20 | try: 21 | val = struct.unpack("B", x[j])[0] 22 | except IndexError: 23 | # read a new chuck from file 24 | x = g.next() 25 | j = 0 26 | val = struct.unpack("B", x[j])[0] 27 | labels.append(val) 28 | j += 1 29 | 30 | # read images from binary file 31 | infile_names = ["train-images.gz"] 32 | f = fileinput.FileInput(infile_names, openhook=fileinput.hook_compressed) 33 | x = f.next() 34 | head = [] 35 | for i in range(4): 36 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 37 | magic, n_images, rows, columns = head 38 | print "magic={}\nimages={}\nrows={}\ncols={}".format(*head) 39 | j = 16 # index in current chunk 40 | for i in range(n_images): 41 | for r in range(rows): 42 | for c in range(columns): 43 | try: 44 | val = struct.unpack("B", x[j])[0] 45 | except IndexError: 46 | # need to read a new chunck of data from finle 47 | x = f.next() 48 | j = 0 49 | val = struct.unpack("B", x[j])[0] 50 | ################################## 51 | # simple image plots using screen text layout 52 | # 3 levels of grey 53 | if val > 170: 54 | print "#", 55 | elif val > 85: 56 | print ".", 57 | else: 58 | print " ", 59 | ################################## 60 | j += 1 61 | print "row={:2}, j={:4}".format(r,j) 62 | print "image={}, label={}".format(i, labels[i]) 63 | 64 | 65 | -------------------------------------------------------------------------------- /launch-processes.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # This script is used to launch a set of experiments or processes 5 | # 6 | 7 | #################### config ############################# 8 | # range of experiments to run 9 | SEQUENCE=`seq 1 3` 10 | #SEQUENCE=42 11 | 12 | # are we doing a cv split ("experiment")? (vs. train and predict on test) 13 | EXPERIMENT=true 14 | 15 | # server (ubuntu)? (vs. laptop) 16 | SERVER=true 17 | 18 | # expanded dataset? (vs. 
as-given) 19 | EXPANDED=false 20 | 21 | # stagger the workload (seconds) 22 | SLEEPTIME=10 23 | # 60 = 1 min 24 | # 300 = 5 min 25 | # 600 = 10 min 26 | # 1200 = 20 min 27 | ########################################################### 28 | 29 | ## variables 30 | # log dir 31 | LOGDIR=log 32 | # virtualenv 33 | #VENV=/home/jmontague/CCC-venv 34 | VENV=tmp-venv 35 | #VENV=dev-tmp-venv 36 | # Python script 37 | PY_SCRIPT=build-model.py 38 | 39 | # activate the appropriate virtualenv 40 | source ${VENV}/bin/activate 41 | 42 | # set some defaults 43 | ARGS="--verbose" 44 | NICE="nice -n10" 45 | 46 | 47 | # remember bash boolean conditionals are confusing; just 48 | # use convenient strings 49 | # http://stackoverflow.com/a/21210966/1851811 50 | if [ "${EXPERIMENT}" = true ]; then 51 | # be polite 52 | NICE="nice -n15" 53 | else 54 | # be slightly less polite 55 | NICE="nice -n5" 56 | ARGS="${ARGS} --submission" 57 | fi 58 | 59 | if [ "${SERVER}" = true ]; then 60 | ARGS="${ARGS} --ubuntu" 61 | fi 62 | 63 | if [ "${EXPANDED}" = true ]; then 64 | ARGS="${ARGS} --expanded" 65 | fi 66 | 67 | 68 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- started running $0" 69 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- using python interpreter: $(which python)" 70 | 71 | # name all of these processes similarly 72 | filedate="$(date +%Y-%m-%dT%H:%M:%S)" 73 | 74 | for i in ${SEQUENCE}; do 75 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- launching experiment ${i} with ${PY_SCRIPT} and ARGS=${ARGS}" 76 | # launch the appropriate process 77 | # - run this bash script w/ nohup & all of the python procs will inherit it 78 | ${NICE} python ${PY_SCRIPT} expt_${i} ${ARGS} > ${LOGDIR}/${filedate}_expt_${i}.log & 79 | # note this will also sleep after the last process 80 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- sleeping for ${SLEEPTIME} seconds" 81 | sleep ${SLEEPTIME} 82 | done 83 | 84 | 85 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- deactivating virtualenv" 86 | deactivate 87 | 88 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- interpreter now set to: $(which python)" 89 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- finished launching experiments" 90 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Josh Montague, 2015-12 2 | # MIT License 3 | 4 | # locations 5 | BASEDIR=$(PWD) 6 | DATADIR=$(BASEDIR)/data 7 | SAVEDIR=$(BASEDIR)/saved_models 8 | SUBDIR=$(BASEDIR)/submissions 9 | LOGDIR=$(BASEDIR)/log 10 | 11 | 12 | # Python 13 | BASE_PY=python2 14 | # rename virtualenv if desired 15 | VENV=tmp-venv 16 | DEVVENV=dev-tmp-venv 17 | # virtualenv-specific locations 18 | VBIN=$(BASEDIR)/$(VENV)/bin 19 | 20 | # code 21 | #CONVERT=convert-binary-data.py 22 | #EXPAND=expand-np-arrays.py 23 | 24 | 25 | # datetime 26 | DATE := $(shell date +'%Y-%m-%dT%H:%M:%S') 27 | #DATE := $(shell date +'%Y-%m-%d') 28 | TIME := $(shell date +'%H:%M:%S') 29 | 30 | help: 31 | @echo 'Makefile for reproducible analysis ' 32 | @echo ' ' 33 | 34 | 35 | # run everything in the setup 36 | demo: $(SAVEDIR)/knn_cv-split_*.pdf 37 | 38 | 39 | # example experiment 40 | $(SAVEDIR)/knn_cv-split_*.pdf: $(DATADIR)/train-images.npy 41 | @echo 42 | @echo 'Sample experiments will now run for ~45 seconds. The corresponding ' 43 | @echo ' log file swill be available in log/ afterward.' 44 | @echo 45 | @echo 'When complete, cross-validation confusion matrices will open' 46 | @echo ' automatically.' 
47 | nohup nice bash launch-processes.bash > $(LOGDIR)/$(DATE)_sample-log.nohup.out 48 | open $(SAVEDIR)/*.pdf 49 | 50 | 51 | # binary data ==> npy arrays 52 | $(DATADIR)/train-images.npy: $(VBIN)/activate $(DATADIR)/original/train-images.gz 53 | . $(VENV)/bin/activate; \ 54 | python convert-binary-data.py 55 | 56 | 57 | # local environment 58 | $(VBIN)/activate: requirements.txt 59 | virtualenv -p $(BASE_PY) $(VENV) 60 | . $(VENV)/bin/activate ; \ 61 | pip install -r $< 62 | touch $(VENV)/bin/activate 63 | 64 | 65 | # download binary data from web 66 | $(DATADIR)/original/train-images.gz: 67 | curl http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz -o data/original/train-images.gz & 68 | curl http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz -o data/original/train-labels.gz & 69 | curl http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz -o data/original/test-images.gz 70 | 71 | 72 | # additional training data 73 | expanded: $(DATADIR)/train-images.npy 74 | test -e data/expanded-train-images.npy || \ 75 | . $(VENV)/bin/activate ; \ 76 | python expand-np-arrays.py 77 | touch $< 78 | 79 | 80 | # !!! delete all generated npy arrays !!! 81 | clean: 82 | [ ! -d data ] || rm data/*.npy 83 | 84 | 85 | # build env with dev version of sklearn 86 | skl-dev-env: requirements_dev_sklearn.txt 87 | virtualenv -p $(BASE_PY) $(DEVVENV) 88 | . $(DEVVENV)/bin/activate ; \ 89 | pip install -r $< 90 | touch $(DEVVENV)/bin/activate 91 | 92 | 93 | 94 | .PHONY: clean expanded all 95 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | __author__="Josh Montague" 3 | __license__="MIT License" 4 | 5 | # 6 | # This module defines a number of helper functions. 7 | # 8 | 9 | from datetime import datetime 10 | import logging 11 | import numpy as np 12 | import os 13 | import sys 14 | 15 | 16 | # set up a logger 17 | util_logr = logging.getLogger(__name__) 18 | util_logr.setLevel(logging.DEBUG) 19 | util_sh = logging.StreamHandler(stream=sys.stdout) 20 | formatter = logging.Formatter('%(asctime)s : %(name)s : %(levelname)s : %(message)s') 21 | util_sh.setFormatter(formatter) 22 | util_logr.addHandler(util_sh) 23 | 24 | 25 | def short_name(model): 26 | """Return a simplified name for this model. A bit brittle.""" 27 | # for a single model, this will work 28 | name = model.__class__.__name__ 29 | try: 30 | if hasattr(model, 'steps'): 31 | # pipeline 32 | name = '-'.join( [ pair[0] for pair in model.steps ] ) 33 | elif hasattr(model, 'best_estimator_'): 34 | if hasattr(model.estimator, 'steps'): 35 | # gridsearchcv 36 | name = 'gscv_' + '-'.join( [x[0] for x in model.estimator.steps ]) 37 | elif hasattr(model.estimator, 'estimators'): 38 | # votingclassifier 39 | name = 'gscv_vc_' + '-'.join( [x[0] for x in model.estimator.estimators ]) 40 | elif hasattr(model, 'base_estimator_'): 41 | # bagging 42 | name = 'bag_' + short_name(model.base_estimator) 43 | except AttributeError, e: 44 | util_logr.info('utils.short_name() couldnt generate quality name') 45 | # for a single model, this will work 46 | name = model.__class__.__name__ 47 | util_logr.info('falling back to generic name={}'.format(name)) 48 | return name 49 | 50 | 51 | def create_submission(predictions, sub_name, comment=None, team='DrJ'): 52 | """Include the specified array of image predictions in a 53 | properly-formatted submission file. 
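    The file starts with a short header (a generated-file banner, the optional comment, the team name, a UTC timestamp, and the submission name), followed by one predicted label per line.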
54 | """ 55 | now = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') 56 | submission_name = '-'.join(sub_name.split()) 57 | with open('submissions/{}_{}.submission'.format( 58 | now, submission_name), 'w') as f: 59 | 60 | f.write('#'*20 + ' Generated submission file\n') 61 | if comment is not None: 62 | f.write('# ' + comment + '\n') 63 | f.write('{}\n'.format(team)) 64 | f.write('{}\n'.format(now)) 65 | f.write('{}\n'.format(sub_name)) 66 | for p in predictions: 67 | f.write('{}\n'.format(p)) 68 | return True 69 | 70 | 71 | def load_np_arrays(files='original'): 72 | """ 73 | Return numpy arrays for training dataset, training labels, 74 | and test dataset (in that order). If files='original', 75 | return the image data, as given. If files='expanded', 76 | return the perturbed image files (~5 times larger). 77 | 78 | files='original', 'expanded' 79 | """ 80 | # nb: path assumes that we call this function from project root 81 | train_imgs_f = 'train-images.npy' 82 | train_labels_f = 'train-labels.npy' 83 | 84 | if files == 'expanded': 85 | train_imgs_f = 'expanded-' + train_imgs_f 86 | train_labels_f = 'expanded-' + train_labels_f 87 | 88 | X_train = np.load(os.path.join('data', train_imgs_f)) 89 | y_train = np.load(os.path.join('data', train_labels_f)) 90 | X_test = np.load(os.path.join('data', 'test-images.npy')) 91 | 92 | return (X_train, y_train, X_test) 93 | 94 | 95 | -------------------------------------------------------------------------------- /convert-binary-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # modified from images.py (included), by Scott Hendrickson 5 | # 6 | 7 | import fileinput 8 | import logging 9 | import numpy as np 10 | import struct 11 | import sys 12 | 13 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 14 | #level=logging.DEBUG, 15 | level=logging.INFO, 16 | stream=sys.stdout 17 | ) 18 | 19 | 20 | logging.info('Beginning data conversion') 21 | 22 | # display ascii versions of training data 23 | PLOT = False 24 | logging.info('ASCII plotting enabled? 
{}'.format(PLOT)) 25 | 26 | 27 | ######### 28 | # Get the training labels 29 | ######### 30 | logging.info("reading training labels") 31 | label_names = ["data/original/train-labels.gz"] 32 | g = fileinput.FileInput(label_names, openhook=fileinput.hook_compressed) 33 | # grab the first chunk of data for header info 34 | logging.info(" reading header") 35 | x = g.next() 36 | head = [] 37 | for i in range(2): 38 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 39 | magic, n_labels = head 40 | logging.info(" magic={}, labels={}".format(*head)) 41 | 42 | # unsigned binary ints - 1 byte each 43 | logging.info(" reading data") 44 | labels = [] 45 | j = 8 # byte index on current chunk 46 | while len(labels) < n_labels: 47 | try: 48 | val = struct.unpack("B", x[j])[0] 49 | except IndexError: 50 | # read a new chuck from file 51 | x = g.next() 52 | j = 0 53 | val = struct.unpack("B", x[j])[0] 54 | labels.append(val) 55 | j += 1 56 | logging.debug("observed labels: {}".format(labels)) 57 | 58 | label_array = np.array(labels) 59 | logging.debug(" label_array type: {} (length: {})".format(type(label_array), len(label_array))) 60 | 61 | logging.info("writing numpy label array to disk") 62 | with open('data/train-labels.npy', 'wb') as f: 63 | np.save(f, label_array) 64 | 65 | 66 | ################################ 67 | 68 | datasets = ( ('train-images', 'data/original/train-images.gz'), 69 | ('test-images', 'data/original/test-images.gz') ) 70 | 71 | for dataset in datasets: 72 | data_name, data_file = dataset 73 | logging.info('reading dataset={}, from file={}'.format(data_name, data_file)) 74 | f = fileinput.FileInput([data_file], openhook=fileinput.hook_compressed) 75 | x = f.next() 76 | # start with the relevant header data 77 | head = [] 78 | logging.info(" reading header") 79 | for i in range(4): 80 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 81 | magic, n_images, rows, columns = head 82 | logging.info(" magic={}, images={}, rows={}, cols={}".format(*head)) 83 | 84 | # now we know the shape of the data, so we can allocate an array 85 | data_array = np.zeros((n_images, rows*columns), dtype=int) 86 | 87 | # onto the main file data 88 | logging.info(" reading data") 89 | j = 16 # index in current chunk 90 | for i in range(n_images): 91 | # keep track of all values for this sample (image) 92 | sample_i_values = [] 93 | for r in range(rows): 94 | # keep appending to sample array all the way through 95 | # the rows and cols of sample i 96 | for c in range(columns): 97 | try: 98 | val = struct.unpack("B", x[j])[0] 99 | except IndexError: 100 | # need to read a new chunck of data from finle 101 | x = f.next() 102 | j = 0 103 | val = struct.unpack("B", x[j])[0] 104 | if PLOT: 105 | ################################## 106 | # simple image plots using screen text layout 107 | # 3 levels of grey 108 | if val > 170: 109 | print "#", 110 | elif val > 85: 111 | print ".", 112 | else: 113 | print " ", 114 | ################################## 115 | # append this value to the sample row 116 | sample_i_values.append(val) 117 | j += 1 118 | if PLOT: 119 | print "row={:2}, j={:4}".format(r,j) 120 | if PLOT and data_name is 'train-images': 121 | # there are no labels for the test dataset 122 | print "image={}, label={}".format(i, labels[i]) 123 | # visually verify that our numeric values are similar to the ascii art 124 | logging.debug("sample_i_values (len={}): {}".format(len(sample_i_values), sample_i_values)) 125 | 126 | # update the row in our cumulative array that corresponds to this sample (image) 127 | 
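        # (sample_i_values now holds rows*columns pixel values in row-major order -- 784 for the 28x28 MNIST images)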
data_array[i] = np.array(sample_i_values) 128 | 129 | # after the for loop, save for later use (more transparently, with .npy arrays) 130 | logging.info("writing {} to disk as numpy array".format(data_name)) 131 | with open('data/{}.npy'.format(data_name), 'wb') as f: 132 | np.save(f, data_array) 133 | 134 | -------------------------------------------------------------------------------- /expand-np-arrays.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | """ 7 | Adapted from: 8 | https://github.com/mnielsen/neural-networks-and-deep-learning 9 | """ 10 | 11 | import argparse 12 | import gc 13 | import logging 14 | import numpy as np 15 | import random 16 | import sys 17 | 18 | import utils 19 | 20 | 21 | # this will be helpful for displaying arrays 22 | np.set_printoptions(linewidth=200) 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("-v", "--verbose", action="store_true", 26 | help="increase output verbosity") 27 | args = parser.parse_args() 28 | 29 | # use a simple logger - get the level from the cmd line 30 | loglevel = logging.DEBUG if args.verbose else logging.INFO 31 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 32 | stream=sys.stdout, level=loglevel) 33 | logging.debug('logging enabled - preparing for work') 34 | 35 | 36 | logging.info('reading original input data from disk') 37 | try: 38 | # nb: we don't need X_test right now 39 | X_train_full, y_train_full, X_test = utils.load_np_arrays() 40 | logging.debug('observed data dimensions: {}, {}. {}'.format( 41 | X_train_full.shape, y_train_full.shape, X_test.shape)) 42 | except IOError, e: 43 | # let it crash, but give some insight in the log 44 | logging.warn('Error reading data from files (do they exist yet?)') 45 | logging.warn('Error message={}'.format(e)) 46 | raise e 47 | 48 | # define the perturbations 49 | perturb_list = [ 50 | # displacement, axis, index position, index 51 | (1, 0, "first", 0), 52 | (-1, 0, "first", 27), 53 | (1, 1, "last", 0), 54 | (-1, 1, "last", 27)] 55 | 56 | # a container for our new data set 57 | expanded_data = [] 58 | 59 | # loop over all images in training set 60 | for i, (img, label) in enumerate(zip(X_train_full, y_train_full)): 61 | if i % 100 == 0: 62 | logging.info('perturbing image number {}, current length of expanded_data={}'.format(i, len(expanded_data))) 63 | # add the original array to the new array 64 | expanded_data.append((img, label)) 65 | # reshape back into square for roll() 66 | img = np.reshape(img, (28,28)) 67 | # perturb in each direction 68 | for d, ax, position, idx in perturb_list: 69 | #logging.debug('current perturb_list={}'.format([d,ax,position,idx])) 70 | # shift pixels in this image by d along ax 71 | perturbed_img = np.roll(img, d, ax) 72 | # in case pixels get shifted across the edge boundaries, 73 | # we can just set the corresponding edge to 0s (first 74 | # reshape the array so we can slice efficiently) 75 | #perturbed_img = np.reshape(perturbed_img, (28,28)) 76 | if position == "first": 77 | # first row/column 78 | perturbed_img[idx, :] = np.zeros(28) 79 | else: 80 | # last row/column 81 | perturbed_img[:, idx] = np.zeros(28) 82 | # add new (flattened) image and label to the expanded list 83 | expanded_data.append( (np.reshape(perturbed_img, 784), label) ) 84 | # e_d ~ [(np.arr, int), ... 
] 85 | 86 | logging.debug('current pertub_list={}'.format([d,ax,position,idx])) 87 | logging.debug('current label={}'.format(label)) 88 | logging.debug('original image array=\n{}'.format(img)) 89 | logging.debug('shifted data array=\n{}'.format(perturbed_img)) 90 | 91 | 92 | # shuffle to avoid bias in array positions 93 | logging.info('shuffling expanded data set') 94 | random.shuffle(expanded_data) 95 | logging.debug('expanded_data=\n{}'.format(expanded_data)) 96 | 97 | # e_d is a list of (img-array, label) tuples 98 | # - zip(*e_d) pairs the elements of each img-array 99 | logging.info('converting expanded data to list of numpy arrays') 100 | expanded_data_array_list = [np.array(x) for x in zip(*expanded_data)] 101 | logging.info('length of expanded array list={}'.format(len(expanded_data_array_list))) 102 | 103 | # extract the labels from the last column of the array 104 | #y_expanded = expanded_data_array[:,-1] 105 | # and the data from everything *but* the last column 106 | #X_expanded = expanded_data_array[:,:-1] 107 | X_expanded = expanded_data_array_list[0] 108 | y_expanded = expanded_data_array_list[1] 109 | 110 | logging.debug('X_expanded (length={})=\n{}'.format(len(X_expanded), X_expanded)) 111 | logging.debug('y_expanded (length={})=\n{}'.format(len(y_expanded), y_expanded)) 112 | 113 | # verify 114 | n=5 115 | for i in range(n): 116 | logging.debug('expanded image {} (label={}):\n{}'.format(i, y_expanded[i], np.reshape(X_expanded[i,:], (28,28)))) 117 | 118 | 119 | # after the for loop, save for later use (more transparently, with .npy arrays) 120 | for name, dataset in [('expanded-train-images', X_expanded) , ('expanded-train-labels', y_expanded)]: 121 | logging.info("writing {} to disk as numpy array".format(name)) 122 | with open('data/{}.npy'.format(name), 'wb') as f: 123 | np.save(f, dataset) 124 | 125 | logging.info('done expanding data') 126 | 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNIST + ``scikit-learn`` = :star2: 2 | 3 | This code was developed for a intra-team Kaggle-like modeling competition on the canonical [MNIST handwritten digits dataset](https://en.wikipedia.org/wiki/MNIST_database). For more narrative on the approach and process, [you can read this article.](http://joshmontague.com/posts/2016/mnist-scikit-learn/) 4 | 5 | We were given the training images and labels, the test images, and a simple Python script that read (and "displayed") the binary image data (also included here: ``images.py``). We had two weeks to submit any number of prediction files for the test images, one prediction per line. My highest accuracy model ([#42](https://github.com/jrmontag/mnist-sklearn/blob/master/models.py#L419)) scored 98.18% with no feature engineering. The same model with a minor amount of feature engineering ("added samples" [via image translation](https://github.com/jrmontag/mnist-sklearn/blob/master/expand-np-arrays.py)) scored a 98.68% and was my highest score. 
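The gist of that "added samples" step (implemented in ``expand-np-arrays.py``) is four one-pixel translations of each 28x28 training image, with the row or column that wraps around the edge zeroed out. A minimal sketch of the idea (not the exact code from the script; ``shift_image`` is just an illustrative name):

```python
import numpy as np

def shift_image(flat_img, d, axis):
    """Translate a flattened 28x28 image by d pixels along the given axis."""
    img = flat_img.reshape(28, 28)
    shifted = np.roll(img, d, axis=axis)
    # np.roll wraps pixels around the edge; blank out the wrapped row/column
    if axis == 0:
        shifted[0 if d > 0 else -1, :] = 0
    else:
        shifted[:, 0 if d > 0 else -1] = 0
    return shifted.reshape(784)

# each training image keeps its original copy plus four shifted copies
shifts = [(1, 0), (-1, 0), (1, 1), (-1, 1)]  # (displacement, axis)
```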
6 | 7 | ![Model #42 confusion matrix (original data)](data/expt_42.jpg "Model #42 confusion matrix (original data)") 8 | 9 | The rules were pretty simple: 10 | 11 | - write code in whatever language you want 12 | - use whatever platform you want (laptop, EC2, tea leaves, stone tablet) 13 | - be prepared to present what you did to the team (in particular, this meant if you cheated and just downloaded the test labels, you were in for A Bad Time) 14 | 15 | Given the small size of the data and how much I :heart: ``scikit-learn``, I chose to use Python on a single server (a combination of a MacBook and AWS EC2). All of the data processing and modeling code here is written in Python, though there are a couple of additional bash scripts to facilitate various pieces of the workflow. I think most of this code should work out of the box with Python 2.7 on OS X and Ubuntu (an isolated environment is recommended - the requirements.txt I used is included). 16 | 17 | **BUT WAIT!** I can do you one better: I included the entire setup that I used within this repo. If all goes as intended, the following commands will get you up and running\*. 18 | 19 | ```bash 20 | $ git clone 21 | $ cd mnist-sklearn 22 | $ make demo 23 | ``` 24 | 25 | \*the prerequisites for using this code are having Python 2.7 (yeah, I know it's old), ``make``, and ``virtualenv`` installed. Your machine likely has ``make`` installed already. If needed, you can ``sudo pip install virtualenv``. 26 | 27 | ## What's happening here? 28 | 29 | The ``make demo`` command will do the following things: 30 | 31 | - use ``virtualenv`` to create an isolated Python environment in this directory (and install all the necessary libraries) 32 | - download the raw binary data from [Yann LeCun's website](http://yann.lecun.com/exdb/mnist/) (about 10 MB) 33 | - convert those binary files to ``numpy`` arrays and write them to disk (about 400 MB) 34 | - start a set of sample model runs 35 | - display the confusion matrices from the best cross-validation model 36 | 37 | The one-time environment setup may take a few minutes. The three sample models are staggered by the ``bash`` script and should be done in about 45 seconds. You can look at the data in ``log/`` to see what's going on, and look in ``saved_models/`` to see both the serialized models and per-model confusion matrices. 38 | 39 | ## What should I do next? 40 | 41 | For more standard usage, the designed approach is to add new Pipelines to ``models.py`` (with accompanying descriptions and names, used in file-nameing conventions). Then, update the ``SEQUENCE`` variable in ``launch-processes.bash`` - either using a single value, or a range via ``seq``. Each "experiment" (as the ``expt_*`` convention was intended), will create a new log file and all logs from a single use of ``launch-process.bash`` will share a timestamp for ease of separating your trials. 42 | 43 | Since some of the models can take minutes to hours to run, the recommended syntax is something like: 44 | 45 | ```bash 46 | $ nohup bash launch-processes.bash > log/2016-01-12_expt-4-12.nohup.log & 47 | ``` 48 | 49 | This will let you disconnect from the session while things are still running, and also log (in the nohup log) any unexpected exceptions that crash your code. 50 | 51 | Given the relatively small size of data, most of these models seem to be CPU bound. For optimal iteration time (and fun of watching ``htop``), a high-CPU-count server is the best approach. Go ask AWS for something from the C3 or C4 family of EC2 instances. 
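To make the "add new Pipelines to ``models.py``" step above concrete: each entry of the module's ``experiment_dict`` is keyed by ``expt_*`` and carries ``note``, ``name``, and ``pl`` fields. A hypothetical new entry might look like the following (the experiment number and the particular pipeline are only examples; ``Pipeline``, ``StandardScaler``, and ``LogisticRegression`` are already imported at the top of ``models.py``):

```python
# hypothetical addition to experiment_dict in models.py
'expt_99': {
    'note': 'scaled logistic regression with stronger regularization',
    'name': 'scaled logreg (C=0.1)',
    'pl': Pipeline([ ('scaling', StandardScaler()),
                     ('log-reg', LogisticRegression(C=0.1, n_jobs=-1)) ])
},
```

With that in place, set ``SEQUENCE=99`` (or a range via ``seq``) in ``launch-processes.bash`` and launch it as described above.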
52 | 53 | As you dig in further, review the Notes section below for some important details. 54 | 55 | Questions? [Let me know](https://www.twitter.com/jrmontag)! Otherwise, have fun classifying! 56 | 57 | ----- 58 | 59 | ## Notes 60 | 61 | - I'd recommend using an isolated environment; I used ``virtualenv`` and the Makefile will help you set that up 62 | - for reasons unclear to me, when ``pip install``ing these requirements on Ubuntu, ``pip`` threw a ``Failed building wheel`` error for ``numpy, scipy,`` and ``scikit-learn``. It appeared to then recover and be happy by building them via ``setup.py``. ``¯\_(ツ)_/¯`` 63 | - As of the time of writing, the development branch of ``sklearn`` is required to use the ``MLPClassifier`` (Multi-layer perceptron). See the additional notes below for instructions on setting this up. 64 | - This code was developed on both OS X and Ubuntu, which can lead to inconsistencies in library behavior. At least once, I had to modify the ``matplotlib`` "backend" in the corresponding Python environment ``matplotlibrc`` file. In the end, I was using ``macosx`` and ``agg`` (on OS X and Ubuntu, respectively). 65 | - The expanded data set (small, linear translations of the original data) used to obtain the top score is generated by a separate ``make`` command (``make expanded``), and is ~2 GB. 66 | - As they get more complicated, models can take from order seconds (default k-nearest neighbors) to order hours (Multi-layer Perceptron). If using shared resources (or, anecdotally, to increase the efficiency of running multiple processes), I recommend increasing the ``SLEEPTIME`` variable in the launch script to something like 1-5 minutes. 67 | 68 | 69 | ### Installing the development branch of ``sklearn`` 70 | 71 | I sort of figured this out by trial and error (on Ubuntu, didn't test on OS X), so there may be a better way to do it. Nevertheless, here's what I did that worked: 72 | 73 | - If you want to build a separate virtualenv from the one with stable ``sklearn`` (this is, after all, part of the point of virtualenv), do the following: 74 | 75 | ```bash 76 | # install similar libraries as before, plus cython 77 | $ make skl-dev-env 78 | # install sklearn from the git commit I used 79 | $ source dev-tmp-venv/bin/activate 80 | $ pip install git+https://github.com/scikit-learn/scikit-learn.git@7cfa55452609c717c96b4c267466c80cc4038845 81 | ``` 82 | 83 | - Or, if you want to use the virtualenv that you've already built, you can: 84 | 85 | ```bash 86 | # replace sklearn + install cython 87 | $ source dev-tmp-venv/bin/activate 88 | $ pip uninstall sklearn 89 | $ pip install cython 90 | $ pip install git+https://github.com/scikit-learn/scikit-learn.git@7cfa55452609c717c96b4c267466c80cc4038845 91 | ``` 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # CCC notes, 2015 edition 2 | 3 | These are the notes that I was keeping while working through the development of this code. For posterity's sake, I decided to keep them attached to the repo. Enjoy the walk through my thought process!
4 | 5 | ----------- 6 | 7 | ## kick-off meeting 8 | 9 | - review project definition 10 | - submission format (see "random" example, too): 11 | 12 | 13 | 14 | 15 | 16 | 17 | ## outline 18 | 19 | - [x] read some scikit examples 20 | - [x] copy the given images code into lab, create numpy arrays 21 | - because of some differences in fileInput in the 3.4 interpreter, reverting back to 2.7 22 | - able to run images.py as given 23 | - work on getting the arrays out into np arrays => write to file for easy import later 24 | - note: takes ~30s to read original data into np array 25 | - [x] make first prediction file (interactively) 26 | - add test data to the existing image.py script 27 | - use dummy classifier 28 | - [x] move binary => numpy creation into Makefile 29 | - still in notebook 30 | - other formats? 31 | - joblib (compressed) [link](https://pythonhosted.org/joblib/persistence.html) 32 | - HDF5 (groups) [link](http://docs.h5py.org/en/latest/quick.html#appendix-creating-a-file) 33 | - [this (old) post](https://robertdragan.wordpress.com/2012/08/31/comparying-various-methods-for-saving-and-loading-numpy-arrays/) shows that joblib/hdf5/numpy are all about the same in performance. joblib looks to win, slightly, and is recommended in the scikit-learn docs. so, use that one. 34 | - this seems to pertain more to the models that carry large arrays within them. 35 | - stick to .npy arrays from original data, and clean up that code 36 | - [x] improve virtualenv setup 37 | - get virtualenv incorporated with existing conversion script 38 | - can now ``make everything`` from scratch! 39 | - [x] use basic SVM in notebook (memory of this being good baseline for MNIST) 40 | - [x] move utils code out of notebook 41 | - data reading, submissions creation 42 | - fix import paths for bin/ 43 | - [x] build diagnostics (to save with each model) 44 | - scoring (accuracy + stdev) 45 | - ``cross_val_predict`` + confusion matrix 46 | - [x] per-experiment executables 47 | - create python module that defines the list of models, steps 48 | - use eg ``experiment-1.py`` to read that in and execute from bash 49 | - include saving model, logging, saving confusion matrix 50 | - [x] next round of experiments 51 | - loop over: scaling v. no scaling X every default classifier 52 | - summarize results (``$ cat log/*.log | grep "+/-" | cut -d"=" -f2,5- | sort -nr``): 53 | - k-NN (96.61%) 54 | - scaled rbf SVM (95.76%) 55 | - scaled RF (93.67%) 56 | - scaled k-NN (93.65%) 57 | - RF (93.59%) 58 | - train the top two (kNN [expt4] + scaled rbf SVM [expt7]) on all training data => predict + submission 59 | - make another python utility that takes the experiment pipeline (from models.py) and creates a submission file 60 | - the kNN is pretty fast, but the rbf SVM takes a while train/predict 61 | - also ran the wrong model (expt6); killed and running expt7 now 62 | - submitted kNN (Mac & Kelly) 63 | - submitted scaled rbf SVM (compact popcorn machine) 64 | - didn't include multi-class LR (scaled and not) in initial experiment -- do that now 65 | - not stellar, ~90% 66 | - [x] gridsearch (first level) the best performers from ^ 67 | - gridsearchcv the top three performers above (kNN, scaled rbf SVM, scaled RF) 68 | - need to find reasonable params for each model 69 | - update any code that relies on on pipeline (instead of gscv) 70 | - best gridsearch kNN had ~95%, lower than with default settings 71 | - killed and restarted a few times with attempts to boost efficiency 72 | - these are expts 24-26. 
best results: 73 | - kNN ~ 95% ({'knn__n_neighbors': 3}) 74 | - default k=10 (was 96.6%) 75 | - SVM ~ 93% ({'rbf_svm__C': 10, 'rbf_svm__gamma': 0.0001, 'rbf_svm__class_weight': 'balanced'} [tie w/ c_w=None]) 76 | - default C=1, gamma='auto' (1/n_features ~ 0.008), classweight=None 77 | - RF ~ 94% ({'random_forest__n_estimators': 100, 'random_forest__max_features': 'auto'}) 78 | - default n_estimators=10, max_features='auto' 79 | - [x] gridsearch (second level) 80 | - focus ranges around blend of "default" and last GS best performance 81 | - these are expts 27-29; check logs for best 82 | - kNN: ~95%. k=3-5, all similar, weight='distance'; {'knn__weights': 'distance', 'knn__n_neighbors': 4} 83 | - SVM: ~95%. C=10, gamma=0.001/auto, similar; {'rbf_svm__C': 10, 'rbf_svm__gamma': 'auto', 'rbf_svm__class_weight': 'balanced'} 84 | - RF: ~95%. {'random_forest__n_estimators': 500, 'random_forest__max_features': 'auto'} 85 | - once these are fit, submit one or more 86 | - train RF on everything (expt 32), submit results 87 | - [x] intermission to clean up repo & file structure 88 | - [x] didn't try ExtraTrees classifier earlier -- try this now 89 | - 94%, slightly higher with scaling (still 94.x%) 90 | - [x] read about [ensemble methods](http://scikit-learn.org/stable/modules/ensemble.html#ensemble-methods) 91 | - averaging 92 | - Bagging == use this one each of 3 best 93 | - boosting 94 | - AdaBoost ==> use large adaboost classifier with each of 3 best classifiers 95 | - supposedly better with e.g. shallow trees, maybe our gridsearch'd models are opposed to this? 96 | - VotingClassifer 97 | 98 | 99 | - [x] set up baggingclassifier with each of the three best as base 100 | - if the 'pl' is BaggingClassifier(Pipeline()), need to update e.g. utils.name() anything in run-experiment.py? 101 | - seems like the baggingclassifier params could also be gridsearched 102 | - running all in parallel seems too much for # of cores... increase stagger 103 | - bagging kNN didn't finish. 104 | - run this one again (expt_35) 105 | - ~96.8%, improvement over single RF 106 | - bagging SVM (36) ~ 96.3% - improvement over the single svm 107 | - bagging RF (37) ~ 96.3% - improvement over single RF 108 | - submit each of these trained on full dataset 109 | - [x] set up adaboost with best RF 110 | - must have class weights and proper attrs in estimator (SVC doesn't, RF does, kNN doesn't) 111 | - try 100 estimators (expt 38) 112 | - very fast. 
~ 96.5% (slightly better than bagging) 113 | - submit 114 | - [x] gridsearch votingclassifier on top of the 3 gs'd classifiers to find best 'vote' type 115 | - then fit, train, predict, submit that one 116 | - debug: JoblibAttributeError 117 | - seems related to probability estimates in SVC; add probability=True to constructor (doesn't appear to effect other performance) 118 | - 0.962 (+/-0.010) for {'voting': 'soft'} 119 | - was expecting a better result by "averaging" 120 | - trail full model with voting=soft & submit 121 | - [x] gridsearch VC for 'vote' w/ 3x bagged or adaboost 122 | - 3x bagged: 0.960 (+/-0.009) for {'voting': 'soft'} 123 | - 2x bagged + adaboost RF: 0.961 (+/-0.010) for {'voting': 'soft'} 124 | - was expecting a better result by "averaging" 125 | - train full model on 3x bagging w/ voting=soft & submit (43) 126 | - train full model on 2x bagging + RF boosting w/ voting=soft & submit (44) 127 | - [x] use ``class_weights`` in RF & SVM models to reverse-engineered values from scoreboard 128 | - look at submission for Small Wooded Treatment Plant Fence (expt-32) and count up the predictions 129 | 130 | ```bash 131 | $ tail -n+6 submissions/2015-12-24T18:16:42_Small-Wooded-Treatment-Plant-Fence.submission | sort | uniq -c | sort -n | sed 's/^ *//' | sort -t" " -k2,2 > data/SWTPF_counts.csv 132 | 997 0 133 | 1135 1 134 | 1039 2 135 | 1011 3 136 | 980 4 137 | 880 5 138 | 961 6 139 | 1022 7 140 | 969 8 141 | 1006 9 142 | ``` 143 | 144 | - now look at per-count accuracy from scoreboard (data/SWTPF-leaderboard-scores.csv) 145 | 146 | ```bash 147 | 0.99081633 0 148 | 0.99030837 1 149 | 0.9689922481 2 150 | 0.9653465347 3 151 | 0.9735234216 4 152 | 0.9674887892 5 153 | 0.9791231733 6 154 | 0.96692607 7 155 | 0.9599589322 8 156 | 0.9554013875 9 157 | ``` 158 | - and now we can combine them to get the actual count of digits in the leaderboard test set (if we round) 159 | 160 | ```bash 161 | $ join SWTPF_counts.csv SWTPF-leaderboard-scores.csv -1 2 -2 2 | awk 'BEGIN { sum = 0 } { printf "%d %d\n", $1, $2/$3; sum+=$2/$3 } END { printf "\n%d \n", sum }' 162 | 0 1006 163 | 1 1146 164 | 2 1072 165 | 3 1047 166 | 4 1006 167 | 5 909 168 | 6 981 169 | 7 1056 170 | 8 1009 171 | 9 1052 172 | 173 | 10288 174 | 175 | $ join SWTPF_counts.csv SWTPF-leaderboard-scores.csv -1 2 -2 2 | awk 'BEGIN { sum = 10288 } { printf "%d %1.3f\n", $1, $2/$3/sum }' 176 | 0 0.098 177 | 1 0.111 178 | 2 0.104 179 | 3 0.102 180 | 4 0.098 181 | 5 0.088 182 | 6 0.095 183 | 7 0.103 184 | 8 0.098 185 | 9 0.102 186 | ``` 187 | 188 | - in principal, we can now use the relative prevalence of these to weight the classes in eg the SVC model (want the weights to sum to one) 189 | - both SVC and RF support passing the class weights, repurpose the best-performing versions of those 190 | - RF: expt 32 (scalded RF) performed best on leaderboard (97.2%) 191 | - reuse with weights (45) => ~96.6%, decent 192 | - leaderboard score => 97.1% 193 | - SVM: expt 36 (bagged, scaled, gs'd SVM) performed best on leaderboard (97.1%) 194 | - reuse with weights (46) => ~95.6%, decent 195 | - leaderboard score => 96.4% 196 | - submit these as stand-alone models [running now] 197 | - relaunched them because they weren't named (overwrite log files) 198 | - sent 199 | - neither were much higher than the original; don't bother updating VotingClassifier 200 | - [x] sklearn's built-in NN (MLPClassifier) 201 | - big gridsearch 202 | - dang. 
MLPC only in dev version of scikit 203 | - see if we can create a local virtualenv for that 204 | - looks ok, running as expt_47 205 | 206 | ```bash 207 | jmontague@data-science-3:~ 208 | $ virtualenv -p python ~/CCC-venv 209 | 210 | jmontague@data-science-3:~/2015-12-21_CCC [master+*] 211 | $ source ~/CCC-venv/bin/activate 212 | 213 | jmontague@data-science-3:~ 214 | $ pip install -r requirements.txt 215 | $ pip uninstall scikit-learn 216 | 217 | (CCC-venv)jmontague@data-science-3:~/2015-12-21_CCC [master+*] 218 | $ pip install -e git+git@github.com:scikit-learn/scikit-learn.git 219 | 220 | (CCC-venv)jmontague@data-science-3:~/2015-12-21_CCC [master+*] 221 | $ pip install cython 222 | 223 | (CCC-venv)jmontague@data-science-3:~/CCC-venv/lib/python2.7/site-packages/scikit-learn [master] 224 | $ python setup.py build_ext --inplace 225 | $ python 226 | >>> import sklearn; sklearn.__version__ 227 | '0.18.dev0' 228 | 229 | # but didn't build/install totally correctly, maybe ran setup.py in wrong place? 230 | # - in virtualenv, get sklearn ImportError 231 | # - resolve by either (in the launch-process.bash script): 232 | $ export PYTHONPATH=~/CCC-venv/lib/python2.7/site-packages/scikit-learn:$PYTHONPATH 233 | # or: 234 | jmontague@data-science-3:~/CCC-venv/lib/python2.7/site-packages 235 | $ ln -s scikit-learn/sklearn sklearn 236 | ``` 237 | 238 | - ran gridsearch on 47 - worked, high of ~94.8% 239 | 240 | - [x] try again with updated grid based on scores 241 | - best alpha was on edge of grid - run again to extend on larger end 242 | - also didn't think to add extra layers - add that, too: [(50,), (100,), (200,), (50,50), (100,100), (200,200), (50,50,50), (100,100,100), (200,200,200)] 243 | - drop 'sgd' algorithm 244 | - stick to 'relu' activation 245 | - best score (from 48) ~ 95.6% 246 | - extra layer, alpha at edge of grid again 247 | - look through this more & make next round of GS: 248 | ``cat log/2015-12-29T03:43:29_expt_48.log | grep for | sort -n -t" " -k6,6`` 249 | 250 | - [x] run with larger range of layer sizes and other params 251 | - took ~4 hrs for GSCV 252 | - best model ~95.8%: {'mlp__hidden_layer_sizes': (1000, 1000), 'mlp__algorithm': 'l-bfgs', 'mlp__alpha': 10.0} 253 | - convert best to train for submission (52 - note: out of order bc of earlier long run times 254 | 255 | - [x] test other SVM kernels (in particular, poly w/ gs on degree) 256 | - running as expt_50 257 | - best ~95.1%, {'svm__degree': 2, 'svm__C': 15.0} 258 | - not noticibly better than rbf kernel (which I think should do better with high-dimensional data) 259 | - [x] combine dimensionality reduction with kNN 260 | - t-SNE to 2-5 dimensions, then kNN (expt 51) 261 | - doesn't work because TSNE doesn't have a transform method 262 | 263 | - [x] expand training data with perturbations 264 | - then train this data on all of the simplest algorithms 265 | - new train data ~ 1 GB 266 | - [x] test this with default models (expt 27, 28, 29), compare scores 267 | - need to flag run-experiment.py to read the proper dataset 268 | - these are slower to train, need to increase stagger time (killed 28 and 29 to let 27 run - restart them once 27 finishes) 269 | - 27 running for >1 hr 270 | - also much more ram (~30 GB for expt_27) 271 | - all 3 were ~95% in first round (w/ the smaller dataset) 272 | - kNN (96.1%): {'knn__weights': 'distance', 'knn__n_neighbors': 4} (which is the same as expt_30 273 | - ^ from the gscv, killed before fully finished 274 | - SVM (): 275 | - RF (): 276 | - in the interest of trying to get 
predictions, I think I'll just run a full fit and predict on the gridsearch'd voting classifier (basic three party system, 42) 277 | - running now ("expanded 42"). note: original fit to data took 1.5 hrs, and this data is 5x bigger. 278 | - took 16 hours! 279 | - submitted (98.6% on hold-out set) 280 | 281 | 282 | 283 | ## other ideas 284 | 285 | - scikit-neuralnetwork 286 | - tpot 287 | - sklearn-deap 288 | - nolearn 289 | - tensorflow 290 | 291 | 292 | 293 | ## Future work? 294 | 295 | - don't instantiate estimators in ``models.py``, wait until they're used in main script 296 | - move matrix plotting into utils module (?) 297 | - make utils.short_name less fragile 298 | - build funcs to read and display example images 299 | - look at feature importance 300 | - http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances_faces.html#example-ensemble-plot-forest-importances-faces-py 301 | - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/) 302 | - ex: in voting classifier, does per-digit accuracy vary by model? 303 | 304 | 305 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | # this module defines models and pipelines for import into 7 | # individual experiment runs 8 | 9 | import logging 10 | import numpy as np 11 | import sys 12 | 13 | # set up a logger, at least for the ImportError 14 | model_logr = logging.getLogger(__name__) 15 | model_logr.setLevel(logging.DEBUG) 16 | model_sh = logging.StreamHandler(stream=sys.stdout) 17 | formatter = logging.Formatter('%(asctime)s : %(name)s : %(levelname)s : %(message)s') 18 | model_sh.setFormatter(formatter) 19 | model_logr.addHandler(model_sh) 20 | 21 | 22 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 23 | from sklearn.dummy import DummyClassifier 24 | from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, VotingClassifier 25 | from sklearn.grid_search import GridSearchCV 26 | from sklearn.linear_model import SGDClassifier, LogisticRegression 27 | from sklearn.manifold import TSNE 28 | from sklearn.naive_bayes import GaussianNB, MultinomialNB 29 | from sklearn.neighbors import KNeighborsClassifier 30 | try: 31 | from sklearn.neural_network import MLPClassifier 32 | except ImportError, e: 33 | model_logr.info('couldnt import sklearn.neural_network') 34 | model_logr.info('... 
as of the time of writing, this requires a build of the dev release (see README)') 35 | from sklearn.preprocessing import StandardScaler 36 | from sklearn.svm import SVC 37 | from sklearn.tree import DecisionTreeClassifier 38 | 39 | from sklearn.pipeline import Pipeline 40 | 41 | 42 | experiment_dict = \ 43 | { 44 | # Note: keys are of the form expt_*, which are used to execute the 45 | # associated values of 'pl' keys 46 | # 47 | # experiments to build pipeline ################################################ 48 | 'expt_1': { 49 | 'note': 'random guessing (maintains class distributions)', 50 | 'name': 'Crash Test Dummies', 51 | 'pl': Pipeline([ ('dummy_clf', DummyClassifier()) ]) 52 | }, 53 | 'expt_2': { 54 | 'note': 'vanilla linear svm (heard it through the grapevine)', 55 | 'name': 'Grapevine', 56 | 'pl': Pipeline([ ('linear_svm', SGDClassifier(n_jobs=-1)) ]) 57 | }, 58 | 'expt_3': { 59 | 'note': 'add scaling prior to SVM (you must be this tall to ride)', 60 | 'name': 'This tall to ride', 61 | 'pl': Pipeline([ ('scaling', StandardScaler()), 62 | ('linear_svm', SGDClassifier(n_jobs=-1)) ]) 63 | }, 64 | # systematic check of default classifiers + scaling ################################ 65 | 'expt_4': { 66 | 'note': 'vanilla knn (mac and kelly from 2014 "neighbors"', 67 | 'name': 'Mac and Kelly', 68 | 'pl': Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1)) ]) 69 | }, 70 | 'expt_5': { 71 | 'note': 'scaled knn', 72 | 'name': 'scaled knn', 73 | 'pl': Pipeline([ ('scaling', StandardScaler()), 74 | ('knn', KNeighborsClassifier(n_jobs=-1)) ]) 75 | }, 76 | 'expt_6': { 77 | 'note': 'rbf kernel SVM', 78 | 'name': 'rbf kernel SVM', 79 | 'pl': Pipeline([ ('rbf-svm', SVC(kernel='rbf')) ]) 80 | }, 81 | 'expt_7': { 82 | 'note': 'scaled rbf kernel SVM', 83 | 'name': 'Portable popcorn machine', 84 | 'pl': Pipeline([ ('scaling', StandardScaler()), 85 | ('rbf-svm', SVC(kernel='rbf', cache_size=1000)) ]) 86 | }, 87 | 'expt_8': { 88 | 'note': 'default decision tree', 89 | 'name': 'default decision tree', 90 | 'pl': Pipeline([ ('decision-tree', DecisionTreeClassifier()) ]) 91 | }, 92 | 'expt_9': { 93 | 'note': 'scaled default decision tree', 94 | 'name': 'scaled default decision tree', 95 | 'pl': Pipeline([ ('scaling', StandardScaler()), 96 | ('decision-tree', DecisionTreeClassifier()) ]) 97 | }, 98 | 'expt_10': { 99 | 'note': 'default RF', 100 | 'name': 'default RF', 101 | 'pl': Pipeline([ ('random-forest', RandomForestClassifier()) ]) 102 | }, 103 | 'expt_11': { 104 | 'note': 'scaled default RF', 105 | 'name': 'scaled default RF', 106 | 'pl': Pipeline([ ('scaling', StandardScaler()), 107 | ('random-forest', RandomForestClassifier()) ]) 108 | }, 109 | 'expt_12': { 110 | 'note': 'default adaboost', 111 | 'name': 'default adaboost', 112 | 'pl': Pipeline([ ('DT-adaboost', AdaBoostClassifier()) ]) 113 | }, 114 | 'expt_13': { 115 | 'note': 'scaled default adaboost', 116 | 'name': 'scaled default adaboost', 117 | 'pl': Pipeline([ ('scaling', StandardScaler()), 118 | ('DT-adaboost', AdaBoostClassifier()) ]) 119 | }, 120 | 'expt_14': { 121 | 'note': 'default Gaussian NB', 122 | 'name': 'default Gaussian NB', 123 | 'pl': Pipeline([ ('gaussian-nb', GaussianNB()) ]) 124 | }, 125 | 'expt_15': { 126 | 'note': 'scaled Gaussian NB', 127 | 'name': 'scaled Gaussian NB', 128 | 'pl': Pipeline([ ('scaling', StandardScaler()), 129 | ('gaussian-nb', GaussianNB()) ]) 130 | }, 131 | 'expt_16': { 132 | 'note': 'default Multinomial NB', 133 | 'name': 'default Multinomial NB', 134 | 'pl': Pipeline([ ('multi-nb', MultinomialNB()) ]) 
135 | }, 136 | 'expt_17': { 137 | 'note': 'scaled Multinomial NB', 138 | 'name': 'scaled Multinomial NB', 139 | 'pl': Pipeline([ ('scaling', StandardScaler()), 140 | ('multi-nb', MultinomialNB()) ]) 141 | }, 142 | 'expt_18': { 143 | 'note': 'default LDA', 144 | 'name': 'default LDA', 145 | 'pl': Pipeline([ ('linear-da', LinearDiscriminantAnalysis()) ]) 146 | }, 147 | 'expt_19': { 148 | 'note': 'scaled LDA', 149 | 'name': 'scaled LDA', 150 | 'pl': Pipeline([ ('scaling', StandardScaler()), 151 | ('linear-da', LinearDiscriminantAnalysis()) ]) 152 | }, 153 | 'expt_20': { 154 | 'note': 'default QDA', 155 | 'name': 'default QDA', 156 | 'pl': Pipeline([ ('Quadratic-da', QuadraticDiscriminantAnalysis()) ]) 157 | }, 158 | 'expt_21': { 159 | 'note': 'scaled QDA', 160 | 'name': 'scaled QDA', 161 | 'pl': Pipeline([ ('scaling', StandardScaler()), 162 | ('Quadratic-da', QuadraticDiscriminantAnalysis()) ]) 163 | }, 164 | 'expt_22': { 165 | 'note': 'default (multi-class) Logistic regression', 166 | 'name': 'default (multi-class) Logistic regression', 167 | 'pl': Pipeline([ ('log-reg', LogisticRegression(n_jobs=-1)) ]) 168 | }, 169 | 'expt_23': { 170 | 'note': 'scaled default (multi-class) Logistic regression', 171 | 'name': 'scaled default (multi-class) Logistic regression', 172 | 'pl': Pipeline([ ('scaling', StandardScaler()), 173 | ('log-reg', LogisticRegression(n_jobs=-1)) ]) 174 | }, 175 | # gridsearch cv the best performers from above ################################ 176 | # - kNN 177 | 'expt_24': { 178 | 'note': 'gridsearch cv on kNN', 179 | 'name': 'gridsearch cv on kNN', 180 | 'pl': GridSearchCV( Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1)) ]), 181 | param_grid=dict(knn__n_neighbors=[3,12,20]), 182 | n_jobs=-1 ) 183 | }, 184 | # - scaled rbf SVM 185 | 'expt_25': { 186 | 'note': 'gridsearch cv on scaled rbf svm', 187 | 'name': 'gridsearch cv on scaled rbf svm', 188 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 189 | ('rbf_svm', SVC(kernel='rbf', cache_size=1000)) ]), 190 | param_grid=dict(rbf_svm__C=[0.1,1.0,10], 191 | rbf_svm__gamma=[0.0001,0.01,0.1], 192 | rbf_svm__class_weight=[None, 'balanced']), 193 | n_jobs=-1) 194 | }, 195 | # - scaled RF 196 | 'expt_26': { 197 | 'note': 'gridsearch cv on scaled default RF', 198 | 'name': 'gridsearch cv on scaled default RF', 199 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 200 | ('random_forest', RandomForestClassifier(n_jobs=-1)) ]), 201 | param_grid=dict(random_forest__n_estimators=[3,50,100], 202 | random_forest__max_features=[10,100,'auto']), 203 | n_jobs=-1) 204 | }, 205 | # narrower gridsearch on three models above #################################### 206 | # - kNN 207 | 'expt_27': { 208 | 'note': 'focused gridsearch cv on kNN', 209 | 'name': 'focused gridsearch cv on kNN', 210 | 'pl': GridSearchCV( Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1)) ]), 211 | param_grid=dict(knn__n_neighbors=range(2,12), 212 | knn__weights=['distance','uniform']), 213 | n_jobs=-1 ) 214 | }, 215 | # - scaled rbf SVM 216 | 'expt_28': { 217 | 'note': 'focussed gridsearch cv on scaled rbf svm', 218 | 'name': 'focussed gridsearch cv on scaled rbf svm', 219 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 220 | ('rbf_svm', SVC(kernel='rbf', cache_size=2000)) ]), 221 | param_grid=dict(rbf_svm__C=[1,2,5,10], 222 | rbf_svm__gamma=[0.001,0.005,0.01,'auto'], 223 | rbf_svm__class_weight=[None, 'balanced']), 224 | n_jobs=-1) 225 | }, 226 | # - scaled RF 227 | 'expt_29': { 228 | 'note': 'focussed gridsearch cv on scaled 
default RF', 229 | 'name': 'focussed gridsearch cv on scaled default RF', 230 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 231 | ('random_forest', RandomForestClassifier(n_jobs=-1)) ]), 232 | param_grid=dict(random_forest__n_estimators=[10,100,500,1000], 233 | random_forest__max_features=[10,20,30,'auto']), 234 | n_jobs=-1) 235 | }, 236 | # best results of gridsearch'd models above #################################### 237 | # - best kNN 238 | 'expt_30': { 239 | 'note': 'best gridsearch result for kNN', 240 | 'name': 'Neighborhood Treatment Plant Fence', 241 | 'pl': Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 242 | weights='distance', 243 | n_neighbors=4)) ]) 244 | }, 245 | # - best scaled rbf SVM 246 | 'expt_31': { 247 | 'note': 'best gridsearch result for scaled rbf svm', 248 | 'name': 'Small Popcorn Treatment Plant Fence', 249 | 'pl': Pipeline([ ('scaling', StandardScaler()), 250 | ('rbf_svm', SVC(kernel='rbf', 251 | cache_size=2000, 252 | C=10.0, 253 | gamma='auto', 254 | class_weight='balanced')) ]) 255 | }, 256 | # - best scaled RF 257 | 'expt_32': { 258 | 'note': 'best gridsearch result for scaled RF', 259 | 'name': 'Small Wooded Treatment Plant Fence', 260 | 'pl': Pipeline([ ('scaling', StandardScaler()), 261 | ('random_forest', RandomForestClassifier(n_jobs=-1, 262 | n_estimators=500, 263 | max_features='auto')) ]) 264 | }, 265 | # ensemble decision tree classifer that didn't get run earlier #################################### 266 | 'expt_33': { 267 | 'note': 'ExtraTrees', 268 | 'name': 'ExtraTrees', 269 | 'pl': Pipeline([ ('extra-trees', ExtraTreesClassifier(n_jobs=-1)) ]) 270 | }, 271 | 'expt_34': { 272 | 'note': 'scaled default ExtraTrees', 273 | 'name': 'scaled default ExtraTrees', 274 | 'pl': Pipeline([ ('scaling', StandardScaler()), ('extra-trees', ExtraTreesClassifier(n_jobs=-1)) ]) 275 | }, 276 | # bagging versions of three best classifiers ################################## 277 | # - kNN 278 | 'expt_35': { 279 | 'note': 'bagging on best gridsearched kNN estimator', 280 | 'name': 'Sack of Flanders', 281 | 'pl': BaggingClassifier( 282 | Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 283 | weights='distance', 284 | n_neighbors=4)) ]), 285 | n_jobs=-1, 286 | n_estimators=10) 287 | 288 | }, 289 | # - best scaled rbf SVM 290 | 'expt_36': { 291 | 'note': 'bagging on best gridsearch scaled rbf svm', 292 | 'name': 'Sack of small popcorn', 293 | 'pl': BaggingClassifier( 294 | Pipeline([ ('scaling', StandardScaler()), 295 | ('rbf_svm', SVC(kernel='rbf', 296 | cache_size=2000, 297 | C=10.0, 298 | gamma='auto', 299 | class_weight='balanced')) ]), 300 | n_jobs=-1, 301 | n_estimators=10) 302 | }, 303 | # - best scaled RF 304 | 'expt_37': { 305 | 'note': 'bagging on best gridsearch result for scaled RF', 306 | 'name': 'Sack of small shrubs', 307 | 'pl': BaggingClassifier( 308 | Pipeline([ ('scaling', StandardScaler()), 309 | ('random_forest', RandomForestClassifier(n_jobs=-1, 310 | n_estimators=500, 311 | max_features='auto')) ]), 312 | n_jobs=-1, 313 | n_estimators=10) 314 | }, 315 | # adaboost with best RF (must supports class weights) ##################### 316 | # - best scaled RF 317 | 'expt_38': { 318 | 'note': 'adaboost on best gridsearch result for scaled RF', 319 | 'name': 'On the shoulders of Ents', 320 | 'pl': Pipeline([ ('scaling', StandardScaler()), 321 | ('adaboost_random_forest', AdaBoostClassifier( 322 | RandomForestClassifier(n_jobs=-1, 323 | n_estimators=500, 324 | max_features='auto'), 325 | n_estimators=100)) ]) 326 | }, 327 | # ensemble 
voting ################################################ 328 | # - gridsearch voting w/ best three stand-alone models 329 | 'expt_39': { 330 | 'note': 'gs over voting across best gs models', 331 | 'name': 'gs over voting across best gs models', 332 | 'pl': GridSearchCV( 333 | VotingClassifier( estimators=[ 334 | ('gs_knn', Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 335 | weights='distance', 336 | n_neighbors=4)) ])), 337 | ('gs_svm', Pipeline([ ('scaling', StandardScaler()), 338 | ('rbf_svm', SVC(kernel='rbf', 339 | probability=True, 340 | cache_size=2000, 341 | C=10.0, 342 | gamma='auto', 343 | class_weight='balanced')) ])), 344 | ('gs_rf', Pipeline([ ('scaling', StandardScaler()), 345 | ('random_forest', RandomForestClassifier(n_jobs=-1, 346 | n_estimators=500, 347 | max_features='auto')) ])) ]), 348 | param_grid=dict(voting=['hard','soft']), 349 | n_jobs=-1) 350 | }, 351 | # - gridsearch voting w/ bagged combos 352 | 'expt_40': { 353 | 'note': 'gs over voting across bagged best gs models', 354 | 'name': 'gs over voting across bagged best gs models', 355 | 'pl': GridSearchCV( 356 | VotingClassifier( estimators=[ 357 | ('bag_knn', BaggingClassifier( 358 | KNeighborsClassifier(n_jobs=-1, 359 | weights='distance', 360 | n_neighbors=4), 361 | n_jobs=-1, 362 | n_estimators=10)), 363 | ('bag_svm', BaggingClassifier( 364 | Pipeline([ ('scaling', StandardScaler()), 365 | ('rbf_svm', SVC(kernel='rbf', 366 | probability=True, 367 | cache_size=2000, 368 | C=10.0, 369 | gamma='auto', 370 | class_weight='balanced')) ]), 371 | n_jobs=-1, 372 | n_estimators=10)), 373 | ('bag_rf', BaggingClassifier( 374 | Pipeline([ ('scaling', StandardScaler()), 375 | ('random_forest', RandomForestClassifier(n_jobs=-1, 376 | n_estimators=500, 377 | max_features='auto')) ]), 378 | n_jobs=-1, 379 | n_estimators=10))]), 380 | param_grid=dict(voting=['hard','soft']), 381 | n_jobs=-1) 382 | }, 383 | # - gridsearch voting w/ bagged + boosted rf 384 | 'expt_41': { 385 | 'note': 'gs over voting across bagged + boosted best gs models', 386 | 'name': 'gs over voting across bagged + boosted best gs models', 387 | 'pl': GridSearchCV( 388 | VotingClassifier( estimators=[ 389 | ('bag_knn', BaggingClassifier( 390 | KNeighborsClassifier(n_jobs=-1, 391 | weights='distance', 392 | n_neighbors=4), 393 | n_jobs=-1, 394 | n_estimators=10)), 395 | ('bag_svm', BaggingClassifier( 396 | Pipeline([ ('scaling', StandardScaler()), 397 | ('rbf_svm', SVC(kernel='rbf', 398 | probability=True, 399 | cache_size=2000, 400 | C=10.0, 401 | gamma='auto', 402 | class_weight='balanced')) ]), 403 | n_jobs=-1, 404 | n_estimators=10)), 405 | 406 | ('boost_rf', Pipeline([ ('scaling', StandardScaler()), 407 | ('adaboost_random_forest', AdaBoostClassifier( 408 | RandomForestClassifier(n_jobs=-1, 409 | n_estimators=500, 410 | max_features='auto'), 411 | n_estimators=100)) ])) ]), 412 | 413 | 414 | param_grid=dict(voting=['hard','soft']), 415 | n_jobs=-1) 416 | }, 417 | # - fix vote=soft for 39-40 (41?) 
& train on full data ############################# 418 | # - (expt 39 w/o gs + soft vote) 419 | 'expt_42': { 420 | # "3-party system" trained this model on the original data 421 | #'name': 'Basic three-party system', 422 | #'note': 'soft voting with best gs models', 423 | # "E Pluribus Unum" trained this model on the expanded data 424 | 'name': 'E pluribus unum', 425 | 'note': 'soft voting with best gs models on expanded dataset', 426 | 'pl': VotingClassifier( estimators=[ 427 | ('gs_knn', Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 428 | weights='distance', 429 | n_neighbors=4)) ])), 430 | ('gs_svm', Pipeline([ ('scaling', StandardScaler()), 431 | ('rbf_svm', SVC(kernel='rbf', 432 | probability=True, 433 | cache_size=2000, 434 | C=10.0, 435 | gamma='auto', 436 | class_weight='balanced')) ])), 437 | ('gs_rf', Pipeline([ ('scaling', StandardScaler()), 438 | ('random_forest', RandomForestClassifier(n_jobs=-1, 439 | n_estimators=500, 440 | max_features='auto')) ])) ], 441 | voting='soft') 442 | 443 | }, 444 | # - (expt 40 w/o gs + soft vote) 445 | 'expt_43': { 446 | 'note': 'soft voting with bagged gs models', 447 | 'name': 'PACs and the three-party system', 448 | 'pl': VotingClassifier( estimators=[ 449 | ('bag_knn', BaggingClassifier( 450 | KNeighborsClassifier(n_jobs=-1, 451 | weights='distance', 452 | n_neighbors=4), 453 | n_jobs=-1, 454 | n_estimators=10)), 455 | ('bag_svm', BaggingClassifier( 456 | Pipeline([ ('scaling', StandardScaler()), 457 | ('rbf_svm', SVC(kernel='rbf', 458 | probability=True, 459 | cache_size=2000, 460 | C=10.0, 461 | gamma='auto', 462 | class_weight='balanced')) ]), 463 | n_jobs=-1, 464 | n_estimators=10)), 465 | ('bag_rf', BaggingClassifier( 466 | Pipeline([ ('scaling', StandardScaler()), 467 | ('random_forest', RandomForestClassifier(n_jobs=-1, 468 | n_estimators=500, 469 | max_features='auto')) ]), 470 | n_jobs=-1, 471 | n_estimators=10))], 472 | voting='soft') 473 | }, 474 | # - (expt 41 w/o gs + soft vote) 475 | 'expt_44': { 476 | 'note': 'voting classifier: 2x bags + boosted RF w/ soft voting', 477 | 'name': 'SuperPACs ruin everything', 478 | 'pl': VotingClassifier( estimators=[ 479 | ('bag_knn', BaggingClassifier( 480 | KNeighborsClassifier(n_jobs=-1, 481 | weights='distance', 482 | n_neighbors=4), 483 | n_jobs=-1, 484 | n_estimators=10)), 485 | ('bag_svm', BaggingClassifier( 486 | Pipeline([ ('scaling', StandardScaler()), 487 | ('rbf_svm', SVC(kernel='rbf', 488 | probability=True, 489 | cache_size=2000, 490 | C=10.0, 491 | gamma='auto', 492 | class_weight='balanced')) ]), 493 | n_jobs=-1, 494 | n_estimators=10)), 495 | ('boost_rf', Pipeline([ ('scaling', StandardScaler()), 496 | ('adaboost_random_forest', AdaBoostClassifier( 497 | RandomForestClassifier(n_jobs=-1, 498 | n_estimators=500, 499 | max_features='auto'), 500 | n_estimators=100)) ])) ], 501 | voting='soft') 502 | }, 503 | # Include inferred class distributions in best stand-alone models of SVM, RF ################## 504 | 'expt_45': { 505 | 'note': 'add class weights to expt_32', 506 | 'name': 'Yeah I work out', 507 | 'pl': Pipeline([ ('scaling', StandardScaler()), 508 | ('random_forest', RandomForestClassifier(n_jobs=-1, 509 | n_estimators=500, 510 | max_features='auto', 511 | class_weight = {0:0.098, 512 | 1:0.111, 513 | 2:0.104, 514 | 3:0.102, 515 | 4:0.098, 516 | 5:0.088, 517 | 6:0.095, 518 | 7:0.103, 519 | 8:0.098, 520 | 9:0.102})) ]) 521 | }, 522 | 'expt_46': { 523 | 'note': 'add class weights to expt_36', 524 | 'name': 'Oh you work out?', 525 | 'pl': BaggingClassifier( 526 | 
Pipeline([ ('scaling', StandardScaler()), 527 | ('rbf_svm', SVC(kernel='rbf', 528 | cache_size=2000, 529 | C=10.0, 530 | gamma='auto', 531 | class_weight = {0:0.098, 532 | 1:0.111, 533 | 2:0.104, 534 | 3:0.102, 535 | 4:0.098, 536 | 5:0.088, 537 | 6:0.095, 538 | 7:0.103, 539 | 8:0.098, 540 | 9:0.102})) ]), 541 | n_jobs=-1, 542 | n_estimators=10) 543 | }, 544 | # 545 | # As of the time of writing, using the MLPClassifier requires building the 546 | # developer branch of sklearn. If you want to use these experiments, 547 | # the sklearn docs include a ref for building this version: 548 | # http://scikit-learn.org/stable/developers/contributing.html#git-repo 549 | # Then, you can uncomment the next few experiments below (+ 52) to run them. 550 | # 551 | # neural network experiments ################################################ 552 | # - sklearn's MLPClassifier 553 | # 'expt_47': { 554 | # 'note': 'gridsearch multilayer perceptron, using tips from dev docs', 555 | # 'name': 'tbd', 556 | # 'pl': GridSearchCV( 557 | # Pipeline([ ('scaling', StandardScaler()), 558 | # ('mlp', MLPClassifier()) ]), 559 | # param_grid=dict( mlp__alpha=10.0**-np.arange(1, 7), 560 | # mlp__hidden_layer_sizes=[(50, ), (100, ), (200, )], 561 | # mlp__activation=['logistic', 'tanh', 'relu'], 562 | # mlp__algorithm=['l-bfgs', 'sgd', 'adam']), 563 | # n_jobs=-1) 564 | # }, 565 | # # - v2 of sklearn's MLPClassifier 566 | # 'expt_48': { 567 | # 'note': 'v2 of gridsearch multilayer perceptron, modifying param_grid', 568 | # 'name': 'tbd', 569 | # 'pl': GridSearchCV( 570 | # Pipeline([ ('scaling', StandardScaler()), 571 | # ('mlp', MLPClassifier(activation='relu')) ]), 572 | # param_grid=dict( mlp__alpha=10.0**-np.arange(-1,6), 573 | # mlp__hidden_layer_sizes=[(50,), 574 | # (100,), 575 | # (200,), 576 | # (50,50), 577 | # (100,100), 578 | # (200,200), 579 | # (50,50,50), 580 | # (100,100,100), 581 | # (200,200,200)], 582 | # mlp__algorithm=['l-bfgs', 'adam']), 583 | # n_jobs=-1) 584 | # }, 585 | # # - gridsearch wide MLP hidden layers 586 | # 'expt_49': { 587 | # 'note': 'v3 of gridsearch multilayer perceptron, modifying param_grid', 588 | # 'name': 'tbd', 589 | # 'pl': GridSearchCV( 590 | # Pipeline([ ('scaling', StandardScaler()), 591 | # ('mlp', MLPClassifier(activation='relu', verbose=True)) ]), 592 | # param_grid=dict( mlp__alpha=10.0**-np.arange(-2,5), 593 | # mlp__hidden_layer_sizes=[(200,), 594 | # (500,), 595 | # (1000,), 596 | # (200,200), 597 | # (500,500), 598 | # (1000,1000), 599 | # (500,500,500)], 600 | # mlp__algorithm=['l-bfgs', 'adam']), 601 | # n_jobs=-1) 602 | # }, 603 | # revisit SVM with poly kernel gridsearch ################################################## 604 | 'expt_50': { 605 | 'note': 'gridsearch poly kernel degree with scaled svm', 606 | 'name': 'gridsearch poly kernel degree with scaled svm', 607 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 608 | ('svm', SVC(cache_size=2000, 609 | kernel='poly', 610 | gamma='auto')) ]), 611 | param_grid=dict(svm__C=[0.1, 0.5, 1.0, 5.0, 10.0, 15.0], 612 | svm__degree=np.arange(2,12)), 613 | n_jobs=-1) 614 | }, 615 | # # dimensionality reduction + kNN ###################################################### 616 | # # note: this doesn't work because TSNE doesn't implement a transform method. Pipeline throws 617 | # # an error on import about this, so leave this commented out. 
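# If a dimensionality-reduction step is still wanted at this point, one option
# (a rough sketch only, not one of the experiments that was actually run) is PCA,
# which does implement a transform method and so can sit inside a Pipeline, e.g.
#     Pipeline([ ('pca', PCA(n_components=50)),
#                ('knn', KNeighborsClassifier(n_jobs=-1)) ])
# with `from sklearn.decomposition import PCA` added to the imports above.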
618 | # 'expt_51': { 619 | # 'note': 'gridsearch over tSNE dim reduction + kNN', 620 | # 'name': 'gridsearch over tSNE dim reduction + kNN', 621 | # 'pl': GridSearchCV( Pipeline([ 622 | # ('tsne', TSNE(verbose=1)), 623 | # ('knn', KNeighborsClassifier(n_jobs=-1)) ]), 624 | # param_grid=dict(tsne__n_components=[2,3,4], 625 | # tsne__perplexity=[20,30,40,50], 626 | # tsne__learning_rate=[400,700,1000], 627 | # knn__n_neighbors=range(2,10), 628 | # knn__weights=['distance','uniform']), 629 | # n_jobs=-1 ) 630 | # }, 631 | # best MLP from gridsearch (note: out of order due to run time!) ######################### 632 | # {'mlp__hidden_layer_sizes': (1000, 1000), 'mlp__algorithm': 'l-bfgs', 'mlp__alpha': 10.0} 633 | # 'expt_52': { 634 | # 'note': 'best MLP from gridsearch', 635 | # 'name': 'Pinky and the Brain', 636 | # 'pl': Pipeline([ ('scaling', StandardScaler()), 637 | # ('mlp', MLPClassifier(activation='relu', 638 | # hidden_layer_sizes=(1000,1000), 639 | # algorithm='l-bfgs', 640 | # alpha=10.0, 641 | # verbose=True)) ]) 642 | # }, 643 | 644 | } # end of experiment_dict 645 | 646 | --------------------------------------------------------------------------------
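For reference, below is a minimal sketch of how an entry in experiment_dict might be consumed. The repo's own runner (utils.py / launch-processes.bash) is not shown here, so the data-loading paths are placeholders, not the project's actual helpers:

# minimal usage sketch; the .npy paths are placeholders (see convert-binary-data.py
# and expand-np-arrays.py for how the provided .gz files are actually converted)
import numpy as np
from sklearn.cross_validation import train_test_split   # 0.17-era API (pre model_selection)

from models import experiment_dict

# hypothetical preprocessed arrays
X = np.load('data/train-images.npy')
y = np.load('data/train-labels.npy')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

expt = experiment_dict['expt_7']     # look up an experiment by its expt_* key
pipeline = expt['pl']                # the associated sklearn Pipeline / estimator
pipeline.fit(X_train, y_train)
print('{}: {:.4f}'.format(expt['name'], pipeline.score(X_test, y_test)))

# for the GridSearchCV experiments (expt_24-29, expt_39-41), the fitted object
# also exposes the winning settings via pipeline.best_params_ and pipeline.best_score_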