├── log └── README.md ├── saved_models └── README.md ├── data ├── README.md ├── expt_42.jpg ├── SWTPF_counts.csv ├── original │ └── README.md └── SWTPF-leaderboard-scores.csv ├── submissions └── README.md ├── requirements.txt ├── requirements_dev_sklearn.txt ├── LICENSE ├── .gitignore ├── images.py ├── launch-processes.bash ├── Makefile ├── utils.py ├── convert-binary-data.py ├── expand-np-arrays.py ├── README.md ├── notes.md └── models.py /log/README.md: -------------------------------------------------------------------------------- 1 | this is where we'll write logs 2 | -------------------------------------------------------------------------------- /saved_models/README.md: -------------------------------------------------------------------------------- 1 | this is where we'll save our models 2 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Data and intermediate files go here, but aren't committed. 2 | -------------------------------------------------------------------------------- /submissions/README.md: -------------------------------------------------------------------------------- 1 | This is where we'll write out all submission files. 2 | -------------------------------------------------------------------------------- /data/expt_42.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrmontag/mnist-sklearn/HEAD/data/expt_42.jpg -------------------------------------------------------------------------------- /data/SWTPF_counts.csv: -------------------------------------------------------------------------------- 1 | 997 0 2 | 1135 1 3 | 1039 2 4 | 1011 3 5 | 980 4 6 | 880 5 7 | 961 6 8 | 1022 7 9 | 969 8 10 | 1006 9 11 | -------------------------------------------------------------------------------- /data/original/README.md: -------------------------------------------------------------------------------- 1 | Put the provided data files here.
They should be named: ``test-images.gz, train-images.gz, train-labels.gz`` 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.10.2 2 | scipy==0.16.1 3 | jupyter==1.0.0 4 | scikit-learn==0.17 5 | matplotlib==1.5.0 6 | seaborn==0.6.0 7 | -------------------------------------------------------------------------------- /data/SWTPF-leaderboard-scores.csv: -------------------------------------------------------------------------------- 1 | 0.99081633 0 2 | 0.99030837 1 3 | 0.9689922481 2 4 | 0.9653465347 3 5 | 0.9735234216 4 6 | 0.9674887892 5 7 | 0.9791231733 6 8 | 0.96692607 7 9 | 0.9599589322 8 10 | 0.9554013875 9 11 | -------------------------------------------------------------------------------- /requirements_dev_sklearn.txt: -------------------------------------------------------------------------------- 1 | numpy==1.10.2 2 | scipy==0.16.1 3 | jupyter==1.0.0 4 | -e git@github.com:scikit-learn/scikit-learn.git@7cfa55452609c717c96b4c267466c80cc4038845 5 | matplotlib==1.5.0 6 | seaborn==0.6 7 | cython==0.23 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Josh Montague 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python 2 | 3 | ### data + misc ### 4 | *.gz 5 | *.swp 6 | *.npy 7 | *.npy.z 8 | *.submission 9 | *.pdf 10 | *.pkl 11 | *.log 12 | *.out 13 | 14 | 15 | ### Python ### 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # IPython checkpoints 22 | .ipynb_checkpoints 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | tmp-venv/ 31 | build/ 32 | develop-eggs/ 33 | dist/ 34 | downloads/ 35 | eggs/ 36 | .eggs/ 37 | lib/ 38 | lib64/ 39 | parts/ 40 | sdist/ 41 | var/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *,cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | -------------------------------------------------------------------------------- /images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import fileinput 3 | import struct 4 | 5 | # read the labels from binary file 6 | label_names = ["train-labels.gz"] 7 | g = fileinput.FileInput(label_names, openhook=fileinput.hook_compressed) 8 | # grab the first chunk of data for header info 9 | x = g.next() 10 | head = [] 11 | for i in range(2): 12 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 13 | magic, n_labels = head 14 | print "magic={}\nlabels={}".format(*head) 15 | # start reading the lables 16 | # unsigned binary ints - 1 byte each 17 | labels = [] 18 | j = 8 # byte index on current chunk 19 | while len(labels) < n_labels: 20 | try: 21 | val = struct.unpack("B", x[j])[0] 22 | except IndexError: 23 | # read a new chuck from file 24 | x = g.next() 25 | j = 0 26 | val = struct.unpack("B", x[j])[0] 27 | labels.append(val) 28 | j += 1 29 | 30 | # read images from binary file 31 | infile_names = ["train-images.gz"] 32 | f = fileinput.FileInput(infile_names, openhook=fileinput.hook_compressed) 33 | x = f.next() 34 | head = [] 35 | for i in range(4): 36 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 37 | magic, n_images, rows, columns = head 38 | print "magic={}\nimages={}\nrows={}\ncols={}".format(*head) 39 | j = 16 # index in current chunk 40 | for i in range(n_images): 41 | for r in range(rows): 42 | for c in range(columns): 43 | try: 44 | val = struct.unpack("B", x[j])[0] 45 | except IndexError: 46 | # need to read a new chunck of data from finle 47 | x = f.next() 48 | j = 0 49 | val = struct.unpack("B", x[j])[0] 50 | ################################## 51 | # simple image plots using screen text layout 52 | # 3 levels of grey 53 | if val > 170: 54 | print "#", 55 | elif val > 85: 56 | print ".", 57 | else: 58 | print " ", 59 | ################################## 60 | j += 1 61 | print "row={:2}, j={:4}".format(r,j) 62 | print "image={}, label={}".format(i, labels[i]) 63 | 64 | 65 | -------------------------------------------------------------------------------- /launch-processes.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # This script is used to launch a set of experiments or processes 5 | # 6 | 7 | #################### config ############################# 8 | # range of experiments to run 9 | SEQUENCE=`seq 1 3` 10 | #SEQUENCE=42 11 | 12 | # are we doing a cv split ("experiment")? (vs. train and predict on test) 13 | EXPERIMENT=true 14 | 15 | # server (ubuntu)? (vs. laptop) 16 | SERVER=true 17 | 18 | # expanded dataset? (vs. 
as-given) 19 | EXPANDED=false 20 | 21 | # stagger the workload (seconds) 22 | SLEEPTIME=10 23 | # 60 = 1 min 24 | # 300 = 5 min 25 | # 600 = 10 min 26 | # 1200 = 20 min 27 | ########################################################### 28 | 29 | ## variables 30 | # log dir 31 | LOGDIR=log 32 | # virtualenv 33 | #VENV=/home/jmontague/CCC-venv 34 | VENV=tmp-venv 35 | #VENV=dev-tmp-venv 36 | # Python script 37 | PY_SCRIPT=build-model.py 38 | 39 | # activate the appropriate virtualenv 40 | source ${VENV}/bin/activate 41 | 42 | # set some defaults 43 | ARGS="--verbose" 44 | NICE="nice -n10" 45 | 46 | 47 | # remember bash boolean conditionals are confusing; just 48 | # use convenient strings 49 | # http://stackoverflow.com/a/21210966/1851811 50 | if [ "${EXPERIMENT}" = true ]; then 51 | # be polite 52 | NICE="nice -n15" 53 | else 54 | # be slightly less polite 55 | NICE="nice -n5" 56 | ARGS="${ARGS} --submission" 57 | fi 58 | 59 | if [ "${SERVER}" = true ]; then 60 | ARGS="${ARGS} --ubuntu" 61 | fi 62 | 63 | if [ "${EXPANDED}" = true ]; then 64 | ARGS="${ARGS} --expanded" 65 | fi 66 | 67 | 68 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- started running $0" 69 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- using python interpreter: $(which python)" 70 | 71 | # name all of these processes similarly 72 | filedate="$(date +%Y-%m-%dT%H:%M:%S)" 73 | 74 | for i in ${SEQUENCE}; do 75 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- launching experiment ${i} with ${PY_SCRIPT} and ARGS=${ARGS}" 76 | # launch the appropriate process 77 | # - run this bash script w/ nohup & all of the python procs will inherit it 78 | ${NICE} python ${PY_SCRIPT} expt_${i} ${ARGS} > ${LOGDIR}/${filedate}_expt_${i}.log & 79 | # note this will also sleep after the last process 80 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- sleeping for ${SLEEPTIME} seconds" 81 | sleep ${SLEEPTIME} 82 | done 83 | 84 | 85 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- deactivating virtualenv" 86 | deactivate 87 | 88 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- interpreter now set to: $(which python)" 89 | echo "$(date +%Y-%m-%d\ %H:%M:%S) -- finished launching experiments" 90 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Josh Montague, 2015-12 2 | # MIT License 3 | 4 | # locations 5 | BASEDIR=$(PWD) 6 | DATADIR=$(BASEDIR)/data 7 | SAVEDIR=$(BASEDIR)/saved_models 8 | SUBDIR=$(BASEDIR)/submissions 9 | LOGDIR=$(BASEDIR)/log 10 | 11 | 12 | # Python 13 | BASE_PY=python2 14 | # rename virtualenv if desired 15 | VENV=tmp-venv 16 | DEVVENV=dev-tmp-venv 17 | # virtualenv-specific locations 18 | VBIN=$(BASEDIR)/$(VENV)/bin 19 | 20 | # code 21 | #CONVERT=convert-binary-data.py 22 | #EXPAND=expand-np-arrays.py 23 | 24 | 25 | # datetime 26 | DATE := $(shell date +'%Y-%m-%dT%H:%M:%S') 27 | #DATE := $(shell date +'%Y-%m-%d') 28 | TIME := $(shell date +'%H:%M:%S') 29 | 30 | help: 31 | @echo 'Makefile for reproducible analysis ' 32 | @echo ' ' 33 | 34 | 35 | # run everything in the setup 36 | demo: $(SAVEDIR)/knn_cv-split_*.pdf 37 | 38 | 39 | # example experiment 40 | $(SAVEDIR)/knn_cv-split_*.pdf: $(DATADIR)/train-images.npy 41 | @echo 42 | @echo 'Sample experiments will now run for ~45 seconds. The corresponding ' 43 | @echo ' log file swill be available in log/ afterward.' 44 | @echo 45 | @echo 'When complete, cross-validation confusion matrices will open' 46 | @echo ' automatically.' 
47 | nohup nice bash launch-processes.bash > $(LOGDIR)/$(DATE)_sample-log.nohup.out 48 | open $(SAVEDIR)/*.pdf 49 | 50 | 51 | # binary data ==> npy arrays 52 | $(DATADIR)/train-images.npy: $(VBIN)/activate $(DATADIR)/original/train-images.gz 53 | . $(VENV)/bin/activate; \ 54 | python convert-binary-data.py 55 | 56 | 57 | # local environment 58 | $(VBIN)/activate: requirements.txt 59 | virtualenv -p $(BASE_PY) $(VENV) 60 | . $(VENV)/bin/activate ; \ 61 | pip install -r $< 62 | touch $(VENV)/bin/activate 63 | 64 | 65 | # download binary data from web 66 | $(DATADIR)/original/train-images.gz: 67 | curl http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz -o data/original/train-images.gz & 68 | curl http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz -o data/original/train-labels.gz & 69 | curl http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz -o data/original/test-images.gz 70 | 71 | 72 | # additional training data 73 | expanded: $(DATADIR)/train-images.npy 74 | test -e data/expanded-train-images.npy || \ 75 | . $(VENV)/bin/activate ; \ 76 | python expand-np-arrays.py 77 | touch $< 78 | 79 | 80 | # !!! delete all generated npy arrays !!! 81 | clean: 82 | [ ! -d data ] || rm data/*.npy 83 | 84 | 85 | # build env with dev version of sklearn 86 | skl-dev-env: requirements_dev_sklearn.txt 87 | virtualenv -p $(BASE_PY) $(DEVVENV) 88 | . $(DEVVENV)/bin/activate ; \ 89 | pip install -r $< 90 | touch $(DEVVENV)/bin/activate 91 | 92 | 93 | 94 | .PHONY: clean expanded all 95 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | __author__="Josh Montague" 3 | __license__="MIT License" 4 | 5 | # 6 | # This module defines a number of helper functions. 7 | # 8 | 9 | from datetime import datetime 10 | import logging 11 | import numpy as np 12 | import os 13 | import sys 14 | 15 | 16 | # set up a logger 17 | util_logr = logging.getLogger(__name__) 18 | util_logr.setLevel(logging.DEBUG) 19 | util_sh = logging.StreamHandler(stream=sys.stdout) 20 | formatter = logging.Formatter('%(asctime)s : %(name)s : %(levelname)s : %(message)s') 21 | util_sh.setFormatter(formatter) 22 | util_logr.addHandler(util_sh) 23 | 24 | 25 | def short_name(model): 26 | """Return a simplified name for this model. A bit brittle.""" 27 | # for a single model, this will work 28 | name = model.__class__.__name__ 29 | try: 30 | if hasattr(model, 'steps'): 31 | # pipeline 32 | name = '-'.join( [ pair[0] for pair in model.steps ] ) 33 | elif hasattr(model, 'best_estimator_'): 34 | if hasattr(model.estimator, 'steps'): 35 | # gridsearchcv 36 | name = 'gscv_' + '-'.join( [x[0] for x in model.estimator.steps ]) 37 | elif hasattr(model.estimator, 'estimators'): 38 | # votingclassifier 39 | name = 'gscv_vc_' + '-'.join( [x[0] for x in model.estimator.estimators ]) 40 | elif hasattr(model, 'base_estimator_'): 41 | # bagging 42 | name = 'bag_' + short_name(model.base_estimator) 43 | except AttributeError, e: 44 | util_logr.info('utils.short_name() couldnt generate quality name') 45 | # for a single model, this will work 46 | name = model.__class__.__name__ 47 | util_logr.info('falling back to generic name={}'.format(name)) 48 | return name 49 | 50 | 51 | def create_submission(predictions, sub_name, comment=None, team='DrJ'): 52 | """Include the specified array of image predictions in a 53 | properly-formatted submission file. 
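    The file starts with a short header (a generated-file banner, the optional comment, the team name, a UTC timestamp, and the submission name), followed by one predicted label per line.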
54 | """ 55 | now = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S') 56 | submission_name = '-'.join(sub_name.split()) 57 | with open('submissions/{}_{}.submission'.format( 58 | now, submission_name), 'w') as f: 59 | 60 | f.write('#'*20 + ' Generated submission file\n') 61 | if comment is not None: 62 | f.write('# ' + comment + '\n') 63 | f.write('{}\n'.format(team)) 64 | f.write('{}\n'.format(now)) 65 | f.write('{}\n'.format(sub_name)) 66 | for p in predictions: 67 | f.write('{}\n'.format(p)) 68 | return True 69 | 70 | 71 | def load_np_arrays(files='original'): 72 | """ 73 | Return numpy arrays for training dataset, training labels, 74 | and test dataset (in that order). If files='original', 75 | return the image data, as given. If files='expanded', 76 | return the perturbed image files (~5 times larger). 77 | 78 | files='original', 'expanded' 79 | """ 80 | # nb: path assumes that we call this function from project root 81 | train_imgs_f = 'train-images.npy' 82 | train_labels_f = 'train-labels.npy' 83 | 84 | if files == 'expanded': 85 | train_imgs_f = 'expanded-' + train_imgs_f 86 | train_labels_f = 'expanded-' + train_labels_f 87 | 88 | X_train = np.load(os.path.join('data', train_imgs_f)) 89 | y_train = np.load(os.path.join('data', train_labels_f)) 90 | X_test = np.load(os.path.join('data', 'test-images.npy')) 91 | 92 | return (X_train, y_train, X_test) 93 | 94 | 95 | -------------------------------------------------------------------------------- /convert-binary-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # modified from images.py (included), by Scott Hendrickson 5 | # 6 | 7 | import fileinput 8 | import logging 9 | import numpy as np 10 | import struct 11 | import sys 12 | 13 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 14 | #level=logging.DEBUG, 15 | level=logging.INFO, 16 | stream=sys.stdout 17 | ) 18 | 19 | 20 | logging.info('Beginning data conversion') 21 | 22 | # display ascii versions of training data 23 | PLOT = False 24 | logging.info('ASCII plotting enabled? 
{}'.format(PLOT)) 25 | 26 | 27 | ######### 28 | # Get the training labels 29 | ######### 30 | logging.info("reading training labels") 31 | label_names = ["data/original/train-labels.gz"] 32 | g = fileinput.FileInput(label_names, openhook=fileinput.hook_compressed) 33 | # grab the first chunk of data for header info 34 | logging.info(" reading header") 35 | x = g.next() 36 | head = [] 37 | for i in range(2): 38 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 39 | magic, n_labels = head 40 | logging.info(" magic={}, labels={}".format(*head)) 41 | 42 | # unsigned binary ints - 1 byte each 43 | logging.info(" reading data") 44 | labels = [] 45 | j = 8 # byte index on current chunk 46 | while len(labels) < n_labels: 47 | try: 48 | val = struct.unpack("B", x[j])[0] 49 | except IndexError: 50 | # read a new chuck from file 51 | x = g.next() 52 | j = 0 53 | val = struct.unpack("B", x[j])[0] 54 | labels.append(val) 55 | j += 1 56 | logging.debug("observed labels: {}".format(labels)) 57 | 58 | label_array = np.array(labels) 59 | logging.debug(" label_array type: {} (length: {})".format(type(label_array), len(label_array))) 60 | 61 | logging.info("writing numpy label array to disk") 62 | with open('data/train-labels.npy', 'wb') as f: 63 | np.save(f, label_array) 64 | 65 | 66 | ################################ 67 | 68 | datasets = ( ('train-images', 'data/original/train-images.gz'), 69 | ('test-images', 'data/original/test-images.gz') ) 70 | 71 | for dataset in datasets: 72 | data_name, data_file = dataset 73 | logging.info('reading dataset={}, from file={}'.format(data_name, data_file)) 74 | f = fileinput.FileInput([data_file], openhook=fileinput.hook_compressed) 75 | x = f.next() 76 | # start with the relevant header data 77 | head = [] 78 | logging.info(" reading header") 79 | for i in range(4): 80 | head.append(struct.unpack(">I", x[4*i:4*i+4])[0]) 81 | magic, n_images, rows, columns = head 82 | logging.info(" magic={}, images={}, rows={}, cols={}".format(*head)) 83 | 84 | # now we know the shape of the data, so we can allocate an array 85 | data_array = np.zeros((n_images, rows*columns), dtype=int) 86 | 87 | # onto the main file data 88 | logging.info(" reading data") 89 | j = 16 # index in current chunk 90 | for i in range(n_images): 91 | # keep track of all values for this sample (image) 92 | sample_i_values = [] 93 | for r in range(rows): 94 | # keep appending to sample array all the way through 95 | # the rows and cols of sample i 96 | for c in range(columns): 97 | try: 98 | val = struct.unpack("B", x[j])[0] 99 | except IndexError: 100 | # need to read a new chunck of data from finle 101 | x = f.next() 102 | j = 0 103 | val = struct.unpack("B", x[j])[0] 104 | if PLOT: 105 | ################################## 106 | # simple image plots using screen text layout 107 | # 3 levels of grey 108 | if val > 170: 109 | print "#", 110 | elif val > 85: 111 | print ".", 112 | else: 113 | print " ", 114 | ################################## 115 | # append this value to the sample row 116 | sample_i_values.append(val) 117 | j += 1 118 | if PLOT: 119 | print "row={:2}, j={:4}".format(r,j) 120 | if PLOT and data_name is 'train-images': 121 | # there are no labels for the test dataset 122 | print "image={}, label={}".format(i, labels[i]) 123 | # visually verify that our numeric values are similar to the ascii art 124 | logging.debug("sample_i_values (len={}): {}".format(len(sample_i_values), sample_i_values)) 125 | 126 | # update the row in our cumulative array that corresponds to this sample (image) 127 | 
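        # (sample_i_values now holds rows*columns pixel values in row-major order -- 784 for the 28x28 MNIST images)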
data_array[i] = np.array(sample_i_values) 128 | 129 | # after the for loop, save for later use (more transparently, with .npy arrays) 130 | logging.info("writing {} to disk as numpy array".format(data_name)) 131 | with open('data/{}.npy'.format(data_name), 'wb') as f: 132 | np.save(f, data_array) 133 | 134 | -------------------------------------------------------------------------------- /expand-np-arrays.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | """ 7 | Adapted from: 8 | https://github.com/mnielsen/neural-networks-and-deep-learning 9 | """ 10 | 11 | import argparse 12 | import gc 13 | import logging 14 | import numpy as np 15 | import random 16 | import sys 17 | 18 | import utils 19 | 20 | 21 | # this will be helpful for displaying arrays 22 | np.set_printoptions(linewidth=200) 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("-v", "--verbose", action="store_true", 26 | help="increase output verbosity") 27 | args = parser.parse_args() 28 | 29 | # use a simple logger - get the level from the cmd line 30 | loglevel = logging.DEBUG if args.verbose else logging.INFO 31 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 32 | stream=sys.stdout, level=loglevel) 33 | logging.debug('logging enabled - preparing for work') 34 | 35 | 36 | logging.info('reading original input data from disk') 37 | try: 38 | # nb: we don't need X_test right now 39 | X_train_full, y_train_full, X_test = utils.load_np_arrays() 40 | logging.debug('observed data dimensions: {}, {}. {}'.format( 41 | X_train_full.shape, y_train_full.shape, X_test.shape)) 42 | except IOError, e: 43 | # let it crash, but give some insight in the log 44 | logging.warn('Error reading data from files (do they exist yet?)') 45 | logging.warn('Error message={}'.format(e)) 46 | raise e 47 | 48 | # define the perturbations 49 | perturb_list = [ 50 | # displacement, axis, index position, index 51 | (1, 0, "first", 0), 52 | (-1, 0, "first", 27), 53 | (1, 1, "last", 0), 54 | (-1, 1, "last", 27)] 55 | 56 | # a container for our new data set 57 | expanded_data = [] 58 | 59 | # loop over all images in training set 60 | for i, (img, label) in enumerate(zip(X_train_full, y_train_full)): 61 | if i % 100 == 0: 62 | logging.info('perturbing image number {}, current length of expanded_data={}'.format(i, len(expanded_data))) 63 | # add the original array to the new array 64 | expanded_data.append((img, label)) 65 | # reshape back into square for roll() 66 | img = np.reshape(img, (28,28)) 67 | # perturb in each direction 68 | for d, ax, position, idx in perturb_list: 69 | #logging.debug('current perturb_list={}'.format([d,ax,position,idx])) 70 | # shift pixels in this image by d along ax 71 | perturbed_img = np.roll(img, d, ax) 72 | # in case pixels get shifted across the edge boundaries, 73 | # we can just set the corresponding edge to 0s (first 74 | # reshape the array so we can slice efficiently) 75 | #perturbed_img = np.reshape(perturbed_img, (28,28)) 76 | if position == "first": 77 | # first row/column 78 | perturbed_img[idx, :] = np.zeros(28) 79 | else: 80 | # last row/column 81 | perturbed_img[:, idx] = np.zeros(28) 82 | # add new (flattened) image and label to the expanded list 83 | expanded_data.append( (np.reshape(perturbed_img, 784), label) ) 84 | # e_d ~ [(np.arr, int), ... 
] 85 | 86 | logging.debug('current pertub_list={}'.format([d,ax,position,idx])) 87 | logging.debug('current label={}'.format(label)) 88 | logging.debug('original image array=\n{}'.format(img)) 89 | logging.debug('shifted data array=\n{}'.format(perturbed_img)) 90 | 91 | 92 | # shuffle to avoid bias in array positions 93 | logging.info('shuffling expanded data set') 94 | random.shuffle(expanded_data) 95 | logging.debug('expanded_data=\n{}'.format(expanded_data)) 96 | 97 | # e_d is a list of (img-array, label) tuples 98 | # - zip(*e_d) pairs the elements of each img-array 99 | logging.info('converting expanded data to list of numpy arrays') 100 | expanded_data_array_list = [np.array(x) for x in zip(*expanded_data)] 101 | logging.info('length of expanded array list={}'.format(len(expanded_data_array_list))) 102 | 103 | # extract the labels from the last column of the array 104 | #y_expanded = expanded_data_array[:,-1] 105 | # and the data from everything *but* the last column 106 | #X_expanded = expanded_data_array[:,:-1] 107 | X_expanded = expanded_data_array_list[0] 108 | y_expanded = expanded_data_array_list[1] 109 | 110 | logging.debug('X_expanded (length={})=\n{}'.format(len(X_expanded), X_expanded)) 111 | logging.debug('y_expanded (length={})=\n{}'.format(len(y_expanded), y_expanded)) 112 | 113 | # verify 114 | n=5 115 | for i in range(n): 116 | logging.debug('expanded image {} (label={}):\n{}'.format(i, y_expanded[i], np.reshape(X_expanded[i,:], (28,28)))) 117 | 118 | 119 | # after the for loop, save for later use (more transparently, with .npy arrays) 120 | for name, dataset in [('expanded-train-images', X_expanded) , ('expanded-train-labels', y_expanded)]: 121 | logging.info("writing {} to disk as numpy array".format(name)) 122 | with open('data/{}.npy'.format(name), 'wb') as f: 123 | np.save(f, dataset) 124 | 125 | logging.info('done expanding data') 126 | 127 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNIST + ``scikit-learn`` = :star2: 2 | 3 | This code was developed for a intra-team Kaggle-like modeling competition on the canonical [MNIST handwritten digits dataset](https://en.wikipedia.org/wiki/MNIST_database). For more narrative on the approach and process, [you can read this article.](http://joshmontague.com/posts/2016/mnist-scikit-learn/) 4 | 5 | We were given the training images and labels, the test images, and a simple Python script that read (and "displayed") the binary image data (also included here: ``images.py``). We had two weeks to submit any number of prediction files for the test images, one prediction per line. My highest accuracy model ([#42](https://github.com/jrmontag/mnist-sklearn/blob/master/models.py#L419)) scored 98.18% with no feature engineering. The same model with a minor amount of feature engineering ("added samples" [via image translation](https://github.com/jrmontag/mnist-sklearn/blob/master/expand-np-arrays.py)) scored a 98.68% and was my highest score. 
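The gist of that "added samples" step (implemented in ``expand-np-arrays.py``) is four one-pixel translations of each 28x28 training image, with the row or column that wraps around the edge zeroed out. A minimal sketch of the idea (not the exact code from the script; ``shift_image`` is just an illustrative name):

```python
import numpy as np

def shift_image(flat_img, d, axis):
    """Translate a flattened 28x28 image by d pixels along the given axis."""
    img = flat_img.reshape(28, 28)
    shifted = np.roll(img, d, axis=axis)
    # np.roll wraps pixels around the edge; blank out the wrapped row/column
    if axis == 0:
        shifted[0 if d > 0 else -1, :] = 0
    else:
        shifted[:, 0 if d > 0 else -1] = 0
    return shifted.reshape(784)

# each training image keeps its original copy plus four shifted copies
shifts = [(1, 0), (-1, 0), (1, 1), (-1, 1)]  # (displacement, axis)
```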
6 | 7 | ![Model #42 confusion matrix (original data)](data/expt_42.jpg "Model #42 confusion matrix (original data)") 8 | 9 | The rules were pretty simple: 10 | 11 | - write code in whatever language you want 12 | - use whatever platform you want (laptop, EC2, tea leaves, stone tablet) 13 | - be prepared to present what you did to the team (in particular, this meant if you cheated and just downloaded the test labels, you were in for A Bad Time) 14 | 15 | Given the small size of the data and how much I :heart: ``scikit-learn``, I chose to use Python on a single server (a combination of a MacBook and AWS EC2). All of the data processing and modeling code here is written in Python, though there are a couple of additional bash scripts to facilitate various pieces of the workflow. I think most of this code should work out of the box with Python 2.7 on OS X and Ubuntu (an isolated environment is recommended - the requirements.txt I used is included). 16 | 17 | **BUT WAIT!** I can do you one better: I included the entire setup that I used within this repo. If all goes as intended, the following commands will get you up and running\*. 18 | 19 | ```bash 20 | $ git clone 21 | $ cd mnist-sklearn 22 | $ make demo 23 | ``` 24 | 25 | \*the prerequisites for using this code are having Python 2.7 (yeah, I know it's old), ``make``, and ``virtualenv`` installed. Your machine likely has ``make`` installed already. If needed, you can ``sudo pip install virtualenv``. 26 | 27 | ## What's happening here? 28 | 29 | The ``make demo`` command will do the following things: 30 | 31 | - use ``virtualenv`` to create an isolated Python environment in this directory (and install all the necessary libraries) 32 | - download the raw binary data from [Yann LeCun's website](http://yann.lecun.com/exdb/mnist/) (about 10 MB) 33 | - convert those binary files to ``numpy`` arrays and write them to disk (about 400 MB) 34 | - start a set of sample model runs 35 | - display the confusion matrices from the best cross-validation model 36 | 37 | The one-time environment setup may take a few minutes. The three sample models are staggered by the ``bash`` script and should be done in about 45 seconds. You can look at the data in ``log/`` to see what's going on, and look in ``saved_models/`` to see both the serialized models and per-model confusion matrices. 38 | 39 | ## What should I do next? 40 | 41 | For more standard usage, the designed approach is to add new Pipelines to ``models.py`` (with accompanying descriptions and names, used in file-nameing conventions). Then, update the ``SEQUENCE`` variable in ``launch-processes.bash`` - either using a single value, or a range via ``seq``. Each "experiment" (as the ``expt_*`` convention was intended), will create a new log file and all logs from a single use of ``launch-process.bash`` will share a timestamp for ease of separating your trials. 42 | 43 | Since some of the models can take minutes to hours to run, the recommended syntax is something like: 44 | 45 | ```bash 46 | $ nohup bash launch-processes.bash > log/2016-01-12_expt-4-12.nohup.log & 47 | ``` 48 | 49 | This will let you disconnect from the session while things are still running, and also log (in the nohup log) any unexpected exceptions that crash your code. 50 | 51 | Given the relatively small size of data, most of these models seem to be CPU bound. For optimal iteration time (and fun of watching ``htop``), a high-CPU-count server is the best approach. Go ask AWS for something from the C3 or C4 family of EC2 instances. 
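To make the "add new Pipelines to ``models.py``" step above concrete: each entry of the module's ``experiment_dict`` is keyed by ``expt_*`` and carries ``note``, ``name``, and ``pl`` fields. A hypothetical new entry might look like the following (the experiment number and the particular pipeline are only examples; ``Pipeline``, ``StandardScaler``, and ``LogisticRegression`` are already imported at the top of ``models.py``):

```python
# hypothetical addition to experiment_dict in models.py
'expt_99': {
    'note': 'scaled logistic regression with stronger regularization',
    'name': 'scaled logreg (C=0.1)',
    'pl': Pipeline([ ('scaling', StandardScaler()),
                     ('log-reg', LogisticRegression(C=0.1, n_jobs=-1)) ])
},
```

With that in place, set ``SEQUENCE=99`` (or a range via ``seq``) in ``launch-processes.bash`` and launch it as described above.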
52 | 53 | As you dig in further, review the Notes section below for some important details. 54 | 55 | Questions? [Let me know](https://www.twitter.com/jrmontag)! Otherwise, have fun classifying! 56 | 57 | ----- 58 | 59 | ## Notes 60 | 61 | - I'd recommend using an isolated environment; I used ``virtualenv`` and the Makefile will help you set that up 62 | - for reasons unclear to me, when ``pip install``ing these requirements on Ubuntu, ``pip`` threw a ``Failed building wheel`` error for ``numpy, scipy,`` and ``scikit-learn``. It appeared to then recover and be happy by building them via ``setup.py``. ``¯\_(ツ)_/¯`` 63 | - As of the time of writing, the development branch of ``sklearn`` is required to use the ``MLPClassifier`` (Multi-layer perceptron). See the additional notes below for instructions on setting this up. 64 | - This code was developed on both OS X and Ubuntu, which can lead to inconsistencies in library behavior. At least once, I had to modify the ``matplotlib`` "backend" in the corresponding Python environment ``matplotlibrc`` file. In the end, I was using ``macosx`` and ``agg`` (on OS X and Ubuntu, respectively). 65 | - The expanded data set (small, linear translations of the original data) used to obtain the top score is generated by a separate ``make`` command (``make expanded``), and is ~2 GB. 66 | - As they get more complicated, models can take from order seconds (default k-nearest neighbors) to order hours (Multi-layer Perceptron). If using shared resources (or, anecdotally, to increase the efficiency of running multiple processes), I recommend increasing the ``SLEEPTIME`` variable in the launch script to something like 1-5 minutes. 67 | 68 | 69 | ### Installing the development branch of ``sklearn`` 70 | 71 | I sort of figured this out by trial and error (on Ubuntu, didn't test on OS X), so there may be a better way to do it. Nevertheless, here's what I did that worked: 72 | 73 | - If you want to build a separate virtualenv from the one with stable ``sklearn`` (this is, after all, part of the point of virtualenv), do the following: 74 | 75 | ```bash 76 | # install similar libraries as before, plus cython 77 | $ make skl-dev-env 78 | # install sklearn from the git commit I used 79 | $ source dev-tmp-venv/bin/activate 80 | $ pip install git+https://github.com/scikit-learn/scikit-learn.git@7cfa55452609c717c96b4c267466c80cc4038845 81 | ``` 82 | 83 | - Or, if you want to use the virtualenv that you've already built, you can: 84 | 85 | ```bash 86 | # replace sklearn + install cython 87 | $ source dev-tmp-venv/bin/activate 88 | $ pip uninstall sklearn 89 | $ pip install cython 90 | $ pip install git+https://github.com/scikit-learn/scikit-learn.git@7cfa55452609c717c96b4c267466c80cc4038845 91 | ``` 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /notes.md: -------------------------------------------------------------------------------- 1 | # CCC notes, 2015 edition 2 | 3 | These are the notes that I was keeping while working through the development of this code. For posterity's sake, I decided to keep them attached to the repo. Enjoy the walk through my thought process!
4 | 5 | ----------- 6 | 7 | ## kick-off meeting 8 | 9 | - review project definition 10 | - submission format (see "random" example, too): 11 | 12 | 13 | 14 | 15 | 16 | 17 | ## outline 18 | 19 | - [x] read some scikit examples 20 | - [x] copy the given images code into lab, create numpy arrays 21 | - because of some differences in fileInput in the 3.4 interpreter, reverting back to 2.7 22 | - able to run images.py as given 23 | - work on getting the arrays out into np arrays => write to file for easy import later 24 | - note: takes ~30s to read original data into np array 25 | - [x] make first prediction file (interactively) 26 | - add test data to the existing image.py script 27 | - use dummy classifier 28 | - [x] move binary => numpy creation into Makefile 29 | - still in notebook 30 | - other formats? 31 | - joblib (compressed) [link](https://pythonhosted.org/joblib/persistence.html) 32 | - HDF5 (groups) [link](http://docs.h5py.org/en/latest/quick.html#appendix-creating-a-file) 33 | - [this (old) post](https://robertdragan.wordpress.com/2012/08/31/comparying-various-methods-for-saving-and-loading-numpy-arrays/) shows that joblib/hdf5/numpy are all about the same in performance. joblib looks to win, slightly, and is recommended in the scikit-learn docs. so, use that one. 34 | - this seems to pertain more to the models that carry large arrays within them. 35 | - stick to .npy arrays from original data, and clean up that code 36 | - [x] improve virtualenv setup 37 | - get virtualenv incorporated with existing conversion script 38 | - can now ``make everything`` from scratch! 39 | - [x] use basic SVM in notebook (memory of this being good baseline for MNIST) 40 | - [x] move utils code out of notebook 41 | - data reading, submissions creation 42 | - fix import paths for bin/ 43 | - [x] build diagnostics (to save with each model) 44 | - scoring (accuracy + stdev) 45 | - ``cross_val_predict`` + confusion matrix 46 | - [x] per-experiment executables 47 | - create python module that defines the list of models, steps 48 | - use eg ``experiment-1.py`` to read that in and execute from bash 49 | - include saving model, logging, saving confusion matrix 50 | - [x] next round of experiments 51 | - loop over: scaling v. no scaling X every default classifier 52 | - summarize results (``$ cat log/*.log | grep "+/-" | cut -d"=" -f2,5- | sort -nr``): 53 | - k-NN (96.61%) 54 | - scaled rbf SVM (95.76%) 55 | - scaled RF (93.67%) 56 | - scaled k-NN (93.65%) 57 | - RF (93.59%) 58 | - train the top two (kNN [expt4] + scaled rbf SVM [expt7]) on all training data => predict + submission 59 | - make another python utility that takes the experiment pipeline (from models.py) and creates a submission file 60 | - the kNN is pretty fast, but the rbf SVM takes a while train/predict 61 | - also ran the wrong model (expt6); killed and running expt7 now 62 | - submitted kNN (Mac & Kelly) 63 | - submitted scaled rbf SVM (compact popcorn machine) 64 | - didn't include multi-class LR (scaled and not) in initial experiment -- do that now 65 | - not stellar, ~90% 66 | - [x] gridsearch (first level) the best performers from ^ 67 | - gridsearchcv the top three performers above (kNN, scaled rbf SVM, scaled RF) 68 | - need to find reasonable params for each model 69 | - update any code that relies on on pipeline (instead of gscv) 70 | - best gridsearch kNN had ~95%, lower than with default settings 71 | - killed and restarted a few times with attempts to boost efficiency 72 | - these are expts 24-26. 
best results: 73 | - kNN ~ 95% ({'knn__n_neighbors': 3}) 74 | - default k=10 (was 96.6%) 75 | - SVM ~ 93% ({'rbf_svm__C': 10, 'rbf_svm__gamma': 0.0001, 'rbf_svm__class_weight': 'balanced'} [tie w/ c_w=None]) 76 | - default C=1, gamma='auto' (1/n_features ~ 0.008), classweight=None 77 | - RF ~ 94% ({'random_forest__n_estimators': 100, 'random_forest__max_features': 'auto'}) 78 | - default n_estimators=10, max_features='auto' 79 | - [x] gridsearch (second level) 80 | - focus ranges around blend of "default" and last GS best performance 81 | - these are expts 27-29; check logs for best 82 | - kNN: ~95%. k=3-5, all similar, weight='distance'; {'knn__weights': 'distance', 'knn__n_neighbors': 4} 83 | - SVM: ~95%. C=10, gamma=0.001/auto, similar; {'rbf_svm__C': 10, 'rbf_svm__gamma': 'auto', 'rbf_svm__class_weight': 'balanced'} 84 | - RF: ~95%. {'random_forest__n_estimators': 500, 'random_forest__max_features': 'auto'} 85 | - once these are fit, submit one or more 86 | - train RF on everything (expt 32), submit results 87 | - [x] intermission to clean up repo & file structure 88 | - [x] didn't try ExtraTrees classifier earlier -- try this now 89 | - 94%, slightly higher with scaling (still 94.x%) 90 | - [x] read about [ensemble methods](http://scikit-learn.org/stable/modules/ensemble.html#ensemble-methods) 91 | - averaging 92 | - Bagging == use this one each of 3 best 93 | - boosting 94 | - AdaBoost ==> use large adaboost classifier with each of 3 best classifiers 95 | - supposedly better with e.g. shallow trees, maybe our gridsearch'd models are opposed to this? 96 | - VotingClassifer 97 | 98 | 99 | - [x] set up baggingclassifier with each of the three best as base 100 | - if the 'pl' is BaggingClassifier(Pipeline()), need to update e.g. utils.name() anything in run-experiment.py? 101 | - seems like the baggingclassifier params could also be gridsearched 102 | - running all in parallel seems too much for # of cores... increase stagger 103 | - bagging kNN didn't finish. 104 | - run this one again (expt_35) 105 | - ~96.8%, improvement over single RF 106 | - bagging SVM (36) ~ 96.3% - improvement over the single svm 107 | - bagging RF (37) ~ 96.3% - improvement over single RF 108 | - submit each of these trained on full dataset 109 | - [x] set up adaboost with best RF 110 | - must have class weights and proper attrs in estimator (SVC doesn't, RF does, kNN doesn't) 111 | - try 100 estimators (expt 38) 112 | - very fast. 
~ 96.5% (slightly better than bagging) 113 | - submit 114 | - [x] gridsearch votingclassifier on top of the 3 gs'd classifiers to find best 'vote' type 115 | - then fit, train, predict, submit that one 116 | - debug: JoblibAttributeError 117 | - seems related to probability estimates in SVC; add probability=True to constructor (doesn't appear to effect other performance) 118 | - 0.962 (+/-0.010) for {'voting': 'soft'} 119 | - was expecting a better result by "averaging" 120 | - trail full model with voting=soft & submit 121 | - [x] gridsearch VC for 'vote' w/ 3x bagged or adaboost 122 | - 3x bagged: 0.960 (+/-0.009) for {'voting': 'soft'} 123 | - 2x bagged + adaboost RF: 0.961 (+/-0.010) for {'voting': 'soft'} 124 | - was expecting a better result by "averaging" 125 | - train full model on 3x bagging w/ voting=soft & submit (43) 126 | - train full model on 2x bagging + RF boosting w/ voting=soft & submit (44) 127 | - [x] use ``class_weights`` in RF & SVM models to reverse-engineered values from scoreboard 128 | - look at submission for Small Wooded Treatment Plant Fence (expt-32) and count up the predictions 129 | 130 | ```bash 131 | $ tail -n+6 submissions/2015-12-24T18:16:42_Small-Wooded-Treatment-Plant-Fence.submission | sort | uniq -c | sort -n | sed 's/^ *//' | sort -t" " -k2,2 > data/SWTPF_counts.csv 132 | 997 0 133 | 1135 1 134 | 1039 2 135 | 1011 3 136 | 980 4 137 | 880 5 138 | 961 6 139 | 1022 7 140 | 969 8 141 | 1006 9 142 | ``` 143 | 144 | - now look at per-count accuracy from scoreboard (data/SWTPF-leaderboard-scores.csv) 145 | 146 | ```bash 147 | 0.99081633 0 148 | 0.99030837 1 149 | 0.9689922481 2 150 | 0.9653465347 3 151 | 0.9735234216 4 152 | 0.9674887892 5 153 | 0.9791231733 6 154 | 0.96692607 7 155 | 0.9599589322 8 156 | 0.9554013875 9 157 | ``` 158 | - and now we can combine them to get the actual count of digits in the leaderboard test set (if we round) 159 | 160 | ```bash 161 | $ join SWTPF_counts.csv SWTPF-leaderboard-scores.csv -1 2 -2 2 | awk 'BEGIN { sum = 0 } { printf "%d %d\n", $1, $2/$3; sum+=$2/$3 } END { printf "\n%d \n", sum }' 162 | 0 1006 163 | 1 1146 164 | 2 1072 165 | 3 1047 166 | 4 1006 167 | 5 909 168 | 6 981 169 | 7 1056 170 | 8 1009 171 | 9 1052 172 | 173 | 10288 174 | 175 | $ join SWTPF_counts.csv SWTPF-leaderboard-scores.csv -1 2 -2 2 | awk 'BEGIN { sum = 10288 } { printf "%d %1.3f\n", $1, $2/$3/sum }' 176 | 0 0.098 177 | 1 0.111 178 | 2 0.104 179 | 3 0.102 180 | 4 0.098 181 | 5 0.088 182 | 6 0.095 183 | 7 0.103 184 | 8 0.098 185 | 9 0.102 186 | ``` 187 | 188 | - in principal, we can now use the relative prevalence of these to weight the classes in eg the SVC model (want the weights to sum to one) 189 | - both SVC and RF support passing the class weights, repurpose the best-performing versions of those 190 | - RF: expt 32 (scalded RF) performed best on leaderboard (97.2%) 191 | - reuse with weights (45) => ~96.6%, decent 192 | - leaderboard score => 97.1% 193 | - SVM: expt 36 (bagged, scaled, gs'd SVM) performed best on leaderboard (97.1%) 194 | - reuse with weights (46) => ~95.6%, decent 195 | - leaderboard score => 96.4% 196 | - submit these as stand-alone models [running now] 197 | - relaunched them because they weren't named (overwrite log files) 198 | - sent 199 | - neither were much higher than the original; don't bother updating VotingClassifier 200 | - [x] sklearn's built-in NN (MLPClassifier) 201 | - big gridsearch 202 | - dang. 
MLPC only in dev version of scikit 203 | - see if we can create a local virtualenv for that 204 | - looks ok, running as expt_47 205 | 206 | ```bash 207 | jmontague@data-science-3:~ 208 | $ virtualenv -p python ~/CCC-venv 209 | 210 | jmontague@data-science-3:~/2015-12-21_CCC [master+*] 211 | $ source ~/CCC-venv/bin/activate 212 | 213 | jmontague@data-science-3:~ 214 | $ pip install -r requirements.txt 215 | $ pip uninstall scikit-learn 216 | 217 | (CCC-venv)jmontague@data-science-3:~/2015-12-21_CCC [master+*] 218 | $ pip install -e git+git@github.com:scikit-learn/scikit-learn.git 219 | 220 | (CCC-venv)jmontague@data-science-3:~/2015-12-21_CCC [master+*] 221 | $ pip install cython 222 | 223 | (CCC-venv)jmontague@data-science-3:~/CCC-venv/lib/python2.7/site-packages/scikit-learn [master] 224 | $ python setup.py build_ext --inplace 225 | $ python 226 | >>> import sklearn; sklearn.__version__ 227 | '0.18.dev0' 228 | 229 | # but didn't build/install totally correctly, maybe ran setup.py in wrong place? 230 | # - in virtualenv, get sklearn ImportError 231 | # - resolve by either (in the launch-process.bash script): 232 | $ export PYTHONPATH=~/CCC-venv/lib/python2.7/site-packages/scikit-learn:$PYTHONPATH 233 | # or: 234 | jmontague@data-science-3:~/CCC-venv/lib/python2.7/site-packages 235 | $ ln -s scikit-learn/sklearn sklearn 236 | ``` 237 | 238 | - ran gridsearch on 47 - worked, high of ~94.8% 239 | 240 | - [x] try again with updated grid based on scores 241 | - best alpha was on edge of grid - run again to extend on larger end 242 | - also didn't think to add extra layers - add that, too: [(50,), (100,), (200,), (50,50), (100,100), (200,200), (50,50,50), (100,100,100), (200,200,200)] 243 | - drop 'sgd' algorithm 244 | - stick to 'relu' activation 245 | - best score (from 48) ~ 95.6% 246 | - extra layer, alpha at edge of grid again 247 | - look through this more & make next round of GS: 248 | ``cat log/2015-12-29T03:43:29_expt_48.log | grep for | sort -n -t" " -k6,6`` 249 | 250 | - [x] run with larger range of layer sizes and other params 251 | - took ~4 hrs for GSCV 252 | - best model ~95.8%: {'mlp__hidden_layer_sizes': (1000, 1000), 'mlp__algorithm': 'l-bfgs', 'mlp__alpha': 10.0} 253 | - convert best to train for submission (52 - note: out of order bc of earlier long run times 254 | 255 | - [x] test other SVM kernels (in particular, poly w/ gs on degree) 256 | - running as expt_50 257 | - best ~95.1%, {'svm__degree': 2, 'svm__C': 15.0} 258 | - not noticibly better than rbf kernel (which I think should do better with high-dimensional data) 259 | - [x] combine dimensionality reduction with kNN 260 | - t-SNE to 2-5 dimensions, then kNN (expt 51) 261 | - doesn't work because TSNE doesn't have a transform method 262 | 263 | - [x] expand training data with perturbations 264 | - then train this data on all of the simplest algorithms 265 | - new train data ~ 1 GB 266 | - [x] test this with default models (expt 27, 28, 29), compare scores 267 | - need to flag run-experiment.py to read the proper dataset 268 | - these are slower to train, need to increase stagger time (killed 28 and 29 to let 27 run - restart them once 27 finishes) 269 | - 27 running for >1 hr 270 | - also much more ram (~30 GB for expt_27) 271 | - all 3 were ~95% in first round (w/ the smaller dataset) 272 | - kNN (96.1%): {'knn__weights': 'distance', 'knn__n_neighbors': 4} (which is the same as expt_30 273 | - ^ from the gscv, killed before fully finished 274 | - SVM (): 275 | - RF (): 276 | - in the interest of trying to get 
predictions, I think I'll just run a full fit and predict on the gridsearch'd voting classifier (basic three party system, 42) 277 | - running now ("expanded 42"). note: original fit to data took 1.5 hrs, and this data is 5x bigger. 278 | - took 16 hours! 279 | - submitted (98.6% on hold-out set) 280 | 281 | 282 | 283 | ## other ideas 284 | 285 | - scikit-neuralnetwork 286 | - tpot 287 | - sklearn-deap 288 | - nolearn 289 | - tensorflow 290 | 291 | 292 | 293 | ## Future work? 294 | 295 | - don't instantiate estimators in ``models.py``, wait until they're used in main script 296 | - move matrix plotting into utils module (?) 297 | - make utils.short_name less fragile 298 | - build funcs to read and display example images 299 | - look at feature importance 300 | - http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances_faces.html#example-ensemble-plot-forest-importances-faces-py 301 | - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/) 302 | - ex: in voting classifier, does per-digit accuracy vary by model? 303 | 304 | 305 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | __author__="Josh Montague" 4 | __license__="MIT License" 5 | 6 | # this module defines models and pipelines for import into 7 | # individual experiment runs 8 | 9 | import logging 10 | import numpy as np 11 | import sys 12 | 13 | # set up a logger, at least for the ImportError 14 | model_logr = logging.getLogger(__name__) 15 | model_logr.setLevel(logging.DEBUG) 16 | model_sh = logging.StreamHandler(stream=sys.stdout) 17 | formatter = logging.Formatter('%(asctime)s : %(name)s : %(levelname)s : %(message)s') 18 | model_sh.setFormatter(formatter) 19 | model_logr.addHandler(model_sh) 20 | 21 | 22 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 23 | from sklearn.dummy import DummyClassifier 24 | from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, VotingClassifier 25 | from sklearn.grid_search import GridSearchCV 26 | from sklearn.linear_model import SGDClassifier, LogisticRegression 27 | from sklearn.manifold import TSNE 28 | from sklearn.naive_bayes import GaussianNB, MultinomialNB 29 | from sklearn.neighbors import KNeighborsClassifier 30 | try: 31 | from sklearn.neural_network import MLPClassifier 32 | except ImportError, e: 33 | model_logr.info('couldnt import sklearn.neural_network') 34 | model_logr.info('... 
as of the time of writing, this requires a build of the dev release (see README)') 35 | from sklearn.preprocessing import StandardScaler 36 | from sklearn.svm import SVC 37 | from sklearn.tree import DecisionTreeClassifier 38 | 39 | from sklearn.pipeline import Pipeline 40 | 41 | 42 | experiment_dict = \ 43 | { 44 | # Note: keys are of the form expt_*, which are used to execute the 45 | # associated values of 'pl' keys 46 | # 47 | # experiments to build pipeline ################################################ 48 | 'expt_1': { 49 | 'note': 'random guessing (maintains class distributions)', 50 | 'name': 'Crash Test Dummies', 51 | 'pl': Pipeline([ ('dummy_clf', DummyClassifier()) ]) 52 | }, 53 | 'expt_2': { 54 | 'note': 'vanilla linear svm (heard it through the grapevine)', 55 | 'name': 'Grapevine', 56 | 'pl': Pipeline([ ('linear_svm', SGDClassifier(n_jobs=-1)) ]) 57 | }, 58 | 'expt_3': { 59 | 'note': 'add scaling prior to SVM (you must be this tall to ride)', 60 | 'name': 'This tall to ride', 61 | 'pl': Pipeline([ ('scaling', StandardScaler()), 62 | ('linear_svm', SGDClassifier(n_jobs=-1)) ]) 63 | }, 64 | # systematic check of default classifiers + scaling ################################ 65 | 'expt_4': { 66 | 'note': 'vanilla knn (mac and kelly from 2014 "neighbors"', 67 | 'name': 'Mac and Kelly', 68 | 'pl': Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1)) ]) 69 | }, 70 | 'expt_5': { 71 | 'note': 'scaled knn', 72 | 'name': 'scaled knn', 73 | 'pl': Pipeline([ ('scaling', StandardScaler()), 74 | ('knn', KNeighborsClassifier(n_jobs=-1)) ]) 75 | }, 76 | 'expt_6': { 77 | 'note': 'rbf kernel SVM', 78 | 'name': 'rbf kernel SVM', 79 | 'pl': Pipeline([ ('rbf-svm', SVC(kernel='rbf')) ]) 80 | }, 81 | 'expt_7': { 82 | 'note': 'scaled rbf kernel SVM', 83 | 'name': 'Portable popcorn machine', 84 | 'pl': Pipeline([ ('scaling', StandardScaler()), 85 | ('rbf-svm', SVC(kernel='rbf', cache_size=1000)) ]) 86 | }, 87 | 'expt_8': { 88 | 'note': 'default decision tree', 89 | 'name': 'default decision tree', 90 | 'pl': Pipeline([ ('decision-tree', DecisionTreeClassifier()) ]) 91 | }, 92 | 'expt_9': { 93 | 'note': 'scaled default decision tree', 94 | 'name': 'scaled default decision tree', 95 | 'pl': Pipeline([ ('scaling', StandardScaler()), 96 | ('decision-tree', DecisionTreeClassifier()) ]) 97 | }, 98 | 'expt_10': { 99 | 'note': 'default RF', 100 | 'name': 'default RF', 101 | 'pl': Pipeline([ ('random-forest', RandomForestClassifier()) ]) 102 | }, 103 | 'expt_11': { 104 | 'note': 'scaled default RF', 105 | 'name': 'scaled default RF', 106 | 'pl': Pipeline([ ('scaling', StandardScaler()), 107 | ('random-forest', RandomForestClassifier()) ]) 108 | }, 109 | 'expt_12': { 110 | 'note': 'default adaboost', 111 | 'name': 'default adaboost', 112 | 'pl': Pipeline([ ('DT-adaboost', AdaBoostClassifier()) ]) 113 | }, 114 | 'expt_13': { 115 | 'note': 'scaled default adaboost', 116 | 'name': 'scaled default adaboost', 117 | 'pl': Pipeline([ ('scaling', StandardScaler()), 118 | ('DT-adaboost', AdaBoostClassifier()) ]) 119 | }, 120 | 'expt_14': { 121 | 'note': 'default Gaussian NB', 122 | 'name': 'default Gaussian NB', 123 | 'pl': Pipeline([ ('gaussian-nb', GaussianNB()) ]) 124 | }, 125 | 'expt_15': { 126 | 'note': 'scaled Gaussian NB', 127 | 'name': 'scaled Gaussian NB', 128 | 'pl': Pipeline([ ('scaling', StandardScaler()), 129 | ('gaussian-nb', GaussianNB()) ]) 130 | }, 131 | 'expt_16': { 132 | 'note': 'default Multinomial NB', 133 | 'name': 'default Multinomial NB', 134 | 'pl': Pipeline([ ('multi-nb', MultinomialNB()) ]) 
135 | }, 136 | 'expt_17': { 137 | 'note': 'scaled Multinomial NB', 138 | 'name': 'scaled Multinomial NB', 139 | 'pl': Pipeline([ ('scaling', StandardScaler()), 140 | ('multi-nb', MultinomialNB()) ]) 141 | }, 142 | 'expt_18': { 143 | 'note': 'default LDA', 144 | 'name': 'default LDA', 145 | 'pl': Pipeline([ ('linear-da', LinearDiscriminantAnalysis()) ]) 146 | }, 147 | 'expt_19': { 148 | 'note': 'scaled LDA', 149 | 'name': 'scaled LDA', 150 | 'pl': Pipeline([ ('scaling', StandardScaler()), 151 | ('linear-da', LinearDiscriminantAnalysis()) ]) 152 | }, 153 | 'expt_20': { 154 | 'note': 'default QDA', 155 | 'name': 'default QDA', 156 | 'pl': Pipeline([ ('Quadratic-da', QuadraticDiscriminantAnalysis()) ]) 157 | }, 158 | 'expt_21': { 159 | 'note': 'scaled QDA', 160 | 'name': 'scaled QDA', 161 | 'pl': Pipeline([ ('scaling', StandardScaler()), 162 | ('Quadratic-da', QuadraticDiscriminantAnalysis()) ]) 163 | }, 164 | 'expt_22': { 165 | 'note': 'default (multi-class) Logistic regression', 166 | 'name': 'default (multi-class) Logistic regression', 167 | 'pl': Pipeline([ ('log-reg', LogisticRegression(n_jobs=-1)) ]) 168 | }, 169 | 'expt_23': { 170 | 'note': 'scaled default (multi-class) Logistic regression', 171 | 'name': 'scaled default (multi-class) Logistic regression', 172 | 'pl': Pipeline([ ('scaling', StandardScaler()), 173 | ('log-reg', LogisticRegression(n_jobs=-1)) ]) 174 | }, 175 | # gridsearch cv the best performers from above ################################ 176 | # - kNN 177 | 'expt_24': { 178 | 'note': 'gridsearch cv on kNN', 179 | 'name': 'gridsearch cv on kNN', 180 | 'pl': GridSearchCV( Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1)) ]), 181 | param_grid=dict(knn__n_neighbors=[3,12,20]), 182 | n_jobs=-1 ) 183 | }, 184 | # - scaled rbf SVM 185 | 'expt_25': { 186 | 'note': 'gridsearch cv on scaled rbf svm', 187 | 'name': 'gridsearch cv on scaled rbf svm', 188 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 189 | ('rbf_svm', SVC(kernel='rbf', cache_size=1000)) ]), 190 | param_grid=dict(rbf_svm__C=[0.1,1.0,10], 191 | rbf_svm__gamma=[0.0001,0.01,0.1], 192 | rbf_svm__class_weight=[None, 'balanced']), 193 | n_jobs=-1) 194 | }, 195 | # - scaled RF 196 | 'expt_26': { 197 | 'note': 'gridsearch cv on scaled default RF', 198 | 'name': 'gridsearch cv on scaled default RF', 199 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 200 | ('random_forest', RandomForestClassifier(n_jobs=-1)) ]), 201 | param_grid=dict(random_forest__n_estimators=[3,50,100], 202 | random_forest__max_features=[10,100,'auto']), 203 | n_jobs=-1) 204 | }, 205 | # narrower gridsearch on three models above #################################### 206 | # - kNN 207 | 'expt_27': { 208 | 'note': 'focused gridsearch cv on kNN', 209 | 'name': 'focused gridsearch cv on kNN', 210 | 'pl': GridSearchCV( Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1)) ]), 211 | param_grid=dict(knn__n_neighbors=range(2,12), 212 | knn__weights=['distance','uniform']), 213 | n_jobs=-1 ) 214 | }, 215 | # - scaled rbf SVM 216 | 'expt_28': { 217 | 'note': 'focussed gridsearch cv on scaled rbf svm', 218 | 'name': 'focussed gridsearch cv on scaled rbf svm', 219 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 220 | ('rbf_svm', SVC(kernel='rbf', cache_size=2000)) ]), 221 | param_grid=dict(rbf_svm__C=[1,2,5,10], 222 | rbf_svm__gamma=[0.001,0.005,0.01,'auto'], 223 | rbf_svm__class_weight=[None, 'balanced']), 224 | n_jobs=-1) 225 | }, 226 | # - scaled RF 227 | 'expt_29': { 228 | 'note': 'focussed gridsearch cv on scaled 
default RF', 229 | 'name': 'focussed gridsearch cv on scaled default RF', 230 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 231 | ('random_forest', RandomForestClassifier(n_jobs=-1)) ]), 232 | param_grid=dict(random_forest__n_estimators=[10,100,500,1000], 233 | random_forest__max_features=[10,20,30,'auto']), 234 | n_jobs=-1) 235 | }, 236 | # best results of gridsearch'd models above #################################### 237 | # - best kNN 238 | 'expt_30': { 239 | 'note': 'best gridsearch result for kNN', 240 | 'name': 'Neighborhood Treatment Plant Fence', 241 | 'pl': Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 242 | weights='distance', 243 | n_neighbors=4)) ]) 244 | }, 245 | # - best scaled rbf SVM 246 | 'expt_31': { 247 | 'note': 'best gridsearch result for scaled rbf svm', 248 | 'name': 'Small Popcorn Treatment Plant Fence', 249 | 'pl': Pipeline([ ('scaling', StandardScaler()), 250 | ('rbf_svm', SVC(kernel='rbf', 251 | cache_size=2000, 252 | C=10.0, 253 | gamma='auto', 254 | class_weight='balanced')) ]) 255 | }, 256 | # - best scaled RF 257 | 'expt_32': { 258 | 'note': 'best gridsearch result for scaled RF', 259 | 'name': 'Small Wooded Treatment Plant Fence', 260 | 'pl': Pipeline([ ('scaling', StandardScaler()), 261 | ('random_forest', RandomForestClassifier(n_jobs=-1, 262 | n_estimators=500, 263 | max_features='auto')) ]) 264 | }, 265 | # ensemble decision tree classifer that didn't get run earlier #################################### 266 | 'expt_33': { 267 | 'note': 'ExtraTrees', 268 | 'name': 'ExtraTrees', 269 | 'pl': Pipeline([ ('extra-trees', ExtraTreesClassifier(n_jobs=-1)) ]) 270 | }, 271 | 'expt_34': { 272 | 'note': 'scaled default ExtraTrees', 273 | 'name': 'scaled default ExtraTrees', 274 | 'pl': Pipeline([ ('scaling', StandardScaler()), ('extra-trees', ExtraTreesClassifier(n_jobs=-1)) ]) 275 | }, 276 | # bagging versions of three best classifiers ################################## 277 | # - kNN 278 | 'expt_35': { 279 | 'note': 'bagging on best gridsearched kNN estimator', 280 | 'name': 'Sack of Flanders', 281 | 'pl': BaggingClassifier( 282 | Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 283 | weights='distance', 284 | n_neighbors=4)) ]), 285 | n_jobs=-1, 286 | n_estimators=10) 287 | 288 | }, 289 | # - best scaled rbf SVM 290 | 'expt_36': { 291 | 'note': 'bagging on best gridsearch scaled rbf svm', 292 | 'name': 'Sack of small popcorn', 293 | 'pl': BaggingClassifier( 294 | Pipeline([ ('scaling', StandardScaler()), 295 | ('rbf_svm', SVC(kernel='rbf', 296 | cache_size=2000, 297 | C=10.0, 298 | gamma='auto', 299 | class_weight='balanced')) ]), 300 | n_jobs=-1, 301 | n_estimators=10) 302 | }, 303 | # - best scaled RF 304 | 'expt_37': { 305 | 'note': 'bagging on best gridsearch result for scaled RF', 306 | 'name': 'Sack of small shrubs', 307 | 'pl': BaggingClassifier( 308 | Pipeline([ ('scaling', StandardScaler()), 309 | ('random_forest', RandomForestClassifier(n_jobs=-1, 310 | n_estimators=500, 311 | max_features='auto')) ]), 312 | n_jobs=-1, 313 | n_estimators=10) 314 | }, 315 | # adaboost with best RF (must supports class weights) ##################### 316 | # - best scaled RF 317 | 'expt_38': { 318 | 'note': 'adaboost on best gridsearch result for scaled RF', 319 | 'name': 'On the shoulders of Ents', 320 | 'pl': Pipeline([ ('scaling', StandardScaler()), 321 | ('adaboost_random_forest', AdaBoostClassifier( 322 | RandomForestClassifier(n_jobs=-1, 323 | n_estimators=500, 324 | max_features='auto'), 325 | n_estimators=100)) ]) 326 | }, 327 | # ensemble 
voting ################################################ 328 | # - gridsearch voting w/ best three stand-alone models 329 | 'expt_39': { 330 | 'note': 'gs over voting across best gs models', 331 | 'name': 'gs over voting across best gs models', 332 | 'pl': GridSearchCV( 333 | VotingClassifier( estimators=[ 334 | ('gs_knn', Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 335 | weights='distance', 336 | n_neighbors=4)) ])), 337 | ('gs_svm', Pipeline([ ('scaling', StandardScaler()), 338 | ('rbf_svm', SVC(kernel='rbf', 339 | probability=True, 340 | cache_size=2000, 341 | C=10.0, 342 | gamma='auto', 343 | class_weight='balanced')) ])), 344 | ('gs_rf', Pipeline([ ('scaling', StandardScaler()), 345 | ('random_forest', RandomForestClassifier(n_jobs=-1, 346 | n_estimators=500, 347 | max_features='auto')) ])) ]), 348 | param_grid=dict(voting=['hard','soft']), 349 | n_jobs=-1) 350 | }, 351 | # - gridsearch voting w/ bagged combos 352 | 'expt_40': { 353 | 'note': 'gs over voting across bagged best gs models', 354 | 'name': 'gs over voting across bagged best gs models', 355 | 'pl': GridSearchCV( 356 | VotingClassifier( estimators=[ 357 | ('bag_knn', BaggingClassifier( 358 | KNeighborsClassifier(n_jobs=-1, 359 | weights='distance', 360 | n_neighbors=4), 361 | n_jobs=-1, 362 | n_estimators=10)), 363 | ('bag_svm', BaggingClassifier( 364 | Pipeline([ ('scaling', StandardScaler()), 365 | ('rbf_svm', SVC(kernel='rbf', 366 | probability=True, 367 | cache_size=2000, 368 | C=10.0, 369 | gamma='auto', 370 | class_weight='balanced')) ]), 371 | n_jobs=-1, 372 | n_estimators=10)), 373 | ('bag_rf', BaggingClassifier( 374 | Pipeline([ ('scaling', StandardScaler()), 375 | ('random_forest', RandomForestClassifier(n_jobs=-1, 376 | n_estimators=500, 377 | max_features='auto')) ]), 378 | n_jobs=-1, 379 | n_estimators=10))]), 380 | param_grid=dict(voting=['hard','soft']), 381 | n_jobs=-1) 382 | }, 383 | # - gridsearch voting w/ bagged + boosted rf 384 | 'expt_41': { 385 | 'note': 'gs over voting across bagged + boosted best gs models', 386 | 'name': 'gs over voting across bagged + boosted best gs models', 387 | 'pl': GridSearchCV( 388 | VotingClassifier( estimators=[ 389 | ('bag_knn', BaggingClassifier( 390 | KNeighborsClassifier(n_jobs=-1, 391 | weights='distance', 392 | n_neighbors=4), 393 | n_jobs=-1, 394 | n_estimators=10)), 395 | ('bag_svm', BaggingClassifier( 396 | Pipeline([ ('scaling', StandardScaler()), 397 | ('rbf_svm', SVC(kernel='rbf', 398 | probability=True, 399 | cache_size=2000, 400 | C=10.0, 401 | gamma='auto', 402 | class_weight='balanced')) ]), 403 | n_jobs=-1, 404 | n_estimators=10)), 405 | 406 | ('boost_rf', Pipeline([ ('scaling', StandardScaler()), 407 | ('adaboost_random_forest', AdaBoostClassifier( 408 | RandomForestClassifier(n_jobs=-1, 409 | n_estimators=500, 410 | max_features='auto'), 411 | n_estimators=100)) ])) ]), 412 | 413 | 414 | param_grid=dict(voting=['hard','soft']), 415 | n_jobs=-1) 416 | }, 417 | # - fix vote=soft for 39-40 (41?) 
& train on full data ############################# 418 | # - (expt 39 w/o gs + soft vote) 419 | 'expt_42': { 420 | # "3-party system" trained this model on the original data 421 | #'name': 'Basic three-party system', 422 | #'note': 'soft voting with best gs models', 423 | # "E Pluribus Unum" trained this model on the expanded data 424 | 'name': 'E pluribus unum', 425 | 'note': 'soft voting with best gs models on expanded dataset', 426 | 'pl': VotingClassifier( estimators=[ 427 | ('gs_knn', Pipeline([ ('knn', KNeighborsClassifier(n_jobs=-1, 428 | weights='distance', 429 | n_neighbors=4)) ])), 430 | ('gs_svm', Pipeline([ ('scaling', StandardScaler()), 431 | ('rbf_svm', SVC(kernel='rbf', 432 | probability=True, 433 | cache_size=2000, 434 | C=10.0, 435 | gamma='auto', 436 | class_weight='balanced')) ])), 437 | ('gs_rf', Pipeline([ ('scaling', StandardScaler()), 438 | ('random_forest', RandomForestClassifier(n_jobs=-1, 439 | n_estimators=500, 440 | max_features='auto')) ])) ], 441 | voting='soft') 442 | 443 | }, 444 | # - (expt 40 w/o gs + soft vote) 445 | 'expt_43': { 446 | 'note': 'soft voting with bagged gs models', 447 | 'name': 'PACs and the three-party system', 448 | 'pl': VotingClassifier( estimators=[ 449 | ('bag_knn', BaggingClassifier( 450 | KNeighborsClassifier(n_jobs=-1, 451 | weights='distance', 452 | n_neighbors=4), 453 | n_jobs=-1, 454 | n_estimators=10)), 455 | ('bag_svm', BaggingClassifier( 456 | Pipeline([ ('scaling', StandardScaler()), 457 | ('rbf_svm', SVC(kernel='rbf', 458 | probability=True, 459 | cache_size=2000, 460 | C=10.0, 461 | gamma='auto', 462 | class_weight='balanced')) ]), 463 | n_jobs=-1, 464 | n_estimators=10)), 465 | ('bag_rf', BaggingClassifier( 466 | Pipeline([ ('scaling', StandardScaler()), 467 | ('random_forest', RandomForestClassifier(n_jobs=-1, 468 | n_estimators=500, 469 | max_features='auto')) ]), 470 | n_jobs=-1, 471 | n_estimators=10))], 472 | voting='soft') 473 | }, 474 | # - (expt 41 w/o gs + soft vote) 475 | 'expt_44': { 476 | 'note': 'voting classifier: 2x bags + boosted RF w/ soft voting', 477 | 'name': 'SuperPACs ruin everything', 478 | 'pl': VotingClassifier( estimators=[ 479 | ('bag_knn', BaggingClassifier( 480 | KNeighborsClassifier(n_jobs=-1, 481 | weights='distance', 482 | n_neighbors=4), 483 | n_jobs=-1, 484 | n_estimators=10)), 485 | ('bag_svm', BaggingClassifier( 486 | Pipeline([ ('scaling', StandardScaler()), 487 | ('rbf_svm', SVC(kernel='rbf', 488 | probability=True, 489 | cache_size=2000, 490 | C=10.0, 491 | gamma='auto', 492 | class_weight='balanced')) ]), 493 | n_jobs=-1, 494 | n_estimators=10)), 495 | ('boost_rf', Pipeline([ ('scaling', StandardScaler()), 496 | ('adaboost_random_forest', AdaBoostClassifier( 497 | RandomForestClassifier(n_jobs=-1, 498 | n_estimators=500, 499 | max_features='auto'), 500 | n_estimators=100)) ])) ], 501 | voting='soft') 502 | }, 503 | # Include inferred class distributions in best stand-alone models of SVM, RF ################## 504 | 'expt_45': { 505 | 'note': 'add class weights to expt_32', 506 | 'name': 'Yeah I work out', 507 | 'pl': Pipeline([ ('scaling', StandardScaler()), 508 | ('random_forest', RandomForestClassifier(n_jobs=-1, 509 | n_estimators=500, 510 | max_features='auto', 511 | class_weight = {0:0.098, 512 | 1:0.111, 513 | 2:0.104, 514 | 3:0.102, 515 | 4:0.098, 516 | 5:0.088, 517 | 6:0.095, 518 | 7:0.103, 519 | 8:0.098, 520 | 9:0.102})) ]) 521 | }, 522 | 'expt_46': { 523 | 'note': 'add class weights to expt_36', 524 | 'name': 'Oh you work out?', 525 | 'pl': BaggingClassifier( 526 | 
Pipeline([ ('scaling', StandardScaler()), 527 | ('rbf_svm', SVC(kernel='rbf', 528 | cache_size=2000, 529 | C=10.0, 530 | gamma='auto', 531 | class_weight = {0:0.098, 532 | 1:0.111, 533 | 2:0.104, 534 | 3:0.102, 535 | 4:0.098, 536 | 5:0.088, 537 | 6:0.095, 538 | 7:0.103, 539 | 8:0.098, 540 | 9:0.102})) ]), 541 | n_jobs=-1, 542 | n_estimators=10) 543 | }, 544 | # 545 | # As of the time of writing, using the MLPClassifier requires building the 546 | # developer branch of sklearn. If you want to use these experiments, 547 | # the sklearn docs include a ref for building this version: 548 | # http://scikit-learn.org/stable/developers/contributing.html#git-repo 549 | # Then, you can uncomment the next few experiments below (+ 52) to run them. 550 | # 551 | # neural network experiments ################################################ 552 | # - sklearn's MLPClassifier 553 | # 'expt_47': { 554 | # 'note': 'gridsearch multilayer perceptron, using tips from dev docs', 555 | # 'name': 'tbd', 556 | # 'pl': GridSearchCV( 557 | # Pipeline([ ('scaling', StandardScaler()), 558 | # ('mlp', MLPClassifier()) ]), 559 | # param_grid=dict( mlp__alpha=10.0**-np.arange(1, 7), 560 | # mlp__hidden_layer_sizes=[(50, ), (100, ), (200, )], 561 | # mlp__activation=['logistic', 'tanh', 'relu'], 562 | # mlp__algorithm=['l-bfgs', 'sgd', 'adam']), 563 | # n_jobs=-1) 564 | # }, 565 | # # - v2 of sklearn's MLPClassifier 566 | # 'expt_48': { 567 | # 'note': 'v2 of gridsearch multilayer perceptron, modifying param_grid', 568 | # 'name': 'tbd', 569 | # 'pl': GridSearchCV( 570 | # Pipeline([ ('scaling', StandardScaler()), 571 | # ('mlp', MLPClassifier(activation='relu')) ]), 572 | # param_grid=dict( mlp__alpha=10.0**-np.arange(-1,6), 573 | # mlp__hidden_layer_sizes=[(50,), 574 | # (100,), 575 | # (200,), 576 | # (50,50), 577 | # (100,100), 578 | # (200,200), 579 | # (50,50,50), 580 | # (100,100,100), 581 | # (200,200,200)], 582 | # mlp__algorithm=['l-bfgs', 'adam']), 583 | # n_jobs=-1) 584 | # }, 585 | # # - gridsearch wide MLP hidden layers 586 | # 'expt_49': { 587 | # 'note': 'v3 of gridsearch multilayer perceptron, modifying param_grid', 588 | # 'name': 'tbd', 589 | # 'pl': GridSearchCV( 590 | # Pipeline([ ('scaling', StandardScaler()), 591 | # ('mlp', MLPClassifier(activation='relu', verbose=True)) ]), 592 | # param_grid=dict( mlp__alpha=10.0**-np.arange(-2,5), 593 | # mlp__hidden_layer_sizes=[(200,), 594 | # (500,), 595 | # (1000,), 596 | # (200,200), 597 | # (500,500), 598 | # (1000,1000), 599 | # (500,500,500)], 600 | # mlp__algorithm=['l-bfgs', 'adam']), 601 | # n_jobs=-1) 602 | # }, 603 | # revisit SVM with poly kernel gridsearch ################################################## 604 | 'expt_50': { 605 | 'note': 'gridsearch poly kernel degree with scaled svm', 606 | 'name': 'gridsearch poly kernel degree with scaled svm', 607 | 'pl': GridSearchCV( Pipeline([ ('scaling', StandardScaler()), 608 | ('svm', SVC(cache_size=2000, 609 | kernel='poly', 610 | gamma='auto')) ]), 611 | param_grid=dict(svm__C=[0.1, 0.5, 1.0, 5.0, 10.0, 15.0], 612 | svm__degree=np.arange(2,12)), 613 | n_jobs=-1) 614 | }, 615 | # # dimensionality reduction + kNN ###################################################### 616 | # # note: this doesn't work because TSNE doesn't implement a transform method. Pipeline throws 617 | # # an error on import about this, so leave this commented out. 
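# If a dimensionality-reduction step is still wanted at this point, one option
# (a rough sketch only, not one of the experiments that was actually run) is PCA,
# which does implement a transform method and so can sit inside a Pipeline, e.g.
#     Pipeline([ ('pca', PCA(n_components=50)),
#                ('knn', KNeighborsClassifier(n_jobs=-1)) ])
# with `from sklearn.decomposition import PCA` added to the imports above.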
618 | # 'expt_51': { 619 | # 'note': 'gridsearch over tSNE dim reduction + kNN', 620 | # 'name': 'gridsearch over tSNE dim reduction + kNN', 621 | # 'pl': GridSearchCV( Pipeline([ 622 | # ('tsne', TSNE(verbose=1)), 623 | # ('knn', KNeighborsClassifier(n_jobs=-1)) ]), 624 | # param_grid=dict(tsne__n_components=[2,3,4], 625 | # tsne__perplexity=[20,30,40,50], 626 | # tsne__learning_rate=[400,700,1000], 627 | # knn__n_neighbors=range(2,10), 628 | # knn__weights=['distance','uniform']), 629 | # n_jobs=-1 ) 630 | # }, 631 | # best MLP from gridsearch (note: out of order due to run time!) ######################### 632 | # {'mlp__hidden_layer_sizes': (1000, 1000), 'mlp__algorithm': 'l-bfgs', 'mlp__alpha': 10.0} 633 | # 'expt_52': { 634 | # 'note': 'best MLP from gridsearch', 635 | # 'name': 'Pinky and the Brain', 636 | # 'pl': Pipeline([ ('scaling', StandardScaler()), 637 | # ('mlp', MLPClassifier(activation='relu', 638 | # hidden_layer_sizes=(1000,1000), 639 | # algorithm='l-bfgs', 640 | # alpha=10.0, 641 | # verbose=True)) ]) 642 | # }, 643 | 644 | } # end of experiment_dict 645 | 646 | --------------------------------------------------------------------------------
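For reference, below is a minimal sketch of how an entry in experiment_dict might be consumed. The repo's own runner (utils.py / launch-processes.bash) is not shown here, so the data-loading paths are placeholders, not the project's actual helpers:

# minimal usage sketch; the .npy paths are placeholders (see convert-binary-data.py
# and expand-np-arrays.py for how the provided .gz files are actually converted)
import numpy as np
from sklearn.cross_validation import train_test_split   # 0.17-era API (pre model_selection)

from models import experiment_dict

# hypothetical preprocessed arrays
X = np.load('data/train-images.npy')
y = np.load('data/train-labels.npy')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

expt = experiment_dict['expt_7']     # look up an experiment by its expt_* key
pipeline = expt['pl']                # the associated sklearn Pipeline / estimator
pipeline.fit(X_train, y_train)
print('{}: {:.4f}'.format(expt['name'], pipeline.score(X_test, y_test)))

# for the GridSearchCV experiments (expt_24-29, expt_39-41), the fitted object
# also exposes the winning settings via pipeline.best_params_ and pipeline.best_score_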