# Repository layout
# ├── ICA.py
# ├── PCA.py
# ├── RF.py
# ├── RP.py
# ├── benchmark.py
# ├── clustering.py
# ├── helpers.py
# ├── madelon tricks.py
# ├── parse.py
# └── run.bat
#
# /ICA.py:
# --------------------------------------------------------------------------------
# Independent Component Analysis experiments on the digits and madelon data:
#   1. mean absolute kurtosis of the components for each target dimension,
#   2. grid search of an ICA -> neural-network pipeline,
#   3. dump of the ICA-reduced datasets for the clustering script.

#%% Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers import nn_arch, nn_reg
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import FastICA

# Every file written by this script goes below this directory.
out = './ICA/'

np.random.seed(0)
# `axis=1` is passed by keyword: the positional form drop('Class', 1) is
# rejected by pandas >= 2.0.
digits = pd.read_hdf('./BASE/datasets.hdf', 'digits')
digitsX = digits.drop('Class', axis=1).copy().values
digitsY = digits['Class'].copy().values

madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', axis=1).copy().values
madelonY = madelon['Class'].copy().values


# Standardise each feature to zero mean / unit variance.
madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
#raise
#%% data for 1

# For every candidate dimension, fit ICA and record the mean absolute
# kurtosis of the resulting components.
ica = FastICA(random_state=5)
kurt = {}
for dim in dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(madelonX)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv(out + 'madelon scree.csv')


ica = FastICA(random_state=5)
kurt = {}
for dim in dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(digitsX)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv(out + 'digits scree.csv')
# A bare `raise` with no active exception raises RuntimeError, so a plain
# run of the script stops here.
# NOTE(review): presumably a deliberate stop so the cells below are run by
# hand - confirm.
raise

#%% Data for 2

grid = {'ica__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
ica = FastICA(random_state=5)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('ica', ica), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon dim red.csv')


grid = {'ica__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
ica = FastICA(random_state=5)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('ica', ica), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
raise  # same hard stop as above
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 45
ica = FastICA(n_components=dim, random_state=10)

madelonX2 = ica.fit_transform(madelonX)
# Re-attach the labels as the last column, named 'Class'.
madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
cols = list(range(madelon2.shape[1]))
cols[-1] = 'Class'
madelon2.columns = cols
# `key` is passed by keyword; passing it positionally is deprecated in
# recent pandas releases.
madelon2.to_hdf(out + 'datasets.hdf', key='madelon', complib='blosc', complevel=9)

dim = 60
ica = FastICA(n_components=dim, random_state=10)
digitsX2 = ica.fit_transform(digitsX)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
cols[-1] = 'Class'
digits2.columns = cols
digits2.to_hdf(out + 'datasets.hdf', key='digits', complib='blosc', complevel=9)
# --------------------------------------------------------------------------------
# /PCA.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 15 15:51:37 2017

@author: jtay
"""

#%% Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers import nn_arch,nn_reg
from matplotlib import cm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

out = './PCA/'
cmap = cm.get_cmap('Spectral')  # not referenced again in this script

np.random.seed(0)
digits = pd.read_hdf('./BASE/datasets.hdf', 'digits')
digitsX = digits.drop('Class', axis=1).copy().values
digitsY = digits['Class'].copy().values

madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', axis=1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
#raise
#%% data for 1

# Scree data: explained variance per component, indexed from 1. The index
# length is taken from the fitted model instead of being hard-coded, so the
# Series always matches the number of components actually produced.
pca = PCA(random_state=5)
pca.fit(madelonX)
tmp = pd.Series(data=pca.explained_variance_,
                index=range(1, len(pca.explained_variance_) + 1))
tmp.to_csv(out + 'madelon scree.csv')


pca = PCA(random_state=5)
pca.fit(digitsX)
tmp = pd.Series(data=pca.explained_variance_,
                index=range(1, len(pca.explained_variance_) + 1))
tmp.to_csv(out + 'digits scree.csv')


#%% Data for 2

grid = {'pca__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
pca = PCA(random_state=5)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('pca', pca), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon dim red.csv')


grid = {'pca__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
pca = PCA(random_state=5)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('pca', pca), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
# Bare `raise` with no active exception -> RuntimeError; a plain run stops
# here. NOTE(review): presumably intentional - confirm.
raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 5
pca = PCA(n_components=dim, random_state=10)

madelonX2 = pca.fit_transform(madelonX)
madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
cols = list(range(madelon2.shape[1]))
cols[-1] = 'Class'
madelon2.columns = cols
madelon2.to_hdf(out + 'datasets.hdf', key='madelon', complib='blosc', complevel=9)

dim = 60
pca = PCA(n_components=dim, random_state=10)
digitsX2 = pca.fit_transform(digitsX)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
cols[-1] = 'Class'
digits2.columns = cols
digits2.to_hdf(out + 'datasets.hdf', key='digits', complib='blosc', complevel=9)
# --------------------------------------------------------------------------------
# /RF.py:
# --------------------------------------------------------------------------------
# Random-forest feature-importance selection experiments (same three stages
# as the ICA / PCA scripts).

#%% Imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers import nn_arch,nn_reg,ImportanceSelect
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


if __name__ == '__main__':
    out = './RF/'

    np.random.seed(0)
    digits = pd.read_hdf('./BASE/datasets.hdf', 'digits')
    digitsX = digits.drop('Class', axis=1).copy().values
    digitsY = digits['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values


    madelonX = StandardScaler().fit_transform(madelonX)
    digitsX = StandardScaler().fit_transform(digitsX)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

    #%% data for 1

    rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=7)
    # The same estimator is refitted for the second dataset; the importances
    # of the first fit are read out before that happens.
    fs_madelon = rfc.fit(madelonX, madelonY).feature_importances_
    fs_digits = rfc.fit(digitsX, digitsY).feature_importances_

    # Importances sorted in descending order.
    tmp = pd.Series(np.sort(fs_madelon)[::-1])
    tmp.to_csv(out + 'madelon scree.csv')

    tmp = pd.Series(np.sort(fs_digits)[::-1])
    tmp.to_csv(out + 'digits scree.csv')

    #%% Data for 2
    filtr = ImportanceSelect(rfc)
    grid = {'filter__n': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')


    grid = {'filter__n': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(digitsX, digitsY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'digits dim red.csv')
    # raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 20
    filtr = ImportanceSelect(rfc, dim)

    madelonX2 = filtr.fit_transform(madelonX, madelonY)
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
# Continuation of /RF.py (the statements below belong to its
# `if __name__ == '__main__':` section, so the guard is re-opened here).
if __name__ == '__main__':
    # `key` is passed by keyword; passing it positionally is deprecated in
    # recent pandas releases.
    madelon2.to_hdf(out + 'datasets.hdf', key='madelon', complib='blosc', complevel=9)

    dim = 40
    filtr = ImportanceSelect(rfc, dim)
    digitsX2 = filtr.fit_transform(digitsX, digitsY)
    # Re-attach the labels as the last column, named 'Class'.
    digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
    cols = list(range(digits2.shape[1]))
    cols[-1] = 'Class'
    digits2.columns = cols
    digits2.to_hdf(out + 'datasets.hdf', key='digits', complib='blosc', complevel=9)
# --------------------------------------------------------------------------------
# /RP.py:
# --------------------------------------------------------------------------------
# Sparse random projection experiments:
#   1. pairwise-distance correlation and reconstruction error per dimension,
#      over ten random seeds,
#   2. grid search of an RP -> neural-network pipeline,
#   3. dump of the projected datasets for the clustering script.

#%% Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from collections import defaultdict
from helpers import pairwiseDistCorr,nn_reg,nn_arch,reconstructionError
from matplotlib import cm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from itertools import product

out = './RP/'
cmap = cm.get_cmap('Spectral')  # not referenced again in this script

np.random.seed(0)
# `axis=1` is passed by keyword: the positional form drop('Class', 1) is
# rejected by pandas >= 2.0.
digits = pd.read_hdf('./BASE/datasets.hdf', 'digits')
digitsX = digits.drop('Class', axis=1).copy().values
digitsY = digits['Class'].copy().values

madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', axis=1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
#raise
#%% data for 1

# tmp[dim][seed] -> metric; after the transpose each row is one dimension
# and each column one random seed.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree1.csv')


tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')


tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(madelonX)
    tmp[dim][i] = reconstructionError(rp, madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree2.csv')


tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree2.csv')

#%% Data for 2

grid = {'rp__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
rp = SparseRandomProjection(random_state=5)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('rp', rp), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon dim red.csv')


grid = {'rp__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
rp = SparseRandomProjection(random_state=5)
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('rp', rp), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
# Bare `raise` with no active exception -> RuntimeError; a plain run stops
# here. NOTE(review): presumably intentional - confirm.
raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
dim = 10
rp = SparseRandomProjection(n_components=dim, random_state=5)

madelonX2 = rp.fit_transform(madelonX)
madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
cols = list(range(madelon2.shape[1]))
cols[-1] = 'Class'
madelon2.columns = cols
madelon2.to_hdf(out + 'datasets.hdf', key='madelon', complib='blosc', complevel=9)

dim = 60
rp = SparseRandomProjection(n_components=dim, random_state=5)
digitsX2 = rp.fit_transform(digitsX)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
cols[-1] = 'Class'
digits2.columns = cols
digits2.to_hdf(out + 'datasets.hdf', key='digits', complib='blosc', complevel=9)
# --------------------------------------------------------------------------------
# /benchmark.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 17:17:14 2017

@author: JTay
"""

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from helpers import nn_arch,nn_reg
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

out = './BASE/'
np.random.seed(0)
digits = pd.read_hdf('./BASE/datasets.hdf', 'digits')
digitsX = digits.drop('Class', axis=1).copy().values
digitsY = digits['Class'].copy().values

madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', axis=1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

#%% benchmarking for chart type 2

# Baseline: the neural network alone, without any dimensionality reduction.
grid = {'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
# Continuation of /benchmark.py (uses `grid`, `out` and the datasets that
# the first part of that script defines).
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon NN bmk.csv')


mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits NN bmk.csv')
# Bare `raise` with no active exception -> RuntimeError, i.e. the script
# ends with an error even though all work is done.
# NOTE(review): presumably a leftover stop marker - confirm.
raise
# --------------------------------------------------------------------------------
# /clustering.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 10:38:28 2017

@author: jtay
"""

#%% Imports
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
# time.clock was removed in Python 3.8; perf_counter is used for the
# elapsed-time printout instead.
from time import perf_counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans as kmeans
from sklearn.mixture import GaussianMixture as GMM
from collections import defaultdict
from helpers import cluster_acc, myGMM,nn_arch,nn_reg
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import sys

# The first command-line argument names the directory that holds the input
# 'datasets.hdf' and receives every output file (see run.bat).
out = './{}/'.format(sys.argv[1])

np.random.seed(0)
# `axis=1` is passed by keyword: the positional form drop('Class', 1) is
# rejected by pandas >= 2.0.
digits = pd.read_hdf(out + 'datasets.hdf', 'digits')
digitsX = digits.drop('Class', axis=1).copy().values
digitsY = digits['Class'].copy().values

madelon = pd.read_hdf(out + 'datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', axis=1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]

#%% Data for 1-3
# Result containers, all keyed by the number of clusters k first:
#   SSE[k][dataset], ll[k][dataset], acc[k][dataset][algorithm],
#   adjMI[k][dataset][algorithm].
SSE = defaultdict(dict)
ll = defaultdict(dict)
acc = defaultdict(lambda: defaultdict(dict))
adjMI = defaultdict(lambda: defaultdict(dict))
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

st = perf_counter()
for k in clusters:
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(madelonX)
    gmm.fit(madelonX)
    SSE[k]['Madelon'] = km.score(madelonX)
    ll[k]['Madelon'] = gmm.score(madelonX)
    # Predict once per model and reuse the labels for both metrics.
    km_labels = km.predict(madelonX)
    gmm_labels = gmm.predict(madelonX)
    acc[k]['Madelon']['Kmeans'] = cluster_acc(madelonY, km_labels)
    acc[k]['Madelon']['GMM'] = cluster_acc(madelonY, gmm_labels)
    adjMI[k]['Madelon']['Kmeans'] = ami(madelonY, km_labels)
    adjMI[k]['Madelon']['GMM'] = ami(madelonY, gmm_labels)

    km.fit(digitsX)
    gmm.fit(digitsX)
    SSE[k]['Digits'] = km.score(digitsX)
    ll[k]['Digits'] = gmm.score(digitsX)
    km_labels = km.predict(digitsX)
    gmm_labels = gmm.predict(digitsX)
    acc[k]['Digits']['Kmeans'] = cluster_acc(digitsY, km_labels)
    acc[k]['Digits']['GMM'] = cluster_acc(digitsY, gmm_labels)
    adjMI[k]['Digits']['Kmeans'] = ami(digitsY, km_labels)
    adjMI[k]['Digits']['GMM'] = ami(digitsY, gmm_labels)
    print(k, perf_counter() - st)


def _per_dataset(nested, dataset):
    """Return the table for one dataset from a k -> dataset -> algorithm dict.

    Rows are the algorithms and columns the cluster counts. This replaces
    the former `pd.Panel(nested).ix[:, :, dataset]` slice; both `pd.Panel`
    and `.ix` have been removed from pandas.
    """
    return pd.DataFrame({k: nested[k][dataset] for k in nested})


# km.score is negated before being stored under the 'SSE' label.
SSE = (-pd.DataFrame(SSE)).T
SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
ll = pd.DataFrame(ll).T
ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)


SSE.to_csv(out + 'SSE.csv')
ll.to_csv(out + 'logliklihood.csv')
_per_dataset(acc, 'Digits').to_csv(out + 'Digits acc.csv')
_per_dataset(acc, 'Madelon').to_csv(out + 'Madelon acc.csv')
_per_dataset(adjMI, 'Digits').to_csv(out + 'Digits adjMI.csv')
_per_dataset(adjMI, 'Madelon').to_csv(out + 'Madelon adjMI.csv')


#%% NN fit data (2,3)

# cv=5 is given explicitly here as well, so all four searches below use the
# same number of folds regardless of the library default.
grid = {'km__n_clusters': clusters, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
km = kmeans(random_state=5)
pipe = Pipeline([('km', km), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon cluster Kmeans.csv')


grid = {'gmm__n_components': clusters, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
gmm = myGMM(random_state=5)
pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon cluster GMM.csv')




grid = {'km__n_clusters': clusters, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
km = kmeans(random_state=5)
pipe = Pipeline([('km', km), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Digits cluster Kmeans.csv')


grid = {'gmm__n_components': clusters, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
gmm = myGMM(random_state=5)
pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Digits cluster GMM.csv')


# %% For chart 4/5
# 2-D t-SNE embeddings of both datasets.
madelonX2D = TSNE(verbose=10, random_state=5).fit_transform(madelonX)
digitsX2D = TSNE(verbose=10, random_state=5).fit_transform(digitsX)
# Continuation of /clustering.py: attach the labels to the 2-D embeddings
# computed just before and write them out.
madelon2D = pd.DataFrame(np.hstack((madelonX2D, np.atleast_2d(madelonY).T)), columns=['x', 'y', 'target'])
digits2D = pd.DataFrame(np.hstack((digitsX2D, np.atleast_2d(digitsY).T)), columns=['x', 'y', 'target'])

madelon2D.to_csv(out + 'madelon2D.csv')
digits2D.to_csv(out + 'digits2D.csv')
# --------------------------------------------------------------------------------
# /helpers.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 10:47:56 2017

@author: jtay
"""

import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score as acc
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.base import TransformerMixin,BaseEstimator
import scipy.sparse as sps
from scipy.linalg import pinv
from scipy.spatial import distance

# Hyper-parameter grids shared by every neural-network grid search:
# hidden-layer layouts and L2 penalties (1e-1 ... 1e-4).
nn_arch = [(50, 50), (50,), (25,), (25, 25), (100, 25, 100)]
nn_reg = [10**-x for x in range(1, 5)]


def cluster_acc(Y, clusterLabels):
    """Score a clustering as if each cluster predicted its majority class.

    Parameters
    ----------
    Y : array of true class labels.
    clusterLabels : array of cluster assignments, same shape as `Y`.

    Returns
    -------
    Accuracy of the prediction obtained by replacing every cluster label
    with the most common true label inside that cluster.
    """
    assert (Y.shape == clusterLabels.shape)
    pred = np.empty_like(Y)
    for label in set(clusterLabels):
        mask = clusterLabels == label
        sub = Y[mask]
        # Most frequent true label among the members of this cluster.
        target = Counter(sub).most_common(1)[0][0]
        pred[mask] = target
#    assert max(pred) == max(Y)
#    assert min(pred) == min(Y)
    return acc(Y, pred)


class myGMM(GMM):
    """GaussianMixture that can act as a transformer step in a Pipeline."""

    def transform(self, X):
        """Return the per-component membership probabilities of `X`."""
        return self.predict_proba(X)


def pairwiseDistCorr(X1, X2):
    """Correlate the pairwise-distance matrices of two views of the same rows.

    `X1` and `X2` must have the same number of rows. Returns the Pearson
    correlation between the flattened distance matrices.
    """
    assert X1.shape[0] == X2.shape[0]

    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]


def aveMI(X, Y):
    """Return the NaN-ignoring mean of the per-feature mutual information with `Y`."""
    MI = MIC(X, Y)
    return np.nanmean(MI)


def reconstructionError(projections, X):
    """Mean squared error after projecting `X` and mapping it back.

    Parameters
    ----------
    projections : fitted projection object exposing `components_`
        (dense or scipy-sparse).
    X : data matrix with one row per sample.

    Returns
    -------
    NaN-ignoring mean of the squared element-wise differences between `X`
    and its reconstruction through the pseudo-inverse of the components.
    """
    W = projections.components_
    if sps.issparse(W):
        # toarray() gives a plain ndarray; todense() would return the
        # discouraged np.matrix type and propagate it through the result.
        W = W.toarray()
    p = pinv(W)
    reconstructed = ((p @ W) @ (X.T)).T  # Unproject projected data
    errors = np.square(X - reconstructed)
    return np.nanmean(errors)



# http://datascience.stackexchange.com/questions/6683/feature-selection-using-feature-importances-in-random-forests-with-scikit-learn
class ImportanceSelect(BaseEstimator, TransformerMixin):
    """Keep the `n` columns that the wrapped model ranks as most important.

    `model` must expose `fit` and, once fitted, `feature_importances_`.
    """

    def __init__(self, model, n=1):
        self.model = model
        self.n = n

    def fit(self, *args, **kwargs):
        """Fit the wrapped model; all arguments are forwarded unchanged."""
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X):
        """Return the `n` highest-importance columns, most important first."""
        return X[:, self.model.feature_importances_.argsort()[::-1][:self.n]]


#http://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
def compute_bic(kmeans, X):
    """
    Computes the BIC metric for a fitted clustering.

    Parameters:
    -----------------------------------------
    kmeans:  a single fitted clustering object exposing `cluster_centers_`,
             `labels_` and `n_clusters`

    X     :  2-D np array of the data points the clustering was fitted on

    Returns:
    -----------------------------------------
    BIC value
    """
    # assign centers and labels
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    #number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    #size of data set
    N, d = X.shape

    #compute variance for all clusters beforehand: pooled squared distance
    #of every point to its own centre, normalised by (N - m) * d
    cl_var = (1.0 / (N - m) / d) * sum(
        np.sum(distance.cdist(X[labels == i], [centers[i]], 'euclidean')**2)
        for i in range(m))

    const_term = 0.5 * m * np.log(N) * (d+1)

    BIC = np.sum([n[i] * np.log(n[i]) -
                  n[i] * np.log(N) -
                  ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
                  ((n[i] - 1) * d / 2) for i in range(m)]) - const_term

    return BIC
# --------------------------------------------------------------------------------
# /madelon tricks.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 18 13:08:50 2017

@author: JTay
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from helpers import nn_arch,nn_reg,ImportanceSelect
from matplotlib import cm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import pairwise_distances
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
out = './PCA/'
cmap = cm.get_cmap('Spectral')

np.random.seed(0)


# `axis=1` is passed by keyword: the positional form drop('Class', 1) is
# rejected by pandas >= 2.0.
madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', axis=1).copy().values
madelonY = madelon['Class'].copy().values
scaler = StandardScaler()

# The held-out split is stored under the key 'madelon_test' (that is the key
# parse.py writes). Reading 'madelon' here, as the script used to, makes
# every "Test Score" below a score on the training data.
madelon_test = pd.read_hdf('./BASE/datasets.hdf', 'madelon_test')
madelon_tstX = madelon_test.drop('Class', axis=1).copy().values
madelon_tstY = madelon_test['Class'].copy().values



# Fit the scaler on the training data only and reuse it for the test data.
madelonX = scaler.fit_transform(madelonX)
madelon_tstX = scaler.transform(madelon_tstX)


#Reproduce best estimator so far
#if __name__=='__main__':
#    rfc = RandomForestClassifier(n_estimators=100,class_weight='balanced',random_state=5,n_jobs=7)
#    filtr = ImportanceSelect(rfc)
#    grid ={'filter__n':[20],'NN__alpha':nn_reg,'NN__hidden_layer_sizes':nn_arch}
#    mlp = MLPClassifier(activation='relu',max_iter=2000,early_stopping=True,random_state=5)
#    pipe = Pipeline([('filter',filtr),('NN',mlp)])
#    gs = GridSearchCV(pipe,grid,verbose=10,cv=5)
#    gs.fit(madelonX,madelonY)
#    print('Best CV Score {}'.format(gs.best_score_))
#    print('Test Score {}'.format(gs.score(madelon_tstX,madelon_tstY)))
#    rf_features = gs.best_estimator_.steps[0][1].model.feature_importances_.argsort()[::-1][:20]


# Use PCA to find true correct features
pca = PCA(random_state=5, n_components=500)
pca.fit(madelonX)
ve = pd.Series(pca.explained_variance_)
ve.plot()
plt.xlabel('Component')
plt.ylabel('Variance Explained')
# Keep the original features whose absolute loading exceeds 0.1 in at least
# one of the last 15 principal components.
tmp = pd.DataFrame(pca.components_)
tmp = tmp.iloc[-15:, :]
pca_features = tmp.columns[tmp.abs().max() > 0.1]


xx = madelonX[:, pca_features]
xx_tst = madelon_tstX[:, pca_features]

## NN testing - standard param set
#grid ={'alpha':nn_reg,'hidden_layer_sizes':nn_arch}
#mlp = MLPClassifier(activation='relu',max_iter=3000,early_stopping=False,random_state=5)
#gs = GridSearchCV(mlp,param_grid=grid,verbose=10,cv=5)
#gs.fit(madelonX[:,pca_features],madelonY)
#print('NN - Standard params - Best CV Score {}'.format(gs.best_score_))
#print('NN - Standard params - Test Score {}'.format(gs.score(xx_tst,madelon_tstY)))
#
#
#
## NN testing - standard param set
#grid ={'alpha':[1e-4,1e-5,1e-6],'hidden_layer_sizes':[(200,100,100,64,100,100,200)]}
#mlp = MLPClassifier(activation='relu',max_iter=3000,early_stopping=False,random_state=5)
#gs = GridSearchCV(mlp,param_grid=grid,verbose=10,cv=5)
#gs.fit(madelonX[:,pca_features],madelonY)
#print('NN - Big network- Best CV Score {}'.format(gs.best_score_))
#print('NN - Big network - Test Score {}'.format(gs.score(xx_tst,madelon_tstY)))


#KNN
knn = KNeighborsClassifier()
grid = {'n_neighbors': range(1, 25, 1), 'p': [1, 2], 'weights': ['uniform', 'distance']}
gs = GridSearchCV(knn, param_grid=grid, cv=5, verbose=10)
gs.fit(xx, madelonY)
print('KNN - Best CV Score {}'.format(gs.best_score_))
print('KNN - Test Score {}'.format(gs.score(xx_tst, madelon_tstY)))


# SVM
# Continuation of /madelon tricks.py (SVM section; uses `xx`, `xx_tst`,
# `madelonY` and `madelon_tstY` defined earlier in that script).
dis = pairwise_distances(xx)
m = np.median(dis)
# Candidate RBF widths: multiples (0.1 ... 2.0) of 1 / median pairwise
# distance. Two earlier assignments to `gammas` were overwritten before
# being read and have been dropped.
gammas = [(1/m)*x for x in np.arange(0.1, 2.1, 0.1)]
param_grid = {'gamma': gammas, 'C': [10**x for x in [-1, 0, 1, 2, 3]]}
gs = GridSearchCV(SVC(kernel='rbf', C=1), param_grid=param_grid, cv=5, verbose=10, n_jobs=1)
gs.fit(xx, madelonY)
print('SVM - Best CV Score {}'.format(gs.best_score_))
print('SVM - Test Score {}'.format(gs.score(xx_tst, madelon_tstY)))



# --------------------------------------------------------------------------------
# /parse.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 15 10:39:27 2017

@author: jtay
"""

import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
import os
import sklearn.model_selection as ms

# Create the output directory of every experiment script up front.
for d in ['BASE', 'RP', 'PCA', 'ICA', 'RF']:
    os.makedirs('./{}/'.format(d), exist_ok=True)

OUT = './BASE/'
# Madelon: merge the published train and validation files, then re-split.
# `axis` is passed by keyword throughout; pandas >= 2.0 rejects the
# positional form concat(objs, 0).
madX1 = pd.read_csv('./madelon_train.data', header=None, sep=' ')
madX2 = pd.read_csv('./madelon_valid.data', header=None, sep=' ')
madX = pd.concat([madX1, madX2], axis=0).astype(float)
madY1 = pd.read_csv('./madelon_train.labels', header=None, sep=' ')
madY2 = pd.read_csv('./madelon_valid.labels', header=None, sep=' ')
madY = pd.concat([madY1, madY2], axis=0)
madY.columns = ['Class']

# Stratified 70/30 split into the training and the held-out test part.
madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
    madX, madY, test_size=0.3, random_state=0, stratify=madY)

madX = pd.DataFrame(madelon_trgX)
madY = pd.DataFrame(madelon_trgY)
madY.columns = ['Class']

madX2 = pd.DataFrame(madelon_tstX)
madY2 = pd.DataFrame(madelon_tstY)
madY2.columns = ['Class']

mad1 = pd.concat([madX, madY], axis=1)
# Drop columns that contain no values at all.
mad1 = mad1.dropna(axis=1, how='all')
# `key` is passed by keyword; passing it positionally is deprecated in
# recent pandas releases.
mad1.to_hdf(OUT + 'datasets.hdf', key='madelon', complib='blosc', complevel=9)

mad2 = pd.concat([madX2, madY2], axis=1)
mad2 = mad2.dropna(axis=1, how='all')
mad2.to_hdf(OUT + 'datasets.hdf', key='madelon_test', complib='blosc', complevel=9)



# Digits: features plus the label as the last column, named 'Class'.
digits = load_digits(return_X_y=True)
digitsX, digitsY = digits

digits = np.hstack((digitsX, np.atleast_2d(digitsY).T))
digits = pd.DataFrame(digits)
cols = list(range(digits.shape[1]))
cols[-1] = 'Class'
digits.columns = cols
digits.to_hdf(OUT + 'datasets.hdf', key='digits', complib='blosc', complevel=9)

# --------------------------------------------------------------------------------
# /run.bat:
# --------------------------------------------------------------------------------
# python clustering.py PCA
# python clustering.py BASE
# python clustering.py ICA
# python clustering.py RP
# python clustering.py RF
# --------------------------------------------------------------------------------