├── .gitignore
├── README.md
├── complib
│   ├── __init__.py
│   ├── digit_recognizer.py
│   ├── hackathon3x.py
│   ├── titanic.py
│   └── toy.py
└── kaggle.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# data
data/

# dump
dump/

# submission
submission/

# log
log/

# pic
pic/

# ple
ple/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kaggle
Hands-on Kaggle practice

--------------------------------------------------------------------------------
/complib/__init__.py:
--------------------------------------------------------------------------------
import digit_recognizer, titanic, toy, hackathon3x

__all__ = ['digit_recognizer', 'titanic', 'toy', 'hackathon3x']

--------------------------------------------------------------------------------
/complib/digit_recognizer.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

def model(*argList, **argDict):
    classifier = RandomForestClassifier(verbose=2, n_jobs=-1)

    # Exactly one grid must be active, otherwise param_grid is undefined when
    # GridSearchCV is built below; the commented grids are the other
    # single-parameter sweeps used during tuning.
    param_grid = {'n_estimators': np.arange(1, 202, 10)}
    # param_grid = {'n_estimators': [200], 'criterion': ['gini', 'entropy']}
    # param_grid = {'n_estimators': [200], 'max_features': np.append(np.arange(28-20, 28, 1), np.arange(28, 28+20, 1))}
    # param_grid = {'n_estimators': [200], 'max_depth': np.arange(40, 40+20, 1)}
    # param_grid = {'n_estimators': [200], 'min_samples_split': np.arange(2, 2+10, 1)}
    # param_grid = {'n_estimators': [200], 'min_samples_leaf': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [200], 'max_leaf_nodes': np.arange(3000, 3000+1000, 100)}

    searcher = GridSearchCV(classifier, n_jobs=-1, param_grid=param_grid)

    return searcher

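# Every complib module exposes the same entry points, which kaggle.py looks
# up by competition name: model(), loadTrainSet(), loadTestSet() and
# saveSubmission(). By convention (taken from how kaggle.py consumes the
# arrays) a train set carries the label in its LAST column and a test set
# carries the row id in its FIRST column:
#     X, y = trainSet[:, :-1], trainSet[:, -1]
#     idList, X = testSet[:, 0], testSet[:, 1:]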
def loadTrainSet(filepath):
    raw = np.loadtxt(filepath, delimiter=',', dtype=np.str, skiprows=1)
    X, y = raw[:,1:], raw[:,0]
    trainSet = np.hstack((X, y.reshape(-1,1)))
    return trainSet

def loadTestSet(filepath):
    raw = np.loadtxt(filepath, delimiter=',', dtype=np.str, skiprows=1)
    testSet = np.hstack((np.arange(1, raw.shape[0]+1).reshape(-1,1), raw))
    return testSet

def saveSubmission(filepath, idList, y):
    result = np.vstack((idList.astype(np.int64), y.astype(np.int64))).T
    np.savetxt(filepath, result, fmt='%d', delimiter=',', header='ImageId,Label', comments='')

--------------------------------------------------------------------------------
/complib/hackathon3x.py:
--------------------------------------------------------------------------------
from time import strptime, localtime
import numpy as np
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from ple import FeatureUnionExt, PipelineExt

now = localtime()
_gender_map = {'Female': '0', 'Male': '1', '': 'nan'}
_bool_map = {'N': '0', 'Y': '1', '': 'nan'}
_var1_map = {'HAVC': 0, 'HAXA': 1, 'HAXB': 2, 'HAXC': 3, 'HAXF': 4, 'HAXM': 5, 'HAYT': 6, 'HAZD': 7, 'HBXA': 8, 'HBXB': 9, 'HBXC': 10, 'HBXD': 11, 'HBXH': 12, 'HBXX': 13, 'HCXD': 14, 'HCXF': 15, 'HCXG': 16, 'HCYS': 17, 'HVYS': 18}
_device_type_map = {'Mobile': '0', 'Web-browser': '1', '': 'nan'}
_var2_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, '': 'nan'}
_source_map = {'S122': 0, 'S133': 1, '': 'nan'}
_miss_value = lambda x: 'nan' if x == '' else x

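# The commented grids in model() record a stepwise tuning run: n_estimators
# and learning_rate are swept first, then the tree-shape parameters
# (max_depth, min_samples_split, min_samples_leaf), then subsample and
# max_features, each sweep holding the previously chosen values fixed.
# Reading them this way is an inference from the order of the sweeps; only
# the uncommented grid below is actually used.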
def model(*argList, **argDict):
    classifier = GradientBoostingClassifier(verbose=1)

    # A single fully specified setting; exactly one grid must be active,
    # otherwise param_grid is undefined when GridSearchCV is built.
    param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [1200], 'min_samples_leaf': [60], 'max_depth': [9], 'max_features': [7], 'subsample': [0.8]}

    # param_grid = {'n_estimators': np.arange(50, 50+50, 10), 'learning_rate': np.arange(0.01, 0.01+0.2, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'max_depth': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': np.arange(2, 2+1000, 100)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': np.arange(2, 2+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'max_leaf_nodes': np.arange(2, 2+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'min_samples_split': np.arange(2, 2+3000, 100)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': [4], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': [4], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': [4], 'subsample': [0.77], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [202], 'min_samples_leaf': np.arange(2, 2+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [202], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [202], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': np.arange(1, 1+19, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': np.arange(1, 1+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': [51], 'min_samples_split': np.arange(2, 2+1500, 100)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': [51], 'min_samples_split': [402], 'max_depth': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': [51], 'min_samples_split': np.arange(2, 2+1500, 100), 'max_depth': [7]}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': np.arange(1, 1+100, 1), 'min_samples_split': [1202], 'max_depth': [7]}

    searcher = GridSearchCV(classifier, n_jobs=-1, scoring='roc_auc', param_grid=param_grid)

    return searcher

def _transfer(X):
    imputers = [['Imputer_{i}'.format(i=i), Imputer()] for i in range(19)]
    # Categorical columns are imputed with the mode rather than the mean.
    for i in (0, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17):
        imputers[i][1] = Imputer(strategy='most_frequent')
    step1 = ('FeatureUnionExt', FeatureUnionExt(transformer_list=imputers, idx_list=[[i] for i in range(19)]))
    step2 = ('OneHotEncoder', OneHotEncoder(categorical_features=[0, 6, 8, 14, 15, 16, 17], sparse=False))
    step3 = ('StandardScaler', StandardScaler())
    pipeline = PipelineExt(steps=[step1, step2, step3])
    X = pipeline.fit_transform(X)
    return X

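# loadTrainSet turns the raw CSV into an all-float matrix: categorical
# strings go through the *_map dictionaries, the date in column 4 becomes an
# age in years relative to "now", columns 14-18 collapse to a
# present/missing flag, and empty fields become 'nan' so the Imputer in
# _transfer can fill them in later.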
def loadTrainSet(filepath):
    converters = dict([(i, _miss_value) for i in range(26)])
    converters[1] = lambda x: _gender_map[x]
    converters[4] = lambda x: now.tm_year - strptime(x, '%d-%b-%y').tm_year
    converters[11] = converters[19] = lambda x: _bool_map[x]
    converters[13] = lambda x: _var1_map[x]
    converters[14] = converters[15] = converters[16] = converters[17] = converters[18] = lambda x: 1 if len(x.strip()) == 0 else 0
    converters[20] = lambda x: _device_type_map[x]
    converters[21] = lambda x: _var2_map[x]
    converters[22] = lambda x: _source_map.get(x, '2')

    usecols = [1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25]
    raw = np.loadtxt(filepath, delimiter=',', usecols=usecols, converters=converters, dtype=np.str, skiprows=1)
    raw = raw.astype(np.float64)
    X, y = raw[:,:-1], raw[:,-1]
    X = _transfer(X)

    trainSet = np.hstack((X, y.reshape(-1,1)))
    return trainSet

def loadTestSet(filepath):
    raise NotImplementedError('hackathon3x has no test-set loader')

def saveSubmission(filepath, idList, y):
    raise NotImplementedError('hackathon3x has no submission writer')

--------------------------------------------------------------------------------
/complib/titanic.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from ple import FeatureUnionExt, PipelineExt

_sex_map = {'female': '0', 'male': '1', '': 'nan'}
_embark_map = {'C': '0', 'Q': '1', 'S': '2', '': 'nan'}
_miss_value = lambda x: 'nan' if x == '' else x

def model(*argList, **argDict):
    classifier = GradientBoostingClassifier(verbose=1)
    # Earlier single-parameter sweeps, kept for reference:
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': np.arange(50, 50+200, 10), 'learning_rate': np.arange(0.01, 0.01+0.2, 0.01)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'max_depth': np.arange(1, 1+20, 1)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'min_samples_split': np.arange(2, 2+20, 2)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'min_samples_leaf': np.arange(1, 1+20, 2)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'subsample': np.arange(0.1, 0.1+1, 0.1)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'max_leaf_nodes': np.arange(20, 20+20, 2)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'max_features': np.arange(1, 1+7, 1)})

    searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'min_samples_leaf': np.arange(1, 1+20, 2), 'max_features': np.arange(1, 1+7, 1)})

    return searcher

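# Caveat: _transfer fits the preprocessing pipeline from scratch on whatever
# array it receives, so loadTrainSet and loadTestSet end up imputing and
# scaling with statistics computed from different data. A stricter setup
# would fit the pipeline on the train set only and reuse it on the test set.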
def _transfer(X):
    imputers = [['Imputer_{i}'.format(i=i), Imputer()] for i in range(7)]
    # Sex and Embarked are categorical, so impute them with the mode.
    imputers[1][1] = imputers[6][1] = Imputer(strategy='most_frequent')
    step1 = ('FeatureUnionExt', FeatureUnionExt(transformer_list=imputers, idx_list=[[i] for i in range(7)]))
    step2 = ('OneHotEncoder', OneHotEncoder(categorical_features=[6], sparse=False))
    step3 = ('StandardScaler', StandardScaler())
    pipeline = PipelineExt(steps=[step1, step2, step3])
    X = pipeline.fit_transform(X)
    return X

def loadTrainSet(filepath):
    # Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
    converters = dict([(i, _miss_value) for i in range(12)])
    converters[4] = lambda x: _sex_map[x]
    converters[11] = lambda x: _embark_map[x]

    raw = np.loadtxt(filepath, delimiter=',', usecols=[1, 2, 4, 5, 6, 7, 9, 11], converters=converters, dtype=np.str, skiprows=1)
    raw = raw.astype(np.float64)
    X, y = raw[:,1:], raw[:,0]
    X = _transfer(X)
    print X.shape

    trainSet = np.hstack((X, y.reshape(-1,1)))
    return trainSet

def loadTestSet(filepath):
    # Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
    converters = dict([(i, _miss_value) for i in range(11)])
    converters[3] = lambda x: _sex_map[x]
    converters[10] = lambda x: _embark_map[x]

    raw = np.loadtxt(filepath, delimiter=',', usecols=[0, 1, 3, 4, 5, 6, 8, 10], converters=converters, dtype=np.str, skiprows=1)
    idList, X = raw[:,0], raw[:,1:].astype(np.float64)
    X = _transfer(X)

    testSet = np.hstack((idList.reshape(-1,1), X))
    return testSet

def saveSubmission(filepath, idList, y):
    result = np.vstack((idList, y.astype(np.int64))).T
    np.savetxt(filepath, result, fmt='%s', delimiter=',', header='PassengerId,Survived', comments='')

--------------------------------------------------------------------------------
/complib/toy.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_boston

def model(*argList, **argDict):
    regressor = RandomForestRegressor()
    # searcher = GridSearchCV(regressor, param_grid={'n_estimators': np.arange(10, 20), 'max_features': np.arange(0.1, 1.0, 0.1)})
    searcher = GridSearchCV(regressor, param_grid={'n_estimators': np.arange(10, 100, 10), 'max_depth': np.arange(50, 100, 5)})
    return searcher

def loadTrainSet(filepath):
    # The Boston housing data ships with sklearn, so filepath is ignored.
    boston = load_boston()
    trainSet = np.hstack((boston.data, boston.target.reshape((-1,1))))
    return trainSet

def loadTestSet(filepath):
    raise NotImplementedError('the toy data set has no test set')

def saveSubmission(filepath, idList, y):
    # Renamed from saveSubmissor and given the standard (filepath, idList, y)
    # signature so the module matches the interface kaggle.py expects.
    raise NotImplementedError('the toy data set has no test set')

--------------------------------------------------------------------------------
/kaggle.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python
from os import listdir
from os.path import basename, splitext
from argparse import ArgumentParser
import numpy as np
from sklearn.externals import joblib
from sklearn.grid_search import BaseSearchCV
import matplotlib
matplotlib.use('Agg')  # headless backend; figures are only written to disk
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import complib

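# fit() and predict() cache the parsed data sets with joblib under
# dump/<competition>/train.dmp and test.dmp, so CSV parsing and
# preprocessing run only once; delete the .dmp files to force a reload.
# Fitted models land in the same directory under the user-supplied label.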
def fit(competition, label, train):
    lib = getattr(complib, competition)

    model = lib.model()
    try:
        trainSet = joblib.load('dump/{competition}/train.dmp'.format(competition=competition))
    except IOError, e:
        trainSet = lib.loadTrainSet('data/{competition}/{train}'.format(competition=competition, train=train))
        joblib.dump(trainSet, 'dump/{competition}/train.dmp'.format(competition=competition), compress=3)

    X, y = (trainSet[:,:-1].astype(np.float64), trainSet[:,-1].astype(np.float64))
    model.fit(X, y)
    joblib.dump(model, 'dump/{competition}/{label}.dmp'.format(competition=competition, label=label), compress=3)

def predict(competition, label, test, submission):
    lib = getattr(complib, competition)

    model = joblib.load('dump/{competition}/{label}.dmp'.format(competition=competition, label=label))
    try:
        testSet = joblib.load('dump/{competition}/test.dmp'.format(competition=competition))
    except IOError, e:
        testSet = lib.loadTestSet('data/{competition}/{test}'.format(competition=competition, test=test))
        joblib.dump(testSet, 'dump/{competition}/test.dmp'.format(competition=competition), compress=3)
    idList, X = testSet[:,0], testSet[:,1:].astype(np.float64)
    y = model.predict(X)

    lib.saveSubmission('data/{competition}/{submission}'.format(competition=competition, submission=submission), idList, y)

def analyze(competition, label, train):
    # lib and train were previously undefined here, which made the cache-miss
    # path fail with a NameError; analyze now takes the train file explicitly.
    lib = getattr(complib, competition)

    model = joblib.load('dump/{competition}/{label}.dmp'.format(competition=competition, label=label))
    assert(isinstance(model, BaseSearchCV))
    print 'Best Score:{best_score}, Best Params:{best_params}'.format(best_score=model.best_score_, best_params=model.best_params_)

    dynamicParamAndType = _getDynamicParamAndType(model)
    n_grid_scores = len(model.grid_scores_)
    if n_grid_scores > 0:
        n_params = len(dynamicParamAndType)
        if n_params in (1, 2):
            if n_params == 1:
                param, paramType = dynamicParamAndType[0]
                _singleParamAnalyze(plt, model, param, paramType)
            elif n_params == 2:
                param1, paramType1 = dynamicParamAndType[0]
                param2, paramType2 = dynamicParamAndType[1]
                _coupleParamAnalyze(plt, model, (param1, param2), (paramType1, paramType2))
            plt.savefig('pic/{competition}/{label}.png'.format(competition=competition, label=label))
            plt.close()
    try:
        trainSet = joblib.load('dump/{competition}/train.dmp'.format(competition=competition))
    except IOError, e:
        trainSet = lib.loadTrainSet('data/{competition}/{train}'.format(competition=competition, train=train))
        joblib.dump(trainSet, 'dump/{competition}/train.dmp'.format(competition=competition), compress=3)

    X, y = (trainSet[:,:-1].astype(np.float64), trainSet[:,-1].astype(np.float64))
    print 'Score on Train:{score}'.format(score=model.score(X, y))

def _getDynamicParamAndType(model):
    # A "dynamic" parameter is one whose grid holds more than a single value.
    param_grid = getattr(model, 'param_grid')
    paramList = param_grid.keys()
    dynamicParamList = filter(lambda x: len(param_grid[x]) > 1, paramList)
    paramTypeList = map(lambda x: type(param_grid[x][0]), dynamicParamList)
    return zip(dynamicParamList, paramTypeList)

def _singleParamAnalyze(plt, model, param, paramType):
    n_grid_scores = len(model.grid_scores_)
    fig, ax = plt.subplots(nrows=1, ncols=2)
    ax[0].yaxis.set_major_formatter(FormatStrFormatter('%.4f'))
    ax[1].yaxis.set_major_formatter(FormatStrFormatter('%.4f'))
    ax[1].yaxis.tick_right()

    valueList = np.array([])
    scoreList = np.array([])
    covScoreList = np.array([])

    for grid_score in model.grid_scores_:
        value = grid_score[0][param]
        score = grid_score[1]
        print 'Param:{param}={value} Score:{score}'.format(param=param, value=value, score=score)
        valueList = np.append(valueList, value)
        scoreList = np.append(scoreList, score)
        # Coefficient of variation of the CV fold scores: std / mean.
        covScoreList = np.append(covScoreList, np.std(grid_score[2]) / grid_score[1])

    if paramType is unicode:
        x_ticks = np.unique(valueList)
        x_tickDict = dict([(x_ticks[i], i+0.5) for i in range(len(x_ticks))])
        x_pos = [x_tickDict[value] for value in valueList]
        ax[0].set_xticks(x_tickDict.values())
        ax[0].set_xticklabels(x_tickDict.keys(), rotation=90)
        ax[1].set_xticks(x_tickDict.values())
        ax[1].set_xticklabels(x_tickDict.keys(), rotation=90)
    else:
        x_pos = np.arange(n_grid_scores) + 0.5
        fig.canvas.draw()
        ax[0].set_xticks(x_pos)
        ax[0].set_xticklabels(valueList, rotation=90)
        ax[1].set_xticks(x_pos)
        ax[1].set_xticklabels(valueList, rotation=90)

    ax[0].plot(x_pos, scoreList, '-')
    ax[1].plot(x_pos, covScoreList, '-')
    ax[0].set_title('Accuracy@\'{param}\''.format(param=param))
    ax[1].set_title('COV Accuracy@\'{param}\''.format(param=param))
    ax[0].set_xlabel(param)
    ax[1].set_xlabel(param)
    ax[0].set_ylabel('Accuracy')
    ax[1].set_ylabel('COV Accuracy')

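# _coupleParamAnalyze renders the 2-D score grid as a grayscale image; the
# scores are inverted (image = 1 - scores) before plotting, so the
# best-scoring cells come out darkest under the gray colormap.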
def _coupleParamAnalyze(plt, model, params, paramsType):
    n_grid_scores = len(model.grid_scores_)
    param1, param2 = params
    paramType1, paramType2 = paramsType

    param_grid = getattr(model, 'param_grid')
    n_x_values = len(param_grid[param1])
    n_y_values = len(param_grid[param2])
    param_x_pos = dict(zip(param_grid[param1], range(n_x_values)))
    param_y_pos = dict(zip(param_grid[param2], range(n_y_values)))
    param_shape = (n_y_values, n_x_values)

    scores = np.zeros(param_shape)

    for grid_score in model.grid_scores_:
        value1 = grid_score[0][param1]
        value2 = grid_score[0][param2]
        score = grid_score[1]
        print 'Param:{param1}={value1} {param2}={value2} Score:{score}'.format(param1=param1, value1=value1, param2=param2, value2=value2, score=score)
        scores[param_y_pos[value2], param_x_pos[value1]] = score

    # Positions of the per-row and per-column maxima (computed but unused).
    max_x = np.max(scores, axis=1)
    max_y = np.max(scores, axis=0)
    idx_max_x = np.dot((scores == max_x.reshape((-1, 1))), np.arange(n_x_values))
    idx_max_y = np.dot(np.arange(n_y_values), (scores == max_y.reshape((1, -1))))

    image = 1 - scores

    plt.xticks(param_x_pos.values(), param_x_pos.keys(), rotation=90)
    plt.yticks(param_y_pos.values(), param_y_pos.keys())

    plt.imshow(image, cmap=plt.cm.gray, interpolation='nearest')
    plt.xlabel(param1)
    plt.ylabel(param2)
    plt.title('How {param1} and {param2} Affect Accuracy'.format(param1=param1, param2=param2))

def listall(competition):
    fileList = listdir('dump/{competition}/'.format(competition=competition))
    for filepath in fileList:
        label, ext = splitext(basename(filepath))
        if label not in ('train', 'test') and ext == '.dmp':
            print label

def main():
    parser = ArgumentParser(description='Practice For Kaggle')
    parser.add_argument('action', action='store', choices=('fit', 'analyze', 'predict', 'list'), help='Action')
    parser.add_argument('competition', action='store', help='Competition (a complib module name)')
    parser.add_argument('-l', action='store', dest='label', default='default', help='Label')
    parser.add_argument('--train', action='store', dest='train', default='train.csv', help='Train Set File')
    parser.add_argument('--test', action='store', dest='test', default='test.csv', help='Test Set File')
    parser.add_argument('--submission', action='store', dest='submission', default='submission.csv', help='Submission File')

    args = parser.parse_args()

    # 'train' and 'test' are reserved for the cached data-set dumps.
    assert(args.label not in ('train', 'test'))
    if args.action == 'fit':
        fit(args.competition, args.label, args.train)
    elif args.action == 'analyze':
        analyze(args.competition, args.label, args.train)
    elif args.action == 'predict':
        predict(args.competition, args.label, args.test, args.submission)
    elif args.action == 'list':
        listall(args.competition)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
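Usage sketch (inferred from the argparse setup in main(); the label 'gbdt' is
an arbitrary example, and the data/<competition>/, dump/<competition>/ and
pic/<competition>/ directories must already exist, since the script reads and
writes them without creating them):

    python kaggle.py fit titanic -l gbdt --train train.csv
    python kaggle.py analyze titanic -l gbdt
    python kaggle.py predict titanic -l gbdt --test test.csv --submission submission.csv
    python kaggle.py list titanic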