├── .gitignore
├── README.md
├── complib
│   ├── __init__.py
│   ├── digit_recognizer.py
│   ├── hackathon3x.py
│   ├── titanic.py
│   └── toy.py
└── kaggle.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# data
data/

# dump
dump/

# submission
submission/

# log
log/

# pic
pic/

# ple
ple/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kaggle
Hands-on Kaggle practice

--------------------------------------------------------------------------------
/complib/__init__.py:
--------------------------------------------------------------------------------
import digit_recognizer, titanic, toy, hackathon3x

__all__ = ['digit_recognizer', 'titanic', 'toy', 'hackathon3x']

--------------------------------------------------------------------------------
/complib/digit_recognizer.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

def model(*argList, **argDict):
    classifier = RandomForestClassifier(verbose=2, n_jobs=-1)

    # Exactly one grid must be active, otherwise param_grid is undefined when
    # GridSearchCV is built below; the commented grids are the other
    # single-parameter sweeps used during tuning.
    param_grid = {'n_estimators': np.arange(1, 202, 10)}
    # param_grid = {'n_estimators': [200], 'criterion': ['gini', 'entropy']}
    # param_grid = {'n_estimators': [200], 'max_features': np.append(np.arange(28-20, 28, 1), np.arange(28, 28+20, 1))}
    # param_grid = {'n_estimators': [200], 'max_depth': np.arange(40, 40+20, 1)}
    # param_grid = {'n_estimators': [200], 'min_samples_split': np.arange(2, 2+10, 1)}
    # param_grid = {'n_estimators': [200], 'min_samples_leaf': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [200], 'max_leaf_nodes': np.arange(3000, 3000+1000, 100)}

    searcher = GridSearchCV(classifier, n_jobs=-1, param_grid=param_grid)

    return searcher

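# Every complib module exposes the same entry points, which kaggle.py looks
# up by competition name: model(), loadTrainSet(), loadTestSet() and
# saveSubmission(). By convention (taken from how kaggle.py consumes the
# arrays) a train set carries the label in its LAST column and a test set
# carries the row id in its FIRST column:
#     X, y = trainSet[:, :-1], trainSet[:, -1]
#     idList, X = testSet[:, 0], testSet[:, 1:]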
def loadTrainSet(filepath):
    raw = np.loadtxt(filepath, delimiter=',', dtype=np.str, skiprows=1)
    X, y = raw[:,1:], raw[:,0]
    trainSet = np.hstack((X, y.reshape(-1,1)))
    return trainSet

def loadTestSet(filepath):
    raw = np.loadtxt(filepath, delimiter=',', dtype=np.str, skiprows=1)
    testSet = np.hstack((np.arange(1, raw.shape[0]+1).reshape(-1,1), raw))
    return testSet

def saveSubmission(filepath, idList, y):
    result = np.vstack((idList.astype(np.int64), y.astype(np.int64))).T
    np.savetxt(filepath, result, fmt='%d', delimiter=',', header='ImageId,Label', comments='')

--------------------------------------------------------------------------------
/complib/hackathon3x.py:
--------------------------------------------------------------------------------
from time import strptime, localtime
import numpy as np
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from ple import FeatureUnionExt, PipelineExt

now = localtime()
_gender_map = {'Female': '0', 'Male': '1', '': 'nan'}
_bool_map = {'N': '0', 'Y': '1', '': 'nan'}
_var1_map = {'HAVC': 0, 'HAXA': 1, 'HAXB': 2, 'HAXC': 3, 'HAXF': 4, 'HAXM': 5, 'HAYT': 6, 'HAZD': 7, 'HBXA': 8, 'HBXB': 9, 'HBXC': 10, 'HBXD': 11, 'HBXH': 12, 'HBXX': 13, 'HCXD': 14, 'HCXF': 15, 'HCXG': 16, 'HCYS': 17, 'HVYS': 18}
_device_type_map = {'Mobile': '0', 'Web-browser': '1', '': 'nan'}
_var2_map = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, '': 'nan'}
_source_map = {'S122': 0, 'S133': 1, '': 'nan'}
_miss_value = lambda x: 'nan' if x == '' else x

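# The commented grids in model() record a stepwise tuning run: n_estimators
# and learning_rate are swept first, then the tree-shape parameters
# (max_depth, min_samples_split, min_samples_leaf), then subsample and
# max_features, each sweep holding the previously chosen values fixed.
# Reading them this way is an inference from the order of the sweeps; only
# the uncommented grid below is actually used.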
def model(*argList, **argDict):
    classifier = GradientBoostingClassifier(verbose=1)

    # A single fully specified setting; exactly one grid must be active,
    # otherwise param_grid is undefined when GridSearchCV is built.
    param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [1200], 'min_samples_leaf': [60], 'max_depth': [9], 'max_features': [7], 'subsample': [0.8]}

    # param_grid = {'n_estimators': np.arange(50, 50+50, 10), 'learning_rate': np.arange(0.01, 0.01+0.2, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'max_depth': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': np.arange(2, 2+1000, 100)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': np.arange(2, 2+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'max_leaf_nodes': np.arange(2, 2+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'min_samples_split': np.arange(2, 2+3000, 100)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': [4], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': [4], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_leaf': [12], 'max_depth': [4], 'subsample': [0.77], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [202], 'min_samples_leaf': np.arange(2, 2+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [202], 'subsample': np.arange(0.7, 0.7+0.29, 0.01)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'min_samples_split': [202], 'max_features': np.arange(1, 1+19, 1)}

    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': np.arange(1, 1+19, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': np.arange(1, 1+100, 10)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': [51], 'min_samples_split': np.arange(2, 2+1500, 100)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': [51], 'min_samples_split': [402], 'max_depth': np.arange(1, 1+10, 1)}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': [51], 'min_samples_split': np.arange(2, 2+1500, 100), 'max_depth': [7]}
    # param_grid = {'n_estimators': [60], 'learning_rate': [0.1], 'subsample': [0.84], 'max_features': [11], 'min_samples_leaf': np.arange(1, 1+100, 1), 'min_samples_split': [1202], 'max_depth': [7]}

    searcher = GridSearchCV(classifier, n_jobs=-1, scoring='roc_auc', param_grid=param_grid)

    return searcher

def _transfer(X):
    imputers = [['Imputer_{i}'.format(i=i), Imputer()] for i in range(19)]
    # Categorical columns are imputed with the mode rather than the mean.
    for i in (0, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17):
        imputers[i][1] = Imputer(strategy='most_frequent')
    step1 = ('FeatureUnionExt', FeatureUnionExt(transformer_list=imputers, idx_list=[[i] for i in range(19)]))
    step2 = ('OneHotEncoder', OneHotEncoder(categorical_features=[0, 6, 8, 14, 15, 16, 17], sparse=False))
    step3 = ('StandardScaler', StandardScaler())
    pipeline = PipelineExt(steps=[step1, step2, step3])
    X = pipeline.fit_transform(X)
    return X

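# loadTrainSet turns the raw CSV into an all-float matrix: categorical
# strings go through the *_map dictionaries, the date in column 4 becomes an
# age in years relative to "now", columns 14-18 collapse to a
# present/missing flag, and empty fields become 'nan' so the Imputer in
# _transfer can fill them in later.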
def loadTrainSet(filepath):
    converters = dict([(i, _miss_value) for i in range(26)])
    converters[1] = lambda x: _gender_map[x]
    converters[4] = lambda x: now.tm_year - strptime(x, '%d-%b-%y').tm_year
    converters[11] = converters[19] = lambda x: _bool_map[x]
    converters[13] = lambda x: _var1_map[x]
    converters[14] = converters[15] = converters[16] = converters[17] = converters[18] = lambda x: 1 if len(x.strip()) == 0 else 0
    converters[20] = lambda x: _device_type_map[x]
    converters[21] = lambda x: _var2_map[x]
    converters[22] = lambda x: _source_map.get(x, '2')

    usecols = [1, 3, 4, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25]
    raw = np.loadtxt(filepath, delimiter=',', usecols=usecols, converters=converters, dtype=np.str, skiprows=1)
    raw = raw.astype(np.float64)
    X, y = raw[:,:-1], raw[:,-1]
    X = _transfer(X)

    trainSet = np.hstack((X, y.reshape(-1,1)))
    return trainSet

def loadTestSet(filepath):
    raise NotImplementedError('hackathon3x has no test-set loader')

def saveSubmission(filepath, idList, y):
    raise NotImplementedError('hackathon3x has no submission writer')

--------------------------------------------------------------------------------
/complib/titanic.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
from ple import FeatureUnionExt, PipelineExt

_sex_map = {'female': '0', 'male': '1', '': 'nan'}
_embark_map = {'C': '0', 'Q': '1', 'S': '2', '': 'nan'}
_miss_value = lambda x: 'nan' if x == '' else x

def model(*argList, **argDict):
    classifier = GradientBoostingClassifier(verbose=1)
    # Earlier single-parameter sweeps, kept for reference:
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': np.arange(50, 50+200, 10), 'learning_rate': np.arange(0.01, 0.01+0.2, 0.01)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'max_depth': np.arange(1, 1+20, 1)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'min_samples_split': np.arange(2, 2+20, 2)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'min_samples_leaf': np.arange(1, 1+20, 2)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'subsample': np.arange(0.1, 0.1+1, 0.1)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'max_leaf_nodes': np.arange(20, 20+20, 2)})
    # searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'max_features': np.arange(1, 1+7, 1)})

    searcher = GridSearchCV(classifier, param_grid={'n_estimators': [181], 'learning_rate': [0.08], 'min_samples_leaf': np.arange(1, 1+20, 2), 'max_features': np.arange(1, 1+7, 1)})

    return searcher

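# Caveat: _transfer fits the preprocessing pipeline from scratch on whatever
# array it receives, so loadTrainSet and loadTestSet end up imputing and
# scaling with statistics computed from different data. A stricter setup
# would fit the pipeline on the train set only and reuse it on the test set.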
def _transfer(X):
    imputers = [['Imputer_{i}'.format(i=i), Imputer()] for i in range(7)]
    # Sex and Embarked are categorical, so impute them with the mode.
    imputers[1][1] = imputers[6][1] = Imputer(strategy='most_frequent')
    step1 = ('FeatureUnionExt', FeatureUnionExt(transformer_list=imputers, idx_list=[[i] for i in range(7)]))
    step2 = ('OneHotEncoder', OneHotEncoder(categorical_features=[6], sparse=False))
    step3 = ('StandardScaler', StandardScaler())
    pipeline = PipelineExt(steps=[step1, step2, step3])
    X = pipeline.fit_transform(X)
    return X

def loadTrainSet(filepath):
    # Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
    converters = dict([(i, _miss_value) for i in range(12)])
    converters[4] = lambda x: _sex_map[x]
    converters[11] = lambda x: _embark_map[x]

    raw = np.loadtxt(filepath, delimiter=',', usecols=[1, 2, 4, 5, 6, 7, 9, 11], converters=converters, dtype=np.str, skiprows=1)
    raw = raw.astype(np.float64)
    X, y = raw[:,1:], raw[:,0]
    X = _transfer(X)
    print X.shape

    trainSet = np.hstack((X, y.reshape(-1,1)))
    return trainSet

def loadTestSet(filepath):
    # Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
    converters = dict([(i, _miss_value) for i in range(11)])
    converters[3] = lambda x: _sex_map[x]
    converters[10] = lambda x: _embark_map[x]

    raw = np.loadtxt(filepath, delimiter=',', usecols=[0, 1, 3, 4, 5, 6, 8, 10], converters=converters, dtype=np.str, skiprows=1)
    idList, X = raw[:,0], raw[:,1:].astype(np.float64)
    X = _transfer(X)

    testSet = np.hstack((idList.reshape(-1,1), X))
    return testSet

def saveSubmission(filepath, idList, y):
    result = np.vstack((idList, y.astype(np.int64))).T
    np.savetxt(filepath, result, fmt='%s', delimiter=',', header='PassengerId,Survived', comments='')

--------------------------------------------------------------------------------
/complib/toy.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_boston

def model(*argList, **argDict):
    regressor = RandomForestRegressor()
    # searcher = GridSearchCV(regressor, param_grid={'n_estimators': np.arange(10, 20), 'max_features': np.arange(0.1, 1.0, 0.1)})
    searcher = GridSearchCV(regressor, param_grid={'n_estimators': np.arange(10, 100, 10), 'max_depth': np.arange(50, 100, 5)})
    return searcher

def loadTrainSet(filepath):
    # The Boston housing data ships with sklearn, so filepath is ignored.
    boston = load_boston()
    trainSet = np.hstack((boston.data, boston.target.reshape((-1,1))))
    return trainSet

def loadTestSet(filepath):
    raise NotImplementedError('the toy data set has no test set')

def saveSubmission(filepath, idList, y):
    # Renamed from saveSubmissor and given the standard (filepath, idList, y)
    # signature so the module matches the interface kaggle.py expects.
    raise NotImplementedError('the toy data set has no test set')

--------------------------------------------------------------------------------
/kaggle.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python
from os import listdir
from os.path import basename, splitext
from argparse import ArgumentParser
import numpy as np
from sklearn.externals import joblib
from sklearn.grid_search import BaseSearchCV
import matplotlib
matplotlib.use('Agg')  # headless backend; figures are only written to disk
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import complib

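# fit() and predict() cache the parsed data sets with joblib under
# dump/<competition>/train.dmp and test.dmp, so CSV parsing and
# preprocessing run only once; delete the .dmp files to force a reload.
# Fitted models land in the same directory under the user-supplied label.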
def fit(competition, label, train):
    lib = getattr(complib, competition)

    model = lib.model()
    try:
        trainSet = joblib.load('dump/{competition}/train.dmp'.format(competition=competition))
    except IOError, e:
        trainSet = lib.loadTrainSet('data/{competition}/{train}'.format(competition=competition, train=train))
        joblib.dump(trainSet, 'dump/{competition}/train.dmp'.format(competition=competition), compress=3)

    X, y = (trainSet[:,:-1].astype(np.float64), trainSet[:,-1].astype(np.float64))
    model.fit(X, y)
    joblib.dump(model, 'dump/{competition}/{label}.dmp'.format(competition=competition, label=label), compress=3)

def predict(competition, label, test, submission):
    lib = getattr(complib, competition)

    model = joblib.load('dump/{competition}/{label}.dmp'.format(competition=competition, label=label))
    try:
        testSet = joblib.load('dump/{competition}/test.dmp'.format(competition=competition))
    except IOError, e:
        testSet = lib.loadTestSet('data/{competition}/{test}'.format(competition=competition, test=test))
        joblib.dump(testSet, 'dump/{competition}/test.dmp'.format(competition=competition), compress=3)
    idList, X = testSet[:,0], testSet[:,1:].astype(np.float64)
    y = model.predict(X)

    lib.saveSubmission('data/{competition}/{submission}'.format(competition=competition, submission=submission), idList, y)

def analyze(competition, label, train):
    # lib and train were previously undefined here, which made the cache-miss
    # path fail with a NameError; analyze now takes the train file explicitly.
    lib = getattr(complib, competition)

    model = joblib.load('dump/{competition}/{label}.dmp'.format(competition=competition, label=label))
    assert(isinstance(model, BaseSearchCV))
    print 'Best Score:{best_score}, Best Params:{best_params}'.format(best_score=model.best_score_, best_params=model.best_params_)

    dynamicParamAndType = _getDynamicParamAndType(model)
    n_grid_scores = len(model.grid_scores_)
    if n_grid_scores > 0:
        n_params = len(dynamicParamAndType)
        if n_params in (1, 2):
            if n_params == 1:
                param, paramType = dynamicParamAndType[0]
                _singleParamAnalyze(plt, model, param, paramType)
            elif n_params == 2:
                param1, paramType1 = dynamicParamAndType[0]
                param2, paramType2 = dynamicParamAndType[1]
                _coupleParamAnalyze(plt, model, (param1, param2), (paramType1, paramType2))
            plt.savefig('pic/{competition}/{label}.png'.format(competition=competition, label=label))
            plt.close()
    try:
        trainSet = joblib.load('dump/{competition}/train.dmp'.format(competition=competition))
    except IOError, e:
        trainSet = lib.loadTrainSet('data/{competition}/{train}'.format(competition=competition, train=train))
        joblib.dump(trainSet, 'dump/{competition}/train.dmp'.format(competition=competition), compress=3)

    X, y = (trainSet[:,:-1].astype(np.float64), trainSet[:,-1].astype(np.float64))
    print 'Score on Train:{score}'.format(score=model.score(X, y))

def _getDynamicParamAndType(model):
    # A "dynamic" parameter is one whose grid holds more than a single value.
    param_grid = getattr(model, 'param_grid')
    paramList = param_grid.keys()
    dynamicParamList = filter(lambda x: len(param_grid[x]) > 1, paramList)
    paramTypeList = map(lambda x: type(param_grid[x][0]), dynamicParamList)
    return zip(dynamicParamList, paramTypeList)

def _singleParamAnalyze(plt, model, param, paramType):
    n_grid_scores = len(model.grid_scores_)
    fig, ax = plt.subplots(nrows=1, ncols=2)
    ax[0].yaxis.set_major_formatter(FormatStrFormatter('%.4f'))
    ax[1].yaxis.set_major_formatter(FormatStrFormatter('%.4f'))
    ax[1].yaxis.tick_right()

    valueList = np.array([])
    scoreList = np.array([])
    covScoreList = np.array([])

    for grid_score in model.grid_scores_:
        value = grid_score[0][param]
        score = grid_score[1]
        print 'Param:{param}={value} Score:{score}'.format(param=param, value=value, score=score)
        valueList = np.append(valueList, value)
        scoreList = np.append(scoreList, score)
        # Coefficient of variation of the CV fold scores: std / mean.
        covScoreList = np.append(covScoreList, np.std(grid_score[2]) / grid_score[1])

    if paramType is unicode:
        x_ticks = np.unique(valueList)
        x_tickDict = dict([(x_ticks[i], i+0.5) for i in range(len(x_ticks))])
        x_pos = [x_tickDict[value] for value in valueList]
        ax[0].set_xticks(x_tickDict.values())
        ax[0].set_xticklabels(x_tickDict.keys(), rotation=90)
        ax[1].set_xticks(x_tickDict.values())
        ax[1].set_xticklabels(x_tickDict.keys(), rotation=90)
    else:
        x_pos = np.arange(n_grid_scores) + 0.5
        fig.canvas.draw()
        ax[0].set_xticks(x_pos)
        ax[0].set_xticklabels(valueList, rotation=90)
        ax[1].set_xticks(x_pos)
        ax[1].set_xticklabels(valueList, rotation=90)

    ax[0].plot(x_pos, scoreList, '-')
    ax[1].plot(x_pos, covScoreList, '-')
    ax[0].set_title('Accuracy@\'{param}\''.format(param=param))
    ax[1].set_title('COV Accuracy@\'{param}\''.format(param=param))
    ax[0].set_xlabel(param)
    ax[1].set_xlabel(param)
    ax[0].set_ylabel('Accuracy')
    ax[1].set_ylabel('COV Accuracy')

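# _coupleParamAnalyze renders the 2-D score grid as a grayscale image; the
# scores are inverted (image = 1 - scores) before plotting, so the
# best-scoring cells come out darkest under the gray colormap.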
def _coupleParamAnalyze(plt, model, params, paramsType):
    n_grid_scores = len(model.grid_scores_)
    param1, param2 = params
    paramType1, paramType2 = paramsType

    param_grid = getattr(model, 'param_grid')
    n_x_values = len(param_grid[param1])
    n_y_values = len(param_grid[param2])
    param_x_pos = dict(zip(param_grid[param1], range(n_x_values)))
    param_y_pos = dict(zip(param_grid[param2], range(n_y_values)))
    param_shape = (n_y_values, n_x_values)

    scores = np.zeros(param_shape)

    for grid_score in model.grid_scores_:
        value1 = grid_score[0][param1]
        value2 = grid_score[0][param2]
        score = grid_score[1]
        print 'Param:{param1}={value1} {param2}={value2} Score:{score}'.format(param1=param1, value1=value1, param2=param2, value2=value2, score=score)
        scores[param_y_pos[value2], param_x_pos[value1]] = score

    # Positions of the per-row and per-column maxima (computed but unused).
    max_x = np.max(scores, axis=1)
    max_y = np.max(scores, axis=0)
    idx_max_x = np.dot((scores == max_x.reshape((-1, 1))), np.arange(n_x_values))
    idx_max_y = np.dot(np.arange(n_y_values), (scores == max_y.reshape((1, -1))))

    image = 1 - scores

    plt.xticks(param_x_pos.values(), param_x_pos.keys(), rotation=90)
    plt.yticks(param_y_pos.values(), param_y_pos.keys())

    plt.imshow(image, cmap=plt.cm.gray, interpolation='nearest')
    plt.xlabel(param1)
    plt.ylabel(param2)
    plt.title('How {param1} and {param2} Affect Accuracy'.format(param1=param1, param2=param2))

def listall(competition):
    fileList = listdir('dump/{competition}/'.format(competition=competition))
    for filepath in fileList:
        label, ext = splitext(basename(filepath))
        if label not in ('train', 'test') and ext == '.dmp':
            print label

def main():
    parser = ArgumentParser(description='Practice For Kaggle')
    parser.add_argument('action', action='store', choices=('fit', 'analyze', 'predict', 'list'), help='Action')
    parser.add_argument('competition', action='store', help='Competition (a complib module name)')
    parser.add_argument('-l', action='store', dest='label', default='default', help='Label')
    parser.add_argument('--train', action='store', dest='train', default='train.csv', help='Train Set File')
    parser.add_argument('--test', action='store', dest='test', default='test.csv', help='Test Set File')
    parser.add_argument('--submission', action='store', dest='submission', default='submission.csv', help='Submission File')

    args = parser.parse_args()

    # 'train' and 'test' are reserved for the cached data-set dumps.
    assert(args.label not in ('train', 'test'))
    if args.action == 'fit':
        fit(args.competition, args.label, args.train)
    elif args.action == 'analyze':
        analyze(args.competition, args.label, args.train)
    elif args.action == 'predict':
        predict(args.competition, args.label, args.test, args.submission)
    elif args.action == 'list':
        listall(args.competition)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
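Usage sketch (inferred from the argparse setup in main(); the label 'gbdt' is
an arbitrary example, and the data/<competition>/, dump/<competition>/ and
pic/<competition>/ directories must already exist, since the script reads and
writes them without creating them):

    python kaggle.py fit titanic -l gbdt --train train.csv
    python kaggle.py analyze titanic -l gbdt
    python kaggle.py predict titanic -l gbdt --test test.csv --submission submission.csv
    python kaggle.py list titanic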