├── .gitignore
├── POTUS_aka_El_Presidente
│   ├── README.txt
│   ├── best_model.txt
│   ├── model_selection.R
│   ├── notes.txt
│   ├── predict.R
│   ├── python_code
│   │   ├── python_POTUS.ipynb
│   │   ├── python_POTUS_notes.ipynb
│   │   └── python_POTUS_pipeline.ipynb
│   ├── test.csv
│   └── train.csv
├── Preprocessing_note.html
├── Preprocessing_note.ipynb
├── digit_recoginition
│   ├── digit_recog_classifier_test_data.py
│   └── digit_recog_grid_search.py
├── expedia
│   └── EDA_1st_model.ipynb
├── homesite
│   ├── Boris_gradient_boost.ipynb
│   ├── initial_foray_insurance.ipynb
│   └── initial_foray_insurance_grad_boosting.ipynb
├── notes_on_ML
│   ├── K-NN_and_preprocessing.html
│   ├── K-NN_and_preprocessing.ipynb
│   ├── Logistic_regression_and_preprocessing.ipynb
│   └── Scaling_synthesized_data.ipynb
├── paribas
│   ├── README.md
│   ├── boosting_in_barbados.ipynb
│   ├── exploratory_analysis.ipynb
│   ├── extra_trees_classifier.ipynb
│   ├── paribas_I.ipynb
│   └── stratified_CV_with_xgboost.ipynb
└── wine_quality
    ├── README.txt
    ├── ipython_notebooks
    │   ├── Predicting_Wine_Quality.ipynb
    │   ├── Testing Box_Cox.ipynb
    │   ├── box_cox.py
    │   ├── explore_wine_data.ipynb
    │   ├── yeo_johnson.py
    │   └── yjscratch.py
    ├── python
    │   ├── .ipynb_checkpoints
    │   │   └── wine_notebook-checkpoint.ipynb
    │   ├── wine_classifier.py
    │   ├── wine_data.py
    │   ├── wine_explore.py
    │   ├── wine_main.py
    │   ├── wine_notebook.ipynb
    │   └── wine_preprocesser.py
    ├── winequality-red.csv
    └── winequality-white.csv
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .RData
3 | .Rhistory
4 | .project
5 | .pydevproject
6 | *.pickle
7 | *.pyc
8 | data/
9 | __init__.py
10 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/README.txt:
--------------------------------------------------------------------------------
1 | Predicting Presidential Votes Across Counties
2 |
3 | This is a typical example of a job interview challenge (usually given as a homework assignment to complete in a couple of hours): they will provide you with a training .csv, ask you to build some predictive models and choose the best one, then ask you to apply it to their test data .csv and send them the results, along with your code.
4 | Herein find:
5 | -an example training set train.csv, which contains metadata about counties and who won in that county (Obama/Romney), and a test data set test.csv;
6 | -an R script model_selection.R that builds a variety of predictive models for the problem and chooses the one that performs best on train.csv (using repeated 10-fold cross validation);
7 | -an R script predict.R that predicts the election outcome for the counties (rows) in test.csv;
8 | -notes.txt, that explains the model building process;
9 | -best_model.txt, that describes the performance of the best model.
10 |
11 | Enjoy!
12 |
13 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/best_model.txt:
--------------------------------------------------------------------------------
1 | Stochastic Gradient Boosting
2 |
3 | 1213 samples
4 | 9 predictor
5 | 2 classes: 'Barack Obama', 'Mitt Romney'
6 |
7 | Pre-processing: centered, scaled, principal component signal extraction
8 | Resampling: Cross-Validated (10 fold, repeated 10 times)
9 |
10 | Summary of sample sizes: 1092, 1092, 1091, 1092, 1091, 1092, ...
11 |
12 | Resampling results across tuning parameters:
13 |
14 | interaction.depth n.trees Accuracy Kappa Accuracy SD Kappa SD
15 | 1 50 0.843 0.426 0.0243 0.1026
16 | 1 100 0.847 0.467 0.0258 0.0988
17 | 1 150 0.847 0.471 0.0274 0.1019
18 | 2 50 0.848 0.469 0.0248 0.0976
19 | 2 100 0.844 0.469 0.0263 0.0966
20 | 2 150 0.843 0.468 0.0262 0.0960
21 | 3 50 0.847 0.474 0.0256 0.0979
22 | 3 100 0.844 0.473 0.0280 0.0997
23 | 3 150 0.841 0.466 0.0280 0.0966
24 |
25 | Tuning parameter 'shrinkage' was held constant at a value of 0.1
26 | Accuracy was used to select the optimal model using the largest value.
27 | The final values used for the model were n.trees = 50, interaction.depth = 2 and shrinkage = 0.1.
28 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/model_selection.R:
--------------------------------------------------------------------------------
1 | #################################################################################
2 | ###THE SET UP
3 | #################################################################################
4 | library( glmnet )
5 | library( ggplot2 )
6 | library( caret )
7 | library( kernlab )
8 | library( klaR )
9 | library(doMC)
10 |
11 |
12 | ###HERE I TAKE ADVANTAGE OF MULTITHREADING
13 | ###Using multithreading with my Dual Core 2.8 GHz Intel Core i7 processor,
14 | ###the code below takes ~6 minutes to run
15 | nc <- detectCores()
16 | registerDoMC(cores = nc)
17 |
18 |
19 | rm(list=ls(all=TRUE))
20 | setwd("~/Documents/ML/")#SET YOUR WORKING DIRECTORY HERE
21 | data <- read.csv("train_potus_by_county.csv", header = TRUE )
22 |
23 |
24 | #################################################################################
25 | ###INITIAL DATA DIVE
26 | #################################################################################
27 | #HISTOGRAM OF RESPONSE VARIABLE TO CHECK FOR CLASS IMBALANCE
28 | q <- ggplot( data , aes(x=Winner))
29 | q + geom_histogram() ##note a class imbalance!
30 | ##CHECK FOR FEATURES WITH NEAR-ZERO VARIANCE (MAY THROW OFF MODELS)
31 | nzv <- nearZeroVar( data , saveMetrics=TRUE )
32 | #View(nzv) ##no variables with near-zero variance
33 | #VISUALIZE ALL VARIABLES AND THEIR RELATIONSHIPS
34 | #ggpairs( data ) #THIS FUNCTION IS COMPUTATIONALLY INTENSIVE AND NOT ESSENTIAL FOR WHAT FOLLOWS
35 |
36 | #################################################################################
37 | ###FEATURE SELECTION: I USE LASSO REGRESSION TO SELECT THE MOST IMPORTANT
38 | ###FEATURES IN DETERMINING THE WINNER
39 | ###(YOU COULD ALSO USE A NONLINEAR ALGORITHM, SUCH AS A RANDOM FOREST
40 | ###TO SELECT FEATURES: AN ADVANTAGE OF LASSO REGRESSION IS THAT IT
41 | ###SELECTS FEATURES AND TELLS YOU WHETHER THEY ARE +VELY OR -VELY
42 | ###CORRELATED WITH THE TARGET VARIABLE)
43 | #################################################################################
44 | ###SETUP INPUTS TO MODEL
45 | n <- length( data )
46 | x <- as.matrix(data[,-n])
47 | y <- as.matrix(data$Winner)
48 | ###RUN THE MODEL
49 | cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class",
50 | nfolds = 20 , nlambda = 1000 , alpha = 1)
51 | ##VARIABLES WITH NONZERO COEFFICIENTS ARE THE IMPORTANT VARIABLES
52 | coef(cvfit$glmnet.fit,s=cvfit$lambda.1se)
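## (Added sketch, not in the original analysis: instead of hard-coding the
## surviving feature names in `keep` below, you could pull the nonzero
## coefficients straight from the fitted lasso. Untested against this exact data.)
# co <- as.matrix(coef(cvfit$glmnet.fit, s = cvfit$lambda.1se))
# nz <- rownames(co)[co[, 1] != 0]
# keep_auto <- c(setdiff(nz, "(Intercept)"), "Winner")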
53 |
54 | ###KEEP IMPORTANT FEATURES AND RESPONSE VARIABLE
55 | keep <- c("Median.age","X..BachelorsDeg.or.higher","Unemployment.rate",
56 | "Total.households","X..Owner.occupied.housing","X..Renter.occupied.housing",
57 | "Median.home.value","Population.growth", "Per.capita.income.growth",
58 | "Winner")
59 | data <- data[,keep] ##KEEP ONLY THE MOST IMPORTANT FEATURES & RESPONSE VARIABLE
60 |
61 | #################################################################################
62 | ###IN WHICH I BUILD A NUMBER OF MODELS TO PREDICT THE RESPONSE VARIABLE
63 | ###I TRY LOGISTIC REGRESSION, SVMs, NEURAL NETWORKS, RANDOM FORESTS,
64 | ###GENERALIZED BOOSTED MODELS AND NAIVE BAYES.
65 | ###NOTE: PREPROCESSING OCCURS WITHIN EACH TRAINING METHOD.
66 | #################################################################################
67 |
68 |
69 | ###DETAILS OF MODEL TRAINING (REPEATED 10-FOLD CROSS VALIDATION)
70 | fitControl <- trainControl(## 10-fold CV
71 | method = "repeatedcv",
72 | number = 10,
73 | #classProbs = TRUE,
74 | ## repeated ten times
75 | repeats = 10)
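## (Added sketch: notes.txt floats up-/down-sampling to address the class
## imbalance. Newer caret releases expose this directly through trainControl's
## `sampling` argument -- an assumption about your caret version, and not part
## of the original run.)
# fitControlUp <- trainControl(method = "repeatedcv", number = 10,
#                              repeats = 10, sampling = "up")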
76 |
77 | ###I DEFINE THE PREPROCESSING THAT I'LL PERFORM IN EACH MODEL FITTING
78 | preProc = c("center", "scale","pca") ##centre & scale data, pca on predictor variables
79 | tL = 5 #number of levels for each tuning parameter in training: you could do wider and
80 | #more rigorous tuning by specifying model-dependent parameter grids. Do this and
81 | #your models will perform better!!
82 |
83 |
84 | # Start the clock!
85 | ptm <- proc.time()
86 | ###LOGISTIC REGRESSION (AS A PARTICULAR "GENERALIZED LINEAR MODEL")
87 | lrfit <- train( Winner ~. , data = data , method = "glm", family = binomial,
88 |                 trControl = fitControl, preProcess = preProc,
89 |                 tuneLength = tL)
90 |
91 | ###SUPPORT VECTOR MACHINE (RADIAL BASIS KERNEL)
92 | svmfit <- train( Winner ~. , data = data , method = 'svmRadial',
93 |                 trControl = fitControl, preProcess = preProc,
94 | tuneLength = tL)
95 |
96 | ###NEURAL NETWORK
97 |
98 | nnetfit <- train( Winner ~. , data = data , method = "nnet",
99 |                 trControl = fitControl, preProcess = preProc)
100 |
101 |
102 | ###RANDOM FOREST
103 |
104 | rffit <- train( Winner ~. , data = data , method = "rf",
105 |                 trControl = fitControl, preProcess = preProc)
106 |
107 |
108 | ###GENERALIZED BOOSTED MODEL
109 |
110 | gbmfit <- train( Winner ~. , data = data , method = "gbm",
111 |                 trControl = fitControl, preProcess = preProc)
112 |
113 | ###NAIVE BAYES
114 |
115 | nbfit <- train( Winner ~. , data = data , method = "nb",
116 |                 trControl = fitControl, preProcess = preProc)
117 | # Stop the clock
118 | proc.time() - ptm
119 | #################################################################################
120 | ###COMPARE ALL MODELS
121 | #################################################################################
122 | ####
123 |
124 | resamps <- resamples(list(nnet = nnetfit , gbm = gbmfit , lr = lrfit,
125 | svm = svmfit , rf = rffit , nb = nbfit))
126 | summary( resamps )
127 | ###GBM HAS THE HIGHEST MEAN ACCURACY
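## (Added sketch: caret's lattice helpers give a quick visual of the comparison.)
# dotplot(resamps, metric = "Accuracy")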
128 |
129 | #################################################################################
130 | ###PRODUCE OUTPUTS
131 | #################################################################################
132 |
133 |
134 | ###SAVE BEST MODEL TO THE FILESYSTEM
135 | save(gbmfit , file = "mymodelgbm.rda")
136 |
137 | ###LOG DATA ABOUT EXPECTED PERFORMANCE OF MODEL
138 | sink(file="performance.txt")
139 | gbmfit
140 | sink(NULL)
141 |
142 |
143 |
144 |
145 | #################################################################################
146 | ###BELOW I INCLUDE SOME PREPROCESSING CODE THAT
147 | ###REMOVES HIGHLY CORRELATED FEATURES AND LOOKS FOR COLLINEARITY.
148 | ###THIS PREPROCESSING DID NOT IMPROVE MODEL PERFORMANCE,
149 | ###SO I DID NOT INCLUDE IT IN THE CODE ABOVE.
150 | #################################################################################
151 |
152 | # ###remove correlated variables
153 | # dummies <- dummyVars( ~ ., data )
154 | # df <- predict(dummies, newdata = data)
155 | # da <- data.frame(df)
156 | # descrCor <- cor( da )
157 | # #summary(descrCor[upper.tri(descrCor)])
158 | # highlyCorDescr <- findCorrelation(descrCor, cutoff = .75)
159 | # filteredDescr <- da[,-highlyCorDescr]
160 | # #descrCor2 <- cor(filteredDescr)
161 | # #summary(descrCor2[upper.tri(descrCor2)])
162 | # filteredDescr$Winner.Barack.Obama <- NULL
163 | # filteredDescr$Winner <- data$Winner
164 | # data <- filteredDescr
165 | # ##find linear combos
166 | # comboInfo <- findLinearCombos(data) #none
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/notes.txt:
--------------------------------------------------------------------------------
1 | Notes
2 |
3 | I chose to use R to tackle this assignment. In particular, I made use of the package ‘caret’. See all dependencies at the bottom of these notes.
4 |
5 |
6 | Approach
7 |
8 | 1. Before attempting to build any models, I dove into the data: the most important aspects that I noticed immediately were (i) that all predictor variables were numerical AND (ii) that there was a class imbalance in the response variable (approx. 1/4 for “Barack Obama”). I also noticed that the response variable was binary.
9 |
10 | (i) above indicated that preprocessing all features via scaling and centering would be appropriate. (ii) above made me aware that I should use modelling techniques that are good at dealing with class imbalances, for example ensemble methods such as random forests and boosting (one could also implement up-/down-sampling).
11 |
12 | In order to compare models, I needed first to decide on a metric of comparison: I chose ‘Accuracy’, because in predicting voting behaviour, we want to be as accurate as possible. (I am aware that ‘Accuracy’ may be problematic due to the class imbalance problem, but a 25%/75% split isn’t too bad.) I could also have used other metrics, such as area under the ROC curve or specificity.
13 |
14 | 3. I wanted to select the most important features so that the others did not introduce unwanted noise into the modelling process. I did so using Lasso regression. Note: I also attempted to engineer new features (linear combinations, products, ratios of existing features) but this did not contribute to overall performance; I also attempted to remove correlated features but this did not contribute to overall performance.
15 |
16 | 4. I wanted to implement as many models as possible and use the best one on the test data (with ‘Accuracy’ as metric, as discussed above): for each model, I preprocessed the data (scaling, centering and principal component signal extraction) and used repeated 10-fold cross validation to retrieve the ‘Accuracy’ of each model (with error bars) for different meta-parameters.
17 |
18 | 5. The models I chose were logistic regression, support vector machines, neural networks, random forests, stochastic gradient boosting and naive Bayes.
19 |
20 | 6. Stochastic gradient boosting won with an accuracy of 84.8% (pretty good out of the box!): as reported in performance.txt, “The final values used for the model were n.trees = 50, interaction.depth = 2 and shrinkage = 0.1.” Note that many of the models I tried had pretty similar accuracies.
21 |
22 | 7. If I had more time, I would have definitely played around with (i) up-, down- and mixed sampling of the training data, (ii) using bagging techniques other than random forests, (iii) SMOTE to address the class imbalance problem in another manner and (iv) really working on feature engineering as this is the real key!
23 |
24 |
25 |
26 | Dependencies
27 |
28 | R libraries:
29 | library( glmnet )
30 | library( ggplot2 )
31 | library( caret )
32 | library( kernlab )
33 | library( klaR )
34 | library(doMC)
35 |
36 |
37 |
38 | HUGO BOWNE-ANDERSON
39 | 07-14-2015
40 |
41 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/predict.R:
--------------------------------------------------------------------------------
1 | #################################################################################
2 | ###THE SET UP
3 | #################################################################################
4 |
5 | library( caret )
6 | rm(list=ls(all=TRUE))
7 | setwd("~/Documents/ML/")#SET YOUR WORKING DIRECTORY HERE
8 | data <- read.csv("test_potus_by_county.csv", header = TRUE )
9 | load( "mymodelgbm.rda")
10 | #################################################################################
11 | ###RUN MODEL AND WRITE PREDICTIONS TO .CSV
12 | #################################################################################
13 | predictions <- predict(gbmfit , data )
14 |
15 | write.table(predictions , "predictions.csv" , row.names = FALSE , col.names = FALSE)
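## (Added sketch: a quick sanity check on the class balance of the predicted
## winners before sending off the .csv.)
# table(predictions)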
16 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/python_code/python_POTUS_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
PIPELINING WITH POTUS DATA AND MACHINE LEARNING
"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "IMPORT SOME LIBRARIES AND READ IN DATA
"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/html": [
27 | "\n",
28 | "
\n",
29 | " \n",
30 | " \n",
31 | " | \n",
32 | " Total population | \n",
33 | " Median age | \n",
34 | " % BachelorsDeg or higher | \n",
35 | " Unemployment rate | \n",
36 | " Per capita income | \n",
37 | " Total households | \n",
38 | " Average household size | \n",
39 | " % Owner occupied housing | \n",
40 | " % Renter occupied housing | \n",
41 | " % Vacant housing | \n",
42 | " Median home value | \n",
43 | " Population growth | \n",
44 | " House hold growth | \n",
45 | " Per capita income growth | \n",
46 | " Winner | \n",
47 | " Win_bin | \n",
48 | "
\n",
49 | " \n",
50 | " \n",
51 | " \n",
52 | " 0 | \n",
53 | " 9278 | \n",
54 | " 37.9 | \n",
55 | " 12.6 | \n",
56 | " 21.3 | \n",
57 | " 13992 | \n",
58 | " 3802 | \n",
59 | " 2.42 | \n",
60 | " 51.9 | \n",
61 | " 16.6 | \n",
62 | " 31.6 | \n",
63 | " 63959 | \n",
64 | " -0.69 | \n",
65 | " -0.49 | \n",
66 | " 0.71 | \n",
67 | " Barack Obama | \n",
68 | " True | \n",
69 | "
\n",
70 | " \n",
71 | " 1 | \n",
72 | " 18594 | \n",
73 | " 36.3 | \n",
74 | " 9.7 | \n",
75 | " 14.3 | \n",
76 | " 14622 | \n",
77 | " 6764 | \n",
78 | " 2.55 | \n",
79 | " 63.7 | \n",
80 | " 16.2 | \n",
81 | " 20.1 | \n",
82 | " 74330 | \n",
83 | " -0.13 | \n",
84 | " 0.03 | \n",
85 | " 0.85 | \n",
86 | " Barack Obama | \n",
87 | " True | \n",
88 | "
\n",
89 | " \n",
90 | " 2 | \n",
91 | " 662628 | \n",
92 | " 37.9 | \n",
93 | " 27.9 | \n",
94 | " 12.1 | \n",
95 | " 23909 | \n",
96 | " 267862 | \n",
97 | " 2.41 | \n",
98 | " 57.0 | \n",
99 | " 28.8 | \n",
100 | " 14.2 | \n",
101 | " 112687 | \n",
102 | " -0.09 | \n",
103 | " 0.00 | \n",
104 | " 0.55 | \n",
105 | " Barack Obama | \n",
106 | " True | \n",
107 | "
\n",
108 | " \n",
109 | " 3 | \n",
110 | " 21292 | \n",
111 | " 38.9 | \n",
112 | " 14.1 | \n",
113 | " 15.7 | \n",
114 | " 16829 | \n",
115 | " 8547 | \n",
116 | " 2.47 | \n",
117 | " 63.5 | \n",
118 | " 17.1 | \n",
119 | " 19.4 | \n",
120 | " 73643 | \n",
121 | " -0.59 | \n",
122 | " -0.43 | \n",
123 | " 0.57 | \n",
124 | " Barack Obama | \n",
125 | " True | \n",
126 | "
\n",
127 | " \n",
128 | " 4 | \n",
129 | " 13252 | \n",
130 | " 34.5 | \n",
131 | " 15.0 | \n",
132 | " 15.8 | \n",
133 | " 13012 | \n",
134 | " 5222 | \n",
135 | " 2.47 | \n",
136 | " 53.7 | \n",
137 | " 20.7 | \n",
138 | " 25.6 | \n",
139 | " 56642 | \n",
140 | " -1.16 | \n",
141 | " -1.03 | \n",
142 | " 0.69 | \n",
143 | " Barack Obama | \n",
144 | " True | \n",
145 | "
\n",
146 | " \n",
147 | "
\n",
148 | "
"
149 | ],
150 | "text/plain": [
151 | " Total population Median age % BachelorsDeg or higher Unemployment rate \\\n",
152 | "0 9278 37.9 12.6 21.3 \n",
153 | "1 18594 36.3 9.7 14.3 \n",
154 | "2 662628 37.9 27.9 12.1 \n",
155 | "3 21292 38.9 14.1 15.7 \n",
156 | "4 13252 34.5 15.0 15.8 \n",
157 | "\n",
158 | " Per capita income Total households Average household size \\\n",
159 | "0 13992 3802 2.42 \n",
160 | "1 14622 6764 2.55 \n",
161 | "2 23909 267862 2.41 \n",
162 | "3 16829 8547 2.47 \n",
163 | "4 13012 5222 2.47 \n",
164 | "\n",
165 | " % Owner occupied housing % Renter occupied housing % Vacant housing \\\n",
166 | "0 51.9 16.6 31.6 \n",
167 | "1 63.7 16.2 20.1 \n",
168 | "2 57.0 28.8 14.2 \n",
169 | "3 63.5 17.1 19.4 \n",
170 | "4 53.7 20.7 25.6 \n",
171 | "\n",
172 | " Median home value Population growth House hold growth \\\n",
173 | "0 63959 -0.69 -0.49 \n",
174 | "1 74330 -0.13 0.03 \n",
175 | "2 112687 -0.09 0.00 \n",
176 | "3 73643 -0.59 -0.43 \n",
177 | "4 56642 -1.16 -1.03 \n",
178 | "\n",
179 | " Per capita income growth Winner Win_bin \n",
180 | "0 0.71 Barack Obama True \n",
181 | "1 0.85 Barack Obama True \n",
182 | "2 0.55 Barack Obama True \n",
183 | "3 0.57 Barack Obama True \n",
184 | "4 0.69 Barack Obama True "
185 | ]
186 | },
187 | "execution_count": 1,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "import numpy as np\n",
194 | "import pandas as pd\n",
195 | "import matplotlib.pyplot as plt\n",
196 | "%matplotlib inline\n",
197 | "pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier\n",
198 | "##check out tutorial here:\n",
199 | "##http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%201%20-%20Reading%20from%20a%20CSV.ipynb\n",
200 | "df = pd.read_csv('../train.csv')\n",
201 | "df1 = df.drop('Winner', 1)\n",
202 | "df['Win_bin'] = (df['Winner'] == 'Barack Obama') ##new column: logical wrt winner\n",
203 | "df.head()"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "SPLIT DATA INTO TRAINING AND TEST SETS
"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 2,
216 | "metadata": {
217 | "collapsed": false
218 | },
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "X_train shape: (1091, 14)\n",
225 | "y_train shape: (1091,)\n",
226 | "X_test shape: (122, 14)\n",
227 | "y_test shape: (122,)\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "from sklearn.cross_validation import train_test_split\n",
233 | "X_train, X_test, y_train, y_test = train_test_split(df1, df['Winner'], test_size = 0.1, \n",
234 | " random_state=0)\n",
235 | "print(\"X_train shape: %s\" % repr(X_train.shape))\n",
236 | "print(\"y_train shape: %s\" % repr(y_train.shape))\n",
237 | "print(\"X_test shape: %s\" % repr(X_test.shape))\n",
238 | "print(\"y_test shape: %s\" % repr(y_test.shape))"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "OPENING THE PIPELINE
"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 3,
251 | "metadata": {
252 | "collapsed": false
253 | },
254 | "outputs": [],
255 | "source": [
256 | "#see here for intuition:\n",
257 | "#http://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html\n",
258 | "from sklearn import linear_model, decomposition, datasets, preprocessing\n",
259 | "from sklearn.pipeline import Pipeline\n",
260 | "from sklearn.grid_search import GridSearchCV\n",
261 | "from sklearn.svm import LinearSVC\n",
262 | "#build a scaler component to pipeline:\n",
263 | "scaler = preprocessing.StandardScaler().fit(X_train)\n",
264 | "#see here for 'scaler in pipeline' details: \n",
265 | "#http://scikit-learn.org/stable/modules/preprocessing.html\n",
266 | "#X_train_scaled = scaler.transform(X_train)\n",
267 | "#Instantiate a model:\n",
268 | "logistic = linear_model.LogisticRegression()\n",
269 | "#this is the pipe!:\n",
270 | "svm = LinearSVC() # Instantiate the model\n",
271 | "tuned_parameters = 10.**np.arange(-3,5)\n",
272 | "pipe = Pipeline(steps=[('scale', scaler), ('svm', svm)])"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 4,
278 | "metadata": {
279 | "collapsed": false
280 | },
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
287 | " intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',\n",
288 | " random_state=None, tol=0.0001, verbose=0))])\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "estimator = GridSearchCV(pipe,\n",
294 | " dict(\n",
295 | " svm__C=tuned_parameters))\n",
296 | "estimator.fit(X_train , y_train);\n",
297 | "print(estimator.best_estimator_)"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 13,
303 | "metadata": {
304 | "collapsed": false
305 | },
306 | "outputs": [
307 | {
308 | "data": {
309 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZwAAAEQCAYAAACEM8KaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt8E2W+P/BPkjZp06SF3lPK3YK0tUBBFxRWvEApUkSW\nFkHFlQV2V9yfe1bOYRftli5UFM9x9awedRe14LqsWw6uFKzItWcrdbkUaLHlspRbIemNXpKmSZrM\n/P4oRAppk5TMLfm+Xy9fMMlk5jNDzbfP88w8I2NZlgUhhBDCMbnQAQghhAQGKjiEEEJ4QQWHEEII\nL6jgEEII4QUVHEIIIbyggkMIIYQXQVxuvLKyElu3bgUA5OTkIDU1tdd1S0tLsWvXLigUCixYsMC5\nbnNzM9555x04HA6MHDkSzz77LJeRCSGEcISzgsMwDIqKipCbmwsAKCgoQEpKCmQymcv1i4uLsWHD\nBlgsFhQUFKCgoAAA8Mknn+DJJ5/E6NGjuYpKCCGEB5wVHIPBAJ1OB6VSCQCIi4tzvuZKYmIiqqur\n0draiqSkJADdRau+vp6KDSGE+AHOCo7JZIJarUZhYSEAQK1Ww2g09lpw0tLSsHPnTtjtdmRkZAAA\n2tvbYbPZsGHDBnR2diIzMxP33XcfV5EJIYRwiLOCo9FoYDabsXTpUrAsi40bNyI8PNzluvX19aio\nqMCqVasAAHl5eUhLS4NGo4FarcbKlSvBMAxyc3Mxbtw4Z6uJEEKIdHBWcOLj46HX653LBoMB8fHx\nLtdlGAYOhwMAwLIsbDZbd7igIERFRaG1tRWRkZEICuo77t69e32UnhBCAssjjzzC+T44KzhyuRzz\n58/H2rVrAQDZ2dnO98rLy6FSqZCeng4A0Ol0SEpKwvr168EwDDIyMpytmKeffhoffPABzGYzJk+e\n7LZ1c2ObhBBCPFNRUcHLfmT+NFv03r17JV1wysrKMGXKFKFj9IuUswOUX2iUX1gVFRW8tHDoxk9C\nCCG8oBYOIYQEOGrhEEII8SucTm1DvCPlfmApZwf4z28ymdDW1tbrzBveamtrQ0REhE+2JQTKzy2W\nZREREQGNRiNoDio4hPCsqakJMpkMCQkJPis4CQkJPtmOUCg/t1iWxbVr12CxWBAdHS1YDupSExEp\ntxCknB3gN7/NZkNUVJTPig0h7shkMkRFRTnvcRQKFRxCCCG8oIIjImVlZUJH6DcpZwekn58QKaCC\nQwjxidbWVmRmZiItLQ3vvPNOr+u9/vrrOHv2bJ/bGjx4sFf7fu+999DZ2XlH+yTco/twCOHZ1atX\nRT/IfCdef/11hIWF4YUXXuj3NoYMGYJLly55vP64ceOwb98+REZG9nufgaC3nz26D4cQwjuj0YjU\n1FTn4LLdbkdaWhra2tqc769YsQLz5s3Dfffdh3Xr1nm87Q8//BCzZs3CsGHDcPz48R7vnThxAtOm\nTUNWVhYKCgpw8+/Bfe3TYrEgMzMTDQ0NePLJJzFr1izU1dV5tM+LFy8iJycHmZmZyMjI6NGt+tpr\nr+E3v/kNli1bhocffhjLli3z+DiPHz+Oxx9/HLNnz8YzzzyD5uZm53uXLl3C/fffj1dffRWPPvoo\nHn/88R6fHTt2LDZv3ozp06dj6tSpPYru//zP/+DRRx/FjBkz8Ktf/QoWi8X53l/+8hesWLECzz33\nHGbMmIFXXnnF47x8ooIjIlIeR5BydkD6+X1Fq9XiwQcfxNdffw0A2LdvHyZNmuS8x0Sr1WLdunXY\ntm0bSktLsWXLFhgMBo+2/ZOf/ARffvkl0tLSbrtC7+c//znWrl2L4uJiPPLII7BarT0y9bbPkJAQ\nlJSUIDY2Fp999hm+/PJLJCYmerTPn/70p1i2bBlKSkqwceNGrFixAq2trc73q6ur8eabb2Lv3r04\ncuQILly44PYYbTYbfvGLX+CDDz7Ajh07MH/+fKxZs6bHOufPn0dycjL27NmDL774osd7MpkMZ86c\nwe7du/GPf/wDQ4YMAQDs378fO3bsQElJCb7++muoVCq8+eabPT574MAB/PrXv8bXX3/t1S8CfKL7\ncAgRoRkbj/lkO18vHe/1Z5566im8//77mD17Nj777DMsXry4x/sKhQK7du3CpUuXoFQq0dDQ0Ouj\nRzzR2toKo9GIqVOnAgAmTZqEkJAQTvdpNBpRV1eH6dOnA+geM/rBD36AQ4cOYcaMGZDJZJgxYwa0\nWq3z/RutvL6cPXsWV65cwfLlywF0P3pFpVL1WGfEiBGYO3dur9t46aWXbntt3759WLhwIYKDgwEA\nS5cuxfLly7F69WoA3YUqKytL9E9HpoIjIlK+l0XK2QHx5e9PofCVyZMnY+XKlbh06RK+++47PPjg\ng873vvvuO/zsZz/DkiVLcM899yAqKgp3Ogwsl/fd0cLFPgHctg2GYXq0gvqzD4VCgSFDhmD79u13\nnO9WDMP0+PutLTYpDMdTlxohpAeZTIb58+dj6dKl+NGPftTjvdLSUsyYMQPPPfccwsPDcenSpTv+\nogsPD0dsbCy+/fZbAMCuXbtgNpu92qdKpUJDQwMAz754tVothg4dipKSEgDAhQsXcOjQoTt+hH1S\nUhKsVit27NjhfM0XheDRRx/Fli1bnF2Nf/rTn5ytM1/tgw9UcEREyuMIUs4OSD+/ry1YsACVlZVY\nuHBhj9fnzZvnLADvvvsuJk+e7Pyiv9mHH36Ixx57DF1dXR7t7+2338bq1asxc+ZMHDlyBGq12qt9\nLlmyBE899RSeeOIJfPLJJx7t8/3338fHH3+MmTNnYvny5Xjvvfd6zIfWn5kgFAoFPv30U2zevBkZ\nGRnIzMzEn/70px7r9LXd3t578MEH8fjjj+Oxxx7D9OnTYbfb8ctf/rLH56QwcwVdFi0iUp4AU8rZ\nAX7z+/tl0US86LJo4iTlL2wpZwekn58QKaCCQwghhBdUcEREyuMIUs4OSD8/IVJABYcQQggvqOCI\niJTHEaScHeA//833VBDCBzH8zFHBIYRn0dHRuHLliii+AEhgYBgGV65cEfRpnwDNNCAqUr60WMrZ\nAX7zK5VKxMXFeTwHmSfa2tp63EMiNZSfe3FxcVAqlYJm4LTgVFZWYuvWrQCAnJwcpKam9rpuaWkp\ndu3aBYVCgQULFvRYt6urCy+++CLmzJmDmTNnchmZ+AGWZWGxM+iwOdBhc8B0/c8OmwMma/ey+frr\nN95rvhaCsv0XMDA0CAPVwd1/hn7/Z0RIEBRy391Yp1QqfXovTm1tLcaMGeOz7fGN8gcGzgoOwzAo\nKipCbm4uAKCgoAApKSm93g1bXFyMDRs2wGKxoKCgAAUFBc73du/ejREjRkjiTto7IeUWgi+zOxgW\n5q6bCoPVgY6u63+6KiLX1zHftE6QXIYwlQIaZRDClHKEKRXQKBXOP9VKBWI1SmhU3a/JZbFo7bTj\nWmcXWjvtqG3uREunHS2dXWjpt
MNktUOrCkKkOggDQoMRGXrLn+rvC1R4SBDkPP+sSvlnB6D8gYKz\ngmMwGKDT6ZxNuBtdCDqdzuX6iYmJqK6uRmtrK5KSkpyvW61WVFZWYtKkST2e/0DEy+Zg0HFTkXDV\nori5xXFrEbHYGYQG3ygQcoQpg3r8PUwpR7Q6GEMHhiJMKYdG2bOwhCkVCFb4dnjSwbBotdjRYu7q\nUYiazF0429zZvWzufr3D5kBEiOuW0s0tqMjQYGhUCt6LEyFC4azgmEwmqNVqFBYWAgDUajWMRmOv\nBSctLQ07d+6E3W5HRkaG8/WSkhLMnDmzx3Mq/JUYxkFYlkVnF9N3i+KWItJhc6CxzQRGoUSHzQGG\nhfOL32XRUAUhMSL4erEIur5u9981KgVCg+W8fwm7O/cKuQxR6mBEqYPdbqvLwaDNYse1TjtaO7tw\n7XohqjfZcKqxw1mYWi12dHYx3cXpekFy1YIaGBqMgeru89dbK18MPzt3gvIHBs4KjkajgdlsxtKl\nS8GyLDZu3Ijw8HCX69bX16OiogKrVq0CAOTl5SEtLQ12ux2nTp3C3LlzceDAAY/2e/M//I2b+aSy\nXFVVdcfbY1hg3L2T0GFzoOzQUVgcwPDRyeiwOVB16iwsDhmi4gehw+bARX0DLIwMwaEamGwOtHZY\nYWUAVdD1bia7FSoFC13UQGhUCrQ3NyBEzuLukcOgC1eh7vxZDJID901Kw+mTxxGisCBEzmLa1Acg\nk8lc57UDUyZ9v+wAkCaS8+/L5egwJU4dOwQAmHnz+0pgysPfL9sZIDn9PrR0duGbo5UwtcsQqh2G\nq+1WlH53ESa7DIxSjWvmLljtDoQpWMQP0GBgaBBs7c0IC2KRNmoEFLZezjct07KHy3zgbPJOhmGQ\nl5eH3NxcsCyLdevWYe3atS7X1ev12Lx5M1atWgWWZbF69Wrk5+fj5MmT2LlzJ7RaLRobG+FwOPDC\nCy/0eKLfzaQ+eSfLsrA5WI8Hunu0Pq53YVntDMKUCqiDFdCovh+v0PRocdz0d9Xtrwf5cHCc+I7V\nzvQYZ7p2vVvvUksn9EYb/nvOKL8f5yTc4GvyTs5aOHK5HPPnz3cWmezsbOd75eXlUKlUzuKg0+mQ\nlJSE9evXg2EYZGRkQKlUIj093bnOgQMHYLVaey02UuFgWOiNVlxutaKuzYK6Nisut1lwtd0Ko8UB\nAM6B7FsLwY3l6LDgHgXl5veF6I4i/FAFyRGnVSJO2/PSVoZlseLvp/HNhTZMGT5AoHSEuEePJ+BI\nm8WOulYLLrd1F5bLbVZcbrWg3mRDtDoYiREhSBygwuCIECRGqDAoQoXvjh7CtB9Ksx9Y6n3YUs//\n8Vfl+MY0AB/Mu9unl2/zRernX+r5Jd/CCQRdDgb6dhsutVm6WyutVmeLhWWBxAgVEgeEYHCECo/e\nFYnBA1RI0KqgDHJ9BVUvLxPi1l1hDlTZFdj7r2uYMSpK6DiEuEQtHDdYlkVLp93ZSqlrvdENZkVj\nhw2xYUokRqgweEBIjz8HhARRfzrh1XcGE147cBEfZo+B0seXhRP/Ri0cntnsDK60d7dO6lq/7war\na7NCIQMSI0Iw+HoX2D06DRIjQqDTKn1+vwch/ZUSr8GwgSHYWdOEJ1JjhY5DyG0CquCwLIsmcxfq\nWq8XlhvjK61WtHR2IV6r6m6lRKgwNkGLx8ZEY3BECMJD+DlNUu4HlnJ2wH/y/3iiDqu/OoeMUVFQ\nKxVCx/KYv5x/0je/LDidXQ5cud7t5bwSrNWCK+1WhATJuwfsrxeW9EFaDI5QIV6rkuRgKyE3Gxml\nxrgELbZ914inx8cLHYeQHvxuDOc/TyvRbrEjIVzV3Q0WoepxNZhG5Zc1lhCnK21WvLj9ND7KTuat\ndU6kjcZw+um/ZichNkxJrRUSsAZFqPDD4QPx1xP1WP6DQULHIcTJ70a8dRLuGrsx1YQUSTk74H/5\nnxofj11nmtHUYRMokXf87fwT1/yu4BBCgKiwYGSOjsKfj/nuIW+E3Cm/G8MRy0wDhAit3WLHkqJq\nvD1nFAZFhAgdh4gYX2M41MIhxE+FhwRhXmosCo/qhY5CCAAqOKIi5X5gKWcH/Df/E6kxqNKb8K8m\nM8+JvOOv55/0RAWHED8WGqzAk+Pi8fERauUQ4dEYDiF+zuZg8JOiGvzHtKG4J14jdBwiQjSGQwjx\nCaVCjsUT4vHR4avwo98viQRRwRERKfcDSzk74P/5Hx4ZCZPNgUOX23lK5B1/P/+kGxUcQgKAQi7D\njyfo8PERPRhq5RCB0BgOIQGCZVm8uP0MnkiNwUMjI4WOQ0SExnAIIT4lk8nw3L0J2HRUDzvjN79n\nEgmhgiMiUu4HlnJ2IHDyj0/QIk6jwlenmzlO5J1AOf+BjgoOIQFmyb06fHrMAIudEToKCTA0hkNI\nAPrdnvO4O0aNnLFxQkchIkBjOIQQzvx4gg5FVQ0wWe1CRyEBhAqOiEi5H1jK2YHAyz9kYAgmDQlH\nUVUDR4m8E2jnP1Bx/sTPyspKbN26FQCQk5OD1NTUXtctLS3Frl27oFAosGDBAue6f/zjH6HX68Ew\nDJ5//nnExVE3ACF36pl0HX7++SnMTY7BQHWw0HFIAOB0DIdhGOTl5SE3NxcAUFBQgDVr1kAmc/1E\nzpUrV2LDhg2wWCwoKChAQUFBj/dPnjyJ8vJyLFu2zOXnaQyHEO+8V14HhmWx4v7BQkchAvKLMRyD\nwQCdTgelUgmlUom4uDgYDL0/gTAxMRHV1dWoqKhAUlLSbe+HhIQgKIjzRhkhAePJcXHYd64FeqNV\n6CgkAHBacEwmE9RqNQoLC1FYWAi1Wg2j0djr+mlpadi5cydKS0tddr3t378fM2bM4DKyoKTcDyzl\n7EDg5h8YGow5yTH4pELYR1EH6vkPNJwWHI1GA7PZjEWLFmHhwoXo6OhAeHi4y3Xr6+tRUVGBVatW\n4eWXX0ZxcTFsNpvz/SNHjiAhIQGDBg3qc583/8OXlZVJarmqqkpUeWg5MJbn3xOLI5fbsW3vN6LI\nQ8vCLPOBtzEclmWxbt06rF271uW6er0emzdvxqpVq8CyLFavXo38/HwolUrU1tairKwMixcv7nN/\nNIZDSP9srazHyfoOrJk+QugoRAB8jeFwOiAil8sxf/58Z5HJzs52vldeXg6VSuUsEDqdDklJSVi/\nfj0YhkFGRgaUSiUA4M0330RUVBTy8/MxePBgLFmyhMvYhAScrOQYbPuuETUNHRgTGyZ0HOKnaKYB\nESkrK8OUKVOEjtEvUs4OUH4A+PJUEw7UtmDDrNsv2OEanX9h+cVVaoQQ6cgYFYVGUxcqrojzIW1E\n+qiFQwhxOnCuBVurGvCHx0f1er8c8T/UwiGE8O6HIwbAwbIou9AmdBTih6jgiAjflyj6kpSzA5T/\nBrlMhiUTE1B45CocPD6kjc5/YKCCQwjpYWKiFgNCg7HnX9eEjkL8DI3hEEJu8129Cev3X8
BH2clQ\nKuj3Un9HYziEEMGkxGkwfGAodtY0CR2F+BEqOCIi5X5gKWcHKL8rz01MwF9P1MNsc/h827ei8x8Y\nqOAQQlwaERWKcQlabDspjoe0EemjMRxCSK+utlvx/744jQ+zkxERQo8G8Vc0hkMIEVxCuAo/HDEQ\nn52oFzoK8QNUcEREyv3AUs4OUP6+PDU+HrvONKOxw+Z+5X6i8x8YqOAQQvoUpQ7GrNFR+LPAD2kj\n0kdjOIQQt4xWO5YU1eD3WUlIjAgROg7xMRrDIYSIhlYVhHmpMdh0VC90FCJhVHBERMr9wFLODlB+\nT8xNiUGV3oSzTWafb5vOf2CggkMI8UhosAILx8Xj4yNXhY5CJIrGcAghHutyMPjJ1hqs/OFQpOk0\nQschPiKaMZyXX34ZpaWl6Orq4jwMIUTcghVyLE7X4aPDV+FHv6sSnrgtOEuXLsW5c+fw0ksvobCw\nEHV1dXzkCkhS7geWcnaA8nvjoZED0dHlwD8v++5R1HT+A4PbuSqGDx+O4cOHw26348iRI1i/fj2i\no6ORlZWFiRMn8pGRECIiCrkMz03UofDIVdw3OBxyehQ18ZBHYzjXrl1DaWkpysrKMGTIEEydOhXH\njx8HACxZsoTzkJ6iMRxC+MGyLH5ZfAaPJ8fg4bsihY5D7hBfYzhuWzivvvoqGhsb8fDDD2PNmjXQ\narUAgPT0dOTm5nIekBAiPjKZDM9NTMBbZZcwdfgABNND2ogH3P6UzJkzB7///e+RlZXlLDY3zJw5\nk7NggUjK/cBSzg5Q/v4Yl6BFvFaFr0433/G26PwHBrctnNTU1F7fe+CBB/r8bGVlJbZu3QoAyMnJ\n6XNbpaWl2LVrFxQKBRYsWOBc15ttEEL4tWRiAvJ212L6qCiEBFErh/TN7U/IhQsXbnutpqbG7YYZ\nhkFRURFeeeUVvPLKKygqKurzMsri4mKsW7cOv/nNb7Bly5Z+bUPqpkyZInSEfpNydoDy99eoGDWS\n48LwxXeNd7QdOv+BwW3B2bhx422v3SgIfTEYDNDpdFAqlVAqlYiLi4PB0Ptss4mJiaiurkZFRQWS\nkpL6tQ1CCP+enaDD1qoGmKx2oaMQkXNbcOTy21fxpJVhMpmgVqtRWFiIwsJCqNVqGI3GXtdPS0vD\nzp07UVpa6uw283YbUiflfmApZwco/50YMiAEk4aEo6iy/4+ipvMfGNwWHIVCgaamJueyXq93WYRu\npdFoYDabsWjRIixcuBAdHR0IDw93uW59fT0qKiqwatUqvPzyyyguLobNZvNqGzfc/A9fVlYmqeWq\nqipR5aFlWvZ0+Zl0Hf5eZcBXB74RRR5a9n6ZD27vw6mursb777+PSZMmgWEYHDx4ECtWrEBKSkqf\nG2YYBnl5ecjNzQXLsli3bh3Wrl3rcl29Xo/Nmzdj1apVYFkWq1evRn5+PoKCgjzeBkD34RAipPe+\nrQPDsFhx/2ChoxAv8XUfjkc3fjY0NODYsWOQyWQYN24cYmNjPdr4iRMnnFeYZWdnIy0tDQBQXl4O\nlUrVozhs27YNp0+fBsMweOCBBzBt2rQ+t+EKFRxChNPa2YWfbK3BO3NHQ6dVCR2HeEFUBUcqpF5w\nysrKJHu1i5SzA5TfVzYf1cNgtOI/pg3z6nNiyd9fUs8vmpkGAKClpQWtra3OiwVaW1sl/cVOCOHG\nj+6JxXN/q8b5a50YHhkqdBwiMm5bOFu2bMGBAwcQHByM8PBwNDQ0YMyYMXjppZf4yugxqbdwCPEH\nW6saUGUwIX/6CKGjEDccDIvXDlxA5oAWcbRwvv32W/zhD3/Avn37MGzYMKjVauzYsYPzYIQQaZoz\nJhrbTjagpqEDY2LDhI5D+vDtpTY0mrqAAfzsz+31zTExMVAqlYiJicGlS5cwZMgQXLlyhY9sAYfv\nSxR9ScrZAcrvS8ogOZ4ZH+/VQ9rElL8/pJq/uKYJWcnRvO3PbcGJjIyEyWTCmDFjsHv3bvzlL3/x\n6+llCCF3bsaoKDSbu1BxxX9v1Ja6ujYLaps7MXU4T80beDCG09nZidDQ7sG/ixcvoqqqCg888AAG\nDhzIS0Bv0BgOIeJRWtuCv1XW453HR0NGD2kTnfe/rUOwQo6f3JvA21Vqbls4N4oNAAwdOhSzZ88W\nZbEhhIjL1OEDwLLAPy60Ch2F3MJiZ7Dn7DU8dncUr/ul+cRFRKr9wIC0swOUnwtymQxL7k1A4RE9\nHEzf3fBizO8NqeU/cK4FY2LDEM/zDbpuC05+fj4fOQghfmjCIC0iQ4Ox++w1oaOQ61iWxfbqRl4v\nFrjBbcGxWq185CCQ9jM1pJwdoPxckV1v5fz5mB42O9PremLN7ykp5T/daIbJ5sDExL4nQuaC24Iz\nduxYlJeX85GFEOKHkuPCMCIyFDtONblfmXBue00TZo+JhlyACzncFpxDhw7hnXfewUsvveT8b+XK\nlXxkCzhS6we+mZSzA5Sfa89NTMBnJ+phtjlcvi/2/O5IJX+bxY7yi23IGMXvxQI3uJ1pYNWqVXzk\nIIT4seGRoRifoMW2kw14Ol0ndJyAtetMM+4fGoGIEI+m0fQ5mi2aEMKLq+1W/OKL0/goO1mwL7xA\nxrAsfvy3aqx+aBjuvmXKIdHch0MIIb6QEK7CtBED8dfjBqGjBKQjde3QqhQYHaMWLIPbXzNee+21\n216TyWTU1cYBKT9TQ8rZAcrPl0Xj47H8f2vwRGosYjVK5+tSyd8bKeQvrm5C1pgYQWd9cFtwsrKy\neiyfPn0aFouFs0CEEP8VpQ7GrLuj8ekxA/5t6hCh4wQMg9GKmoYOvPzIcEFz9GsM56OPPsKSJUu4\nyHNHaAyHEPEzWu1YUlSD32clITEiROg4AeHDw1fR5WDws0mJLt8X7RiOxWJBXV0dF1kIIQFAqwrC\nvNQYbDqiFzpKQLA5GOw63YzZY/ifWeBWbgvOM888g8WLFzv/e/755zF27Fg+sgUcqVzL74qUswOU\nn29zU2JQVW/C2SYzAOnlv5WY8//jfCtGRIWKojXpdgznk08+4SMHISSAhAYrsGhcPD4+chWvzrxL\n6Dh+rbi6CdlpsULHAECXRYuK2K9y6YuUswOUXwiZo6NQ12ZFpd4oyfw3E2v+c81mNHTYMGlIhNBR\nAHhQcM6fP3/bazU1NZyEIYQEjmCFHIvTdfjosJ6eIsyR4pomPHZ3NBRycTwAz23B+fDDD297bcuW\nLZyECXRi7gd2R8rZAcovlIdGDoS5y4GPvvpW6Ch3RIznv8PmwP/VtiJztDDzprnidgxHLr+9Jnn6\n20hlZSW2bt0KAMjJyUFqaqrL9cxmM9544w3ncm1tLTZt2gQAKC0txa5du6BQKLBgwYJet0EIkR6F\nXIZnJ+jwwT868BzLCjKDsb/affYaJiRqEakOFjqKk9uCo1Ao0NTUhOjo7kvq9Hq9yyJ0K4ZhUFRU\nhNzcXABAQUEBUlJSXN7lqlarkZeXBwC4ePEiS
kpKnO8VFxdjw4YNsFgsKCgoQEFBgWdHJkFi7Qf2\nhJSzA5RfSPcPjcCnxzT45kIbpg4fIHScfhHb+WdZFsXVjXhxirhurnVbcLKzs/G73/0OkyZNAsMw\nOHjwIFasWOF2wwaDATqdDkpl9/QVcXFxztf6UlJSgszMTOdyYmIiqqur0draiqSkJLf7JYRIi0zW\n3crZePgq7h8aIZrxBik7oTdBLpfhnvgw9yvzyG1TJTk5Ga+88gqioqIQGxuLNWvWICUlxe2GTSYT\n1Go1CgsLUVhYCLVaDaPR2OdnjEYjmpubMXToUOdraWlp2LlzJ0pLS/2+O02M/cCeknJ2gPILzXax\nCqFBcvzf+Raho/SL2M7/9uomZI2JFnTeNFc8uiw6NjYWGRkZmDFjBmJjPbueW6PRwGw2Y9GiRVi4\ncCE6OjoQHt73I0337NnTY3qF+vp6VFRUYNWqVXj55ZdRXFwMm83W5zZu/ocvKyuT1HJVVZWo8tAy\nLfO1LJMBE0Ka8cdvzsPBsILnkfJyU4cNJ/RGhDWe9urzfHA7l5rdbkdQUM+et87OToSGhva5YYZh\nkJeXh9zcXLAsi3Xr1mHt2rW9ru9wOLBmzRrk5+c7x4j0ej02b96MVatWgWVZrF69Gvn5+c5uulvR\nXGqESBcZsJN9AAAWPUlEQVTLsnhp51nMHBWFGQI9kdIfbD6qR5vFjl88MNjjz4hmLrVbiwTLsi4f\nWXDbhuVyzJ8/H2vXrsW6deuQnZ3tfK+8vBwVFRU91j98+DAmTJjQ44IEnU6HpKQkrF+/Hq+++ioy\nMjJ6LTaEEGmTyWT48QQdPj1mgJ2h+3L6w86wKBHJvGmuuL1o4NYGkEwm8/iy6LFjx7qcd23y5Mm3\nvTZp0iSX25g3b55H+/IHZWXif6ZGb6ScHaD8QruRP02nRbxWid1nmpF5tzi/NF0Ry/k/eLEVCeEq\nDI/suwdKKG5bOA6Ho8e4SWdnJ7q6ujgNRQgJXM9OSMCnxw2wORiho0hO8fWLBcTKbQvnwQcfxOuv\nv445c+bA4XDgiy++wLRp03iIFnjE8BtSf0k5O0D5hXZz/uS4MAwdEIqvTjdjTnKMgKk8J4bzf6nF\ngsutFjwwTBzzprnituDMmDEDWq0We/fuhUwmQ0ZGBu6//34+shFCAtSzE3RYs7sWGaOioAqiOYY9\nUVzThJmjoxCsEO/58ijZ5MmT8atf/QrLly+H1WrF+vXruc4VkPi+RNGXpJwdoPxCuzX/qBg1kmLU\n2HmqSaBE3hH6/Hd2ObDv3DXMEvm4l9sWjtlsxpEjR3Dw4EE0NjZi7NixyMrK4iMbISSAPZuuw+qv\n/oXM0VEIDVYIHUfU9p1rwT3xGsRqxH0Vb6/34ZSVleHgwYPQ6/W49957cezYsR4TbIoR3YdDiH9Z\nt/c8RkWrkTM2TugoosWyLH7++Sksu28QJiT2fXN9bwS/D+cPf/gDlEolfvvb32LRokV0/wshhHfP\npMejqKoBZptD6CiiVd3QAaudxfhBWqGjuNVrwXn77bcxZMgQvP7661i7di3a29thMpn4zBZwhO4H\nvhNSzg5QfqH1ln/owFCkD9Li8+8aeU7kHSHPf3F1E2aPiZbEox16HcOJj4/HvHnzMG/ePFy9ehUH\nDx5Efn4+QkNDkZ6ejrlz5/KZkxASoJ5Jj8cvt5/B48nR0KjcDjsHlJbOLhy63I4V9ycKHcUjbudS\nu1VdXR0OHjyInJwcrjL1G43hEOKf/rP0ImI0Sjw7oe/HmwSav54w4EqbFS/9cKj7lfsg+BhObxIT\nE0VZbAgh/uup9Hhsr25Eu8UudBTRcDAsdtY0I0siN8cC/Sg4hDtS7oeXcnaA8gvNXX6dVoWpwweg\nqLKep0TeEeL8H7rcjoGhQRgVreZ93/1FBYcQIgmLxsXjy9PNaOmkuRwBoLimEVnJ4r7R81Zej+GI\nGY3hEOLf3j14GUFyGX46SRqD5Fy52m7Fi9vP4NMnU6D0wdQ/oh3DIYQQoTw5Lh5fn72G5o7AbuXs\nqGnCjKRInxQbPkkrrZ+Tcj+8lLMDlF9onuaPUgdjRlIk/nrCwHEi7/B5/q12BrvPXsNjIn4MQW+o\n4BBCJCVnbBz2nWtBg8nmfmU/VFrbglHRaiSEq4SO4jUawyGESM6Hh6/CaLXjl1OGCB2Fd7/44jSe\nGh+PSUN899wbGsMhhJBeZN8Ti7LzrdC3W4WOwqszjWa0dtpxbz8n6RQaFRwRkXI/vJSzA5RfaN7m\nDw8JwpzkGHx6TBxjOXyd/+KaRjw2JgoKufjnTXOFCg4hRJLmpcbgn5fbUddmEToKL4xWO7650IaM\nUVFCR+k3GsMhhEjWp8cMuNRqwW8eGiZ0FM79b1UDzjaZ8euHhvl82zSGQwghbjyREoOKK0ZcaOkU\nOgqnGJbFjpomyc0scCsqOCIi5X54KWcHKL/Q+ptfrVQg+55YfFIh7FgO1+f/2BUjVEFyJMeGcbof\nrnH6cInKykps3boVAJCTk4PU1FSX65nN5h6Pr66trcWmTZsAAM3NzXjnnXfgcDgwcuRIPPvss1xG\nJoRITFZyNLb9rRrnms0YGSWdiSy9UXy9dSOTwEPW+sLZGA7DMMjLy0Nubi4AoKCgAGvWrHF7wi5e\nvIiSkhL87Gc/AwC89dZbyMzMxOjRo93uk8ZwCAlM20424MRVE/JnjBA6is81mGz4+een8OcnUxAa\nrOBkH5IfwzEYDNDpdFAqlVAqlYiLi4PB4L7ZW1JSgszMTADdRau+vt6jYkMICVyz747G2SYzzjSa\nhY7ic1+easLDIyM5KzZ84qzgmEwmqNVqFBYWorCwEGq1Gkajsc/PGI1GNDc3Y+jQ7qfXtbe3w2az\nYcOGDcjPz8ehQ4e4iisKUu6Hl3J2gPIL7U7zK4PkeHJcHDYd1fsokXe4Ov9dDgZfnW6W/MUCN3BW\ncDQaDcxmMxYtWoSFCxeio6MD4eF93x27Z8+eHs06jUYDtVqNlStX4uWXX8bnn38Om63v+ZNu/ocv\nKyuT1HJVVZWo8tAyLUtpObzpNM4YWlFd3yGKPL5Y/nDXPzFkYAiGDAjhfH984GUMh2VZrFu3DmvX\nru11fYfDgTVr1iA/Px9y+fd18K233sLixYsRGRmJ3Nxc5ObmQqlUutwGjeEQEthKTjXhQG0LXp+V\nJHQUn/jVjjN4IiUWU4cP4HQ/fI3hcHaVmlwux/z5851FJjs72/leeXk5VCpVj+Jw+PBhTJgwoUex\nAYCnn34aH3zwAcxmMyZPntxrsSGEkOmjovBZZT0q9Uak6bRCx7kj5691Qt9uw+ShvpukU2g004CI\nlJWVYcqUKULH6BcpZwcov9B8mX/32WaUnG7Gfz2WxNtlxFyc///+5jIGhgbhmXSdT7friuSvUiOE\nECE8PDISrZ12VFzp+yIlMeuwOVBa24JZo/3jYoEbqIVDCPE7+8+14POTDXh7zihJ3iy5vboRJ/Qm\n5D4y
nJf9UQuHEEL66cERA9BpZ3DocrvQUbzGsmz3zAISfIS0O1RwRITvSxR9ScrZAcovNF/nl8tk\nWJyuw6ajevDRiePL/FWGDjAMi7E6jc+2KRZUcAghfumBYRFgAXxzsU3oKF4prmlEVnKMJLsC3aGC\nIyJSvspIytkByi80LvLfaOVsPqoHw3Erx1f5r5m7cLTOiOlJkT7ZnthQwSGE+K1JQ8KhCpLj/2pb\nhY7ikZLTzfjhiAEIU0p/3jRXqOCIiJT74aWcHaD8QuMqv0wmw7MTdPikQg8Hw10rxxf5HQyLnaf8\n82KBG6jgEEL82oRBWoSHBGH/uRaho/Tp20ttiA1T+u0zfQC6D4cQEgCOXzXirbLL+HD+GCjk4hyM\n/3XJvzA9KRKP3MX/+A3dh0MIIT4yLkGLmLBg7D57TegoLtW1WVDb3Mn5JJ1Co4IjIlLuh5dydoDy\nC42P/D+eoMOnxwzocjA+3/ad5t9R04SZo6OgVPj3V7J/Hx0hhFyXEq/B4AEq7DojrlaOxc5gz9lr\neOxu/71Y4AYqOCIi5XsppJwdoPxC4yv/4nQd/nLcAJvdt62cO8m//1wLkuPCEKf1/0evUMEhhASM\nu2PDcFdUKHaeahI6CoDr86ZVNyJrTIzQUXhBBUdEpNwPL+XsAOUXGp/5F6fr8FllPSw+bOX0N/+p\nRjM6bA5MSJT2w+I8RQWHEBJQ7opWIzk2DMXVjUJHQXFNE2aPiYbcD+dNc4UKjohIuR9eytkByi80\nvvM/k65DUWUDzDaHT7bXn/xtFju+vdiGjFFRPskgBVRwCCEBZ3hkKMYlaPCFgK2cXWeaMXloBMJD\nggTLwDcqOCIi5X54KWcHKL/QhMj/TLoO2042osMHrRxv8zMsix1++pC1vlDBIYQEpMEDQnDv4HD8\nb1UD7/s+UtcOrUqB0TH+O2+aK1RwRETK/fBSzg5QfqEJlf/p8fHYXt2Idov9jrbjbf7i6iZkjfHP\nh6z1hQoOISRgJYSr8MCwAby2cgxGK2oaOjBt5EDe9ikWVHBERMr98FLODlB+oQmZf9G4eOw41YTW\nzq5+b8Ob/DtPNePRpEiEBAXe1y+nl0dUVlZi69atAICcnBykpqa6XM9sNuONN95wLtfW1mLTpk3O\n5a6uLrz44ouYM2cOZs6cyWVkQkiAidMqMW3EQPytsgHLfzCI033ZHAx2nW7Gm1lJnO5HrDgrOAzD\noKioCLm5uQCAgoICpKSkuOyzVKvVyMvLAwBcvHgRJSUlPd7fvXs3RowY4ff9nVLuh5dydoDyC03o\n/AvHxeGn205h/j2xiFQHe/15T/P/43wrRkSFIjEixOt9+APO2nQGgwE6nQ5KpRJKpRJxcXEwGAxu\nP1dSUoLMzEznstVqRWVlJSZOnAg/elYcIUREosOUeDQpEn89Uc/pfoqrmzAnObAuhb4ZZwXHZDJB\nrVajsLAQhYWFUKvVMBqNfX7GaDSiubkZQ4cOdb5WUlISMN1oUu6Hl3J2gPILTQz5n0yLw95/XUNj\nh83rz3qS/1yzGY0dNvxgcER/4vkFzgqORqOB2WzGokWLsHDhQnR0dCA8PLzPz+zZs6fHY07NZjNO\nnTqFcePGebzfm//hy8rKJLVcVVUlqjy0TMuBtPxdxT+RGtaJLcfqOdn+n/afRGpoh/MR10If763L\nfJCxHPVTMQyDvLw85ObmgmVZrFu3DmvXru11fYfDgTVr1iA/Px9yeXcdrKiowM6dO6HVatHY2AiH\nw4EXXngBiYmJLrexd+9epKenc3E4hJAA0GaxY0lRNd6dOxrxWpXPtmuy2rH4s2psnD+mX2NEXKuo\nqOjxyz5XOLtoQC6XY/78+c4ik52d7XyvvLwcKpWqR3E4fPgwJkyY4Cw2AJCenu5c58CBA7Barb0W\nG0IIuVMRIUGYPSYanx4z4KUfDnX/AQ/tPnsNExO1oiw2fOL0suixY8di7Nixt70+efLk216bNGlS\nn9uaNm2ar2KJVllZmeBX6/SXlLMDlF9oYso//55YPPe3alxps2JQhGetnL7ysyyL4pom/NvUIb6M\nKUmBd+cRIYT0QasKwtyUGPz5mN4n2zuuNyFILkNqXJhPtidlVHBERCy/4fWHlLMDlF9oYsv/RGos\njtQZcanF4tH6feXvnjct2u/vI/QEFRxCCLlFmFKBH90Tg08q7qyV09Rhwwm9EY/cFemjZNJGBUdE\n+L5E0ZeknB2g/EITY/7Hk2NQaTDh/LVOt+v2lv/LU82YNmIg1EqFr+NJEhUcQghxITRYgey0OGw+\n2r9Wjp1hUXK6GbMD7CFrfaGCIyJi68f2hpSzA5RfaGLNnzUmGqcazTjbZO5zPVf5D15sRUK4CsMj\nQ7mKJzlUcAghpBeqIDmeHNu/Vs6NiwXI96jgiIgY+7E9JeXsAOUXmpjzZ94dhdprnahp6Oh1nVvz\nX2qx4HKrBQ8MC9x501yhgkMIIX1QKuRYND4em7xo5RTXNGHm6CgEK+gr9mZ0NkRErP3YnpBydoDy\nC03s+TNGReFquxVVBpPL92/O39nlwL5z1zDrbupOuxUVHEIIcSNILsNT4+Ox6Yje7XO59p1rQVq8\nBrEaJU/ppIMKjoiIuR/bHSlnByi/0KSQ/9G7ItFs7sLxq7e3cm7kZ1kWxdWNyArgh6z1hQoOIYR4\nQCGX4en07rGc3lo51Q0dsDlYjEvQ8pxOGqjgiIjY+7H7IuXsAOUXmlTyTxsxEB02Bw7Xtfd4/Ub+\n7dVNmD0mGnKaN80lKjiEEOIhhVyGZ9Ljsfmo4bZWTktnFw5fbsf0JJo3rTdUcERECv3YvZFydoDy\nC01K+acMHwA7w6D8UpvztbKyMnx1uhlThg2AVsXpY8YkjQoOIYR4QS6TYfEEHTYf1YO53sphWGDn\nqSa6WMANKjgiIpV+bFeknB2g/EKTWv7JQyIQJJej7EIrACB4yD2IDA1GUrRa4GTiRgWHEEK8JJPJ\nsHhCPD45aoCDYVFcQ5dCe4IKjohIqR/7VlLODlB+oUkx/72J4QhTKrDlRD2q9e14cPhAoSOJHhUc\nQgjpB5lMhmevj+WMjeiCMoi+Tt2Rse7maZCQvXv3Ij09XegYhJAAwbIsNh66isdTYiQ9lU1FRQUe\neeQRzvdD1+8RQkg/yWQyLPvBIKFjSAbnBaeyshJbt24FAOTk5CA1NdXlemazGW+88YZzuba2Fps2\nbQIA/PGPf4RerwfDMHj++ecRFxfHdWxBlJWVSe5qnRuknB2g/EKj/IGB04LDMAyKioqQm5sLACgo\nKEBKSgpkLqZ9UKvVyMvLAwBcvHgRJSUlzveWL18OADh58iS2b9+OZcuWcRmbEEIIBzgd5TIYDNDp\ndFAqlVAqlYiLi4PBYHD7uZKSEmRmZt72ekhICIKC/LcXUMq/IUk5O0D5hUb5AwOn394mkwlqtRqF\nhYUAulsxRqMROp2u188YjUY0Nzdj6NCht723f/9+zJo1i6u4h
BBCOMRpC0ej0cBsNmPRokVYuHAh\nOjo6EB4e3udn9uzZ4/JqiSNHjiAhIQGDBvnvAJ0U70W4QcrZAcovNMofGDht4cTHx0Ov//454AaD\nAfHx8b2u73A4UFFRgfz8/B6v19bWorq6GosXL3a7z4qKiv4HFpharZZsfilnByi/0Ch/YOD8PpwT\nJ044r1LLzs5GWloaAKC8vBwqlarHfTPffvstDAYD5s6d22MbL7zwAqKioiCXyzF48GAsWbKEy8iE\nEEI44Fc3fhJCCBEvmouBEEIIL6jgEEII4QUVHEIIIbxQrFmzZo3QIVyprKzEu+++i/379yMmJgax\nsbFer9vb6zU1NXjzzTeh1+sxduxY0R8Hn3n74s2xiCVzf/J4c5xccpXZ22x8H4s3mcVyLL7IzPex\ncJmZ02NhRcjhcLCvvPIKa7VaWavVyv72t79lGYbxeN2+XmdZlj1x4gT7z3/+k928ebPoj4PPvH3x\n5lhYVhyZb+ZpHm+Pk0u3ZvY2mxDH4klmsR3LnWQW6li4yMzHsYiyS82bKXFcravX63t9HQDS0tKg\n0WgkcRx85u2Lt9MUiSHzzTzN09/pmLhwa2ZvswlxLJ5k7u3/T6GO5U4yC3UsXGTm41hEOTGZN1Pi\n9Lbujb97M62Or/niOPjM2xex5/MVMR+nt9nEcCy++v+Tz2PxNrMYjsVXmbk+FlG2cLyZEqe3dfsz\nrY4Yj0MsxJ7PV8R8nN5mE8Ox+Or/Tz6PxdvMYjgWX2Xm+lhE2cLxZkqc3tZlGKbPbbA83O/qi+O4\ngY+8ffF2miJA+My38iRPf46TSzdn9jabUMfiSWZ3/3/eiutjuZPMQh0LF5m5PhbRzjTgzZQ4va3b\n2+t///vfcfz4cbS2tiI5Odn5vB2xHgefefvizbGIJbO7PN78O4ghc1/ZxHAs3mQWy7H4IjPfx8Jl\nZi6PRbQFhxBCiH8R5RgOIYQQ/0MFhxBCCC+o4BBCCOEFFRxCCCG8oIJDCCGEF1RwCCGE8EKUN34S\n4i/a29uxceNG1NfXIyQkBGFhYfj3f/93yGQyoaMRwjsqOIRw6KOPPsL48ePx0EMPAQDMZjMVGxKw\nqEuNEI50dHTg7NmzzmIDdE9uSEigooJDCEcaGhoEe3gbIWJEBYcQQggvqOAQwpHY2FjU19eLbtZs\nQoRCBYcQjoSFhWH06NH4+uuvna/V19cLmIgQYdFs0YRwyGQyYePGjdDr9VAqldBqtXjhhRfo4gES\nkKjgEEII4QV1qRFCCOEFFRxCCCG8oIJDCCGEF1RwCCGE8IIKDiGEEF5QwSGEEMILKjiEEEJ4QQWH\nEEIIL/4/AqL60pT/poMAAAAASUVORK5CYII=\n",
310 | "text/plain": [
311 | ""
312 | ]
313 | },
314 | "metadata": {},
315 | "output_type": "display_data"
316 | }
317 | ],
318 | "source": [
319 | "plt.plot([c.mean_validation_score for c in estimator.grid_scores_], label=\"validation error\")\n",
320 | "plt.xticks(np.arange(len(tuned_parameters)), tuned_parameters); plt.xlabel(\"C\"); plt.ylabel(\"Accuracy\");plt.legend(loc='best');"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 10,
326 | "metadata": {
327 | "collapsed": false
328 | },
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "mean: 0.83410, std: 0.00447, params: {'svm__C': 0.001}\n",
335 | "mean: 0.83868, std: 0.00546, params: {'svm__C': 0.01}\n",
336 | "mean: 0.84051, std: 0.00431, params: {'svm__C': 0.10000000000000001}\n",
337 | "mean: 0.84235, std: 0.00448, params: {'svm__C': 1.0}\n",
338 | "mean: 0.83960, std: 0.00714, params: {'svm__C': 10.0}\n",
339 | "mean: 0.78277, std: 0.04901, params: {'svm__C': 100.0}\n",
340 | "mean: 0.72411, std: 0.04093, params: {'svm__C': 1000.0}\n",
341 | "mean: 0.79652, std: 0.02038, params: {'svm__C': 10000.0}\n"
342 | ]
343 | }
344 | ],
345 | "source": []
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "MAKE PREDICTIONS ON TEST SET
"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 45,
357 | "metadata": {
358 | "collapsed": false
359 | },
360 | "outputs": [
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "accuracy of best SVM = 0.901639344262\n"
366 | ]
367 | }
368 | ],
369 | "source": [
370 | "predictions = estimator.best_estimator_.predict(X_test)\n",
371 | "#linear SVM that performed the best above\n",
372 | "#print(predictions)\n",
373 | "#how accurate was this?\n",
374 | "#do this tomorrow\n",
375 | "acc = float(sum(np.equal(predictions , y_test)))/len(predictions)\n",
376 | "print('accuracy of best SVM = %s' % acc)"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {
383 | "collapsed": true
384 | },
385 | "outputs": [],
386 | "source": []
387 | }
388 | ],
389 | "metadata": {
390 | "kernelspec": {
391 | "display_name": "Python 2",
392 | "language": "python",
393 | "name": "python2"
394 | },
395 | "language_info": {
396 | "codemirror_mode": {
397 | "name": "ipython",
398 | "version": 2
399 | },
400 | "file_extension": ".py",
401 | "mimetype": "text/x-python",
402 | "name": "python",
403 | "nbconvert_exporter": "python",
404 | "pygments_lexer": "ipython2",
405 | "version": "2.7.10"
406 | }
407 | },
408 | "nbformat": 4,
409 | "nbformat_minor": 0
410 | }
411 |
--------------------------------------------------------------------------------
/digit_recoginition/digit_recog_classifier_test_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pylab as pl
3 |
4 | from sklearn import svm, metrics, preprocessing
5 |
6 | import csv
7 |
8 | import time
9 | start_time = time.time()
10 |
11 | from numpy import genfromtxt
12 | my_data = genfromtxt('train.csv', delimiter=',')
13 |
14 | print time.time() - start_time, "seconds" #took ~41 seconds
15 |
16 |
17 |
18 | start_time = time.time()
19 |
20 | images_train = my_data[1:,1:]
21 | images_train = preprocessing.scale(images_train)
22 | targets_train = my_data[1:,0]
23 |
24 | classifier = svm.SVC(kernel = 'poly', C = 100, gamma = 0.001, degree = 3)
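# (Comment added: these polynomial-kernel settings fall inside the grid
# explored in digit_recog_grid_search.py.)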
25 |
26 | # We learn the digits
27 | classifier.fit(images_train, targets_train)
28 |
29 | print time.time() - start_time, "seconds"
30 |
31 |
32 |
33 | my_test_data = genfromtxt('test.csv', delimiter=',')
34 | test = my_test_data[1:,]
35 | test = preprocessing.scale(test)
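# NOTE (added sketch, not in the original script): scaling train and test
# independently, as above, lets test-set statistics shape the transform.
# Fitting one scaler on the training images and reusing it would avoid that:
# scaler = preprocessing.StandardScaler().fit(my_data[1:, 1:])
# images_train = scaler.transform(my_data[1:, 1:])
# test = scaler.transform(my_test_data[1:, ])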
36 | predicted = classifier.predict(test)
37 |
38 | length = len(predicted)
39 |
40 |
41 | with open('pred_test.csv', 'wb') as csvfile:
42 | csv_writer = csv.writer(csvfile)
43 | csv_writer.writerow(['ImageId','Label'])
44 | for y in range(length):
45 | csv_writer.writerow([y+1,int(predicted[y])])
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/digit_recoginition/digit_recog_grid_search.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | ================================================================================
4 | Digit recognition: Support vector machine parameter estimation using grid search
5 | ================================================================================
6 |
7 | Here I implemented a cross-validation workflow. I used scikit-learn's
8 | `sklearn.grid_search.GridSearchCV` to train each classifier on half the
9 | labeled data and used the other half as the cross-validation set to test
10 | the performance of the classifier.
11 |
12 | The classifiers I tested were all support vector machines (SVMs): Gaussian,
13 | linear, and polynomial (degrees 2, 3, and 4) over a range of parameters.
14 |
15 | I tested these classifiers for precision, that is, the positive predictive
16 | value: the proportion of positive predictions that are correct.
17 | """
18 |
19 | from __future__ import print_function
20 |
21 | from sklearn import datasets
22 | from sklearn.cross_validation import train_test_split
23 | from sklearn.grid_search import GridSearchCV
24 | from sklearn.metrics import classification_report
25 | from sklearn.svm import SVC
26 | from sklearn import svm, metrics, preprocessing
27 | import csv
28 | import time
29 |
30 | print(__doc__)
31 |
32 | # Loading the Digits dataset
33 |
34 |
35 | ###
36 | from numpy import genfromtxt
37 |
38 |
39 | my_data = genfromtxt('train.csv', delimiter=',')
40 |
41 |
42 | x_train = my_data[1:,1:]
43 | x_train = preprocessing.scale(x_train)
44 | t_train = my_data[1:,0]
45 |
46 |
47 |
48 | start_time = time.time()
49 | # Split the dataset in two equal parts
50 | x_train, x_cv, t_train, t_cv = train_test_split(
51 | x_train, t_train, test_size=0.5, random_state=0)
52 |
53 | # Set the parameters by cross-validation
54 | tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
55 | 'C': [1, 10]},
56 |                     {'kernel': ['linear'], 'C': [1, 10]},
                    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [2]},
57 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [3]},
58 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [4]}]
59 |
60 |
61 | scores = ['precision'] # you can alter this by adding, for example, 'recall'
62 |
63 | for score in scores:
64 | print("# Tuning hyper-parameters for %s" % score)
65 | print()
66 |
67 |     clf = GridSearchCV(SVC(C=1), tuned_parameters, scoring=score)
68 | clf.fit(x_train, t_train)
69 |
70 | print("Best parameters set found on development set:")
71 | print()
72 | print(clf.best_estimator_)
73 | print()
74 | print("Grid scores on development set:")
75 | print()
76 |     for params, mean_score, cv_scores in clf.grid_scores_:
77 |         print("%0.3f (+/-%0.03f) for %s"
78 |               % (mean_score, cv_scores.std() / 2, params))
79 | print()
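    # (Added sketch, following the 50/50 split above: classification_report is
    # imported at the top but never used; evaluating the tuned model on the
    # held-out half would look like this.)
    # t_pred = clf.predict(x_cv)
    # print(classification_report(t_cv, t_pred))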
80 |
81 |
82 | print(time.time()- start_time)
83 |
84 |
94 |
--------------------------------------------------------------------------------
/homesite/Boris_gradient_boost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Homesite"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Using Theano backend.\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import pandas as pd\n",
27 | "import numpy as np\n",
28 | "import copy\n",
29 | "import csv\n",
30 | "from sklearn import linear_model\n",
31 | "import xgboost as xgb\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from keras.models import Sequential\n",
34 | "from keras.layers.core import Dense, Dropout, Activation\n",
35 | "from keras.optimizers import SGD\n",
36 | "from sklearn import svm\n",
37 | "from sklearn.decomposition import PCA\n",
38 | "from sklearn.preprocessing import PolynomialFeatures\n"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [],
48 | "source": [
49 | "def ReplaceWithDummies(column, *DataFrames):\n",
50 | " #The purpose of this function is to replace a column of type 'object' with n distict values,\n",
51 | " #common to all DataFrames passed in\n",
52 | " #For example train and test data sets, with n-1 boolean columns as delete the original culmnn\n",
53 | " for df in DataFrames: #Make sure the column is actually in all data frames\n",
54 | " if column not in df.columns:\n",
55 | " print('column not found')\n",
56 | " return None\n",
57 | " size=[]\n",
58 | " for df in DataFrames:\n",
59 | " size.append(df.shape[0])\n",
60 | " \n",
61 | " long_column=[]\n",
62 | " for i in range(len(DataFrames)):\n",
63 | " long_column.append(DataFrames[i][column])\n",
64 | " long_column = pd.concat(long_column)\n",
65 | " dummies = pd.get_dummies(long_column)\n",
66 | " dummies.drop(list(dummies.columns)[0], axis=1, inplace=True) # dropping one column from dummies\n",
67 | " \n",
68 | " Dummies =[] # As list of dummies to append to the list of DataFrames in order \n",
69 | " for s in size:\n",
70 | " Dummies.append(dummies[:s])\n",
71 | " dummies=dummies[s:]\n",
72 | " \n",
73 | " #drop the column that needs replacing\n",
74 | " for df in DataFrames:\n",
75 | " df.drop(column, axis=1, inplace=True)\n",
76 | "\n",
77 | " \n",
78 | " #Now append the dummy variables:\n",
79 | "\n",
80 | " for i,df in enumerate(DataFrames):\n",
81 | " for column_type in Dummies[i]: \n",
82 | " new_name=str(column) +'_'+ str(column_type)\n",
83 | " df[new_name]=Dummies[i][column_type]\n",
84 | " return DataFrames\n",
85 | " \n",
86 | " "
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 23,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "tr=pd.read_csv('train.csv')\n",
98 | "te=pd.read_csv('test.csv')"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 24,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "#Run for local testing\n",
110 | "n= len(tr)\n",
111 | "n = int(n*(float(2)/float(3)))\n",
112 | "train = copy.deepcopy(tr[:n])\n",
113 | "m=int((len(tr)-n)/2)\n",
114 | "validation = copy.deepcopy(tr[n:n+m])\n",
115 | "test = copy.deepcopy(tr[n+m:])\n",
116 | "\n"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 36,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": [
127 | "#Use this for the real thing\n",
128 | "train = tr[:]\n",
129 | "validation = te[:]\n",
130 | "test=te[:]"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 37,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [
140 | {
141 | "name": "stderr",
142 | "output_type": "stream",
143 | "text": [
144 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n",
145 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
146 | "\n",
147 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
148 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n",
149 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
150 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
151 | "\n",
152 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
153 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: \n",
154 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
155 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
156 | "\n",
157 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
158 | "/Users/blerner/anaconda/lib/python3.4/site-packages/pandas/core/generic.py:2862: SettingWithCopyWarning: \n",
159 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
160 | "\n",
161 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
162 | " self._update_inplace(new_data)\n",
163 | "/Users/blerner/anaconda/lib/python3.4/site-packages/pandas/core/generic.py:3117: SettingWithCopyWarning: \n",
164 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
165 | "\n",
166 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
167 | " self._update_inplace(new_data)\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "#Converts date to an int. Seems to work better than previous attemopts of using categorical variables.\n",
173 | "\n",
174 | "ALL = [train, validation, test]\n",
175 | "for frame in ALL:\n",
176 | " frame.drop('QuoteNumber', axis=1, inplace=True)\n",
177 | " frame['Original_Quote_Date']= pd.to_datetime(frame['Original_Quote_Date'])\n",
178 | " frame['Original_Quote_Date'] = frame['Original_Quote_Date'].astype(int)\n",
179 | " for c in frame:\n",
180 | " frame[c].fillna(0, inplace=True)\n",
181 | " frame[c].replace(-1, 0, inplace=True)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 43,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "173836"
195 | ]
196 | },
197 | "execution_count": 43,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "len(validation)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 40,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stderr",
215 | "output_type": "stream",
216 | "text": [
217 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n",
218 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
219 | "\n",
220 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "#dropping useless columns\n",
226 | "for c in train.drop('QuoteConversion_Flag', axis=1):\n",
227 | " x=train[c].unique()\n",
228 | " if len(x) < 2:\n",
229 | " for frame in ALL:\n",
230 | " frame.drop(c, axis=1, inplace=True)\n",
231 | "#rescaling\n",
232 | "for c in train.drop('QuoteConversion_Flag', axis=1):\n",
233 | " if train[c].dtype != 'object':\n",
234 | " mean=train[c].mean()\n",
235 | " std = train[c].std()\n",
236 | " if std > 0.0001:\n",
237 | " for frame in ALL:\n",
238 | " frame = (frame[c]-mean)/std"
239 | ]
240 | },
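{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Equivalent sketch with sklearn (an alternative to the rescaling loop above, not to be\n",
"#run in addition to it): StandardScaler fits the mean/std on train and applies that same\n",
"#transform to every frame.\n",
"from sklearn.preprocessing import StandardScaler\n",
"num_cols = [c for c in train if c != 'QuoteConversion_Flag' and train[c].dtype != 'object']\n",
"scaler = StandardScaler().fit(train[num_cols])\n",
"for frame in ALL:\n",
"    frame[num_cols] = scaler.transform(frame[num_cols])"
]
},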
241 | {
242 | "cell_type": "code",
243 | "execution_count": 42,
244 | "metadata": {
245 | "collapsed": false
246 | },
247 | "outputs": [
248 | {
249 | "name": "stderr",
250 | "output_type": "stream",
251 | "text": [
252 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:27: SettingWithCopyWarning: \n",
253 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
254 | "\n",
255 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
256 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:35: SettingWithCopyWarning: \n",
257 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
258 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
259 | "\n",
260 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
261 | ]
262 | }
263 | ],
264 | "source": [
265 | "#Replacing all categorical variables with dummy variables\n",
266 | "for column in train:\n",
267 | " if train[column].dtype == 'object':\n",
268 | " [train, validation, test] = ReplaceWithDummies(column, train, validation, test)\n",
269 | " "
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": [
280 | "#Run this for partial\n",
281 | "\n",
282 | "X_train=train.drop('QuoteConversion_Flag', axis=1)\n",
283 | "X_test=test.drop('QuoteConversion_Flag', axis=1)\n",
284 | "X_validation=test.drop('QuoteConversion_Flag', axis=1)\n",
285 | "y_train=train['QuoteConversion_Flag']\n",
286 | "y_test=test['QuoteConversion_Flag']\n",
287 | "y_validation=validation['QuoteConversion_Flag']"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 44,
293 | "metadata": {
294 | "collapsed": true
295 | },
296 | "outputs": [],
297 | "source": [
298 | "#Run this for full\n",
299 | "X_train=train.drop('QuoteConversion_Flag', axis=1)\n",
300 | "y_train=train['QuoteConversion_Flag']\n",
301 | "X_validation= validation"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 49,
307 | "metadata": {
308 | "collapsed": false
309 | },
310 | "outputs": [
311 | {
312 | "data": {
313 | "text/plain": [
314 | "173836"
315 | ]
316 | },
317 | "execution_count": 49,
318 | "metadata": {},
319 | "output_type": "execute_result"
320 | }
321 | ],
322 | "source": [
323 | "len(X_validation)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {
330 | "collapsed": true
331 | },
332 | "outputs": [],
333 | "source": [
334 | "#Support Vector machine\n",
335 | "model_svc = svm.SVC()\n",
336 | "model_svc.fit(X_train, y_train)\n",
337 | "print(sum(y_train))\n",
338 | "print(1-sum(abs(np.array(y_train)-np.array(model_svc.predict(X_train))))/float(len(y_train)))"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 48,
344 | "metadata": {
345 | "collapsed": false
346 | },
347 | "outputs": [
348 | {
349 | "data": {
350 | "text/plain": [
351 | "(260753, 601)"
352 | ]
353 | },
354 | "execution_count": 48,
355 | "metadata": {},
356 | "output_type": "execute_result"
357 | }
358 | ],
359 | "source": [
360 | "X_train.shape\n"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 19,
366 | "metadata": {
367 | "collapsed": false
368 | },
369 | "outputs": [],
370 | "source": [
371 | "#Poly\n",
372 | "pca = PCA(n_components=550) #Instantiate the model & set parameters\n",
373 | "pca.fit(X_train); #Fit the model\n",
374 | "X_train_red = pca.transform(X_train)\n",
375 | "X_validation_red = pca.transform(X_validation)"
376 | ]
377 | },
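{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Optional sanity check: how much of the variance do the retained components explain?\n",
"print(pca.explained_variance_ratio_.sum())"
]
},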
378 | {
379 | "cell_type": "code",
380 | "execution_count": 11,
381 | "metadata": {
382 | "collapsed": true
383 | },
384 | "outputs": [],
385 | "source": [
386 | "#introduce interaction terms\n",
387 | "poly = PolynomialFeatures()\n",
388 | "poly.fit(X_train_red)\n",
389 | "X_train_poly=poly.transform(X_train_red)\n",
390 | "X_validation_poly = poly.transform(X_validation_red)"
391 | ]
392 | },
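{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Sanity check: degree-2 PolynomialFeatures on d inputs yields (d+1)(d+2)/2 columns\n",
"#(1 bias + d linear + d(d+1)/2 quadratic terms). The (260753, 1326) shape below\n",
"#corresponds to d=50, which suggests that output came from a run with a 50-component PCA.\n",
"d = X_train_red.shape[1]\n",
"print((d + 1) * (d + 2) // 2)"
]
},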
393 | {
394 | "cell_type": "code",
395 | "execution_count": 12,
396 | "metadata": {
397 | "collapsed": false
398 | },
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/plain": [
403 | "(260753, 1326)"
404 | ]
405 | },
406 | "execution_count": 12,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "X_train_poly.shape"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {
419 | "collapsed": false
420 | },
421 | "outputs": [],
422 | "source": [
423 | "X_train_red.shape"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "collapsed": false
431 | },
432 | "outputs": [],
433 | "source": [
434 | "#Logistioc regression\n",
435 | "model = linear_model.LogisticRegression(C=0.1)\n",
436 | "model.fit(X_train, y_train)\n",
437 | "predictions=model.predict_proba(X_validation)[:,1]\n",
438 | "#print(sum(y_train))\n",
439 | "#print(1-sum(abs(np.array(y_train)-np.array(model.predict(X_train))))/float(len(y_train)))\n",
440 | "#p1=model.predict(validation)\n",
441 | "#print(sum(y_validation))\n",
442 | "#print(1-sum(abs(np.array(y_validation)-np.array(model.predict(X_validation))))/float(len(y_validation)))\n"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 50,
448 | "metadata": {
449 | "collapsed": false
450 | },
451 | "outputs": [],
452 | "source": [
453 | "#Gradient boosting \n",
454 | "model_xgb = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n",
455 | "bst = xgb.train({'objective':'reg:logistic'},dtrain=model_xgb)\n",
456 | "predictions = bst.predict(xgb.DMatrix(X_validation))\n",
457 | "#predictions = bst.predict(xgb.DMatrix(X_train))\n",
458 | "#predictions_binary = []\n",
459 | "#for x in list(predictions):\n",
460 | "# if x>=0.5:\n",
461 | "# predictions_binary.append(1)\n",
462 | "# else:\n",
463 | "# predictions_binary.append(0)\n",
464 | "\n",
465 | "#print(sum(y_train))\n",
466 | "#print(1-sum(abs(np.array(y_train)-np.array(predictions_binary)))/float(len(y_train)))"
467 | ]
468 | },
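{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Optional local check for the partial split (assumes y_validation exists): the Homesite\n",
"#competition is scored on AUC (area under the ROC curve), so this is more informative\n",
"#than raw accuracy on these imbalanced labels.\n",
"from sklearn.metrics import roc_auc_score\n",
"print(roc_auc_score(y_validation, predictions))"
]
},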
469 | {
470 | "cell_type": "code",
471 | "execution_count": 51,
472 | "metadata": {
473 | "collapsed": false
474 | },
475 | "outputs": [
476 | {
477 | "data": {
478 | "text/plain": [
479 | "173836"
480 | ]
481 | },
482 | "execution_count": 51,
483 | "metadata": {},
484 | "output_type": "execute_result"
485 | }
486 | ],
487 | "source": [
488 | "len(predictions)"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": false
496 | },
497 | "outputs": [],
498 | "source": [
499 | "predictions = bst.predict(xgb.DMatrix(X_validation))\n",
500 | "#predictions_binary = []\n",
501 | "#for x in list(predictions):\n",
502 | "# if x>=0.5:\n",
503 | "# predictions_binary.append(1)\n",
504 | "# else:\n",
505 | "# predictions_binary.append(0)\n",
506 | "#print(sum(y_train))\n",
507 | "#print(1-sum(abs(np.array(y_validation)-np.array(predictions_binary)))/float(len(y_validation)))\n",
508 | "#p2=predictions_binary"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": null,
514 | "metadata": {
515 | "collapsed": false
516 | },
517 | "outputs": [],
518 | "source": [
519 | "len(p1)"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "collapsed": false
527 | },
528 | "outputs": [],
529 | "source": [
530 | "#Let's try random forest\n",
531 | "\n",
532 | "rfc_model = RandomForestClassifier(n_estimators = 10, n_jobs=-1)\n",
533 | "rfc_model.fit(X_train,y_train)\n",
534 | "print(sum(y_train))\n",
535 | "print(1-sum(abs(np.array(y_train)-np.array(rfc_model.predict(X_train))))/float(len(y_train)))\n",
536 | "#predict2=rfc_model.predict(validation)\n",
537 | "#print(sum(y_validation))\n",
538 | "#print(1-sum(abs(np.array(y_validation)-np.array(rfc_model.predict(X_validation))))/float(len(y_validation)))\n",
539 | "p3=rfc_model.predict(X_validation)"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": null,
545 | "metadata": {
546 | "collapsed": false
547 | },
548 | "outputs": [],
549 | "source": [
550 | "#Not run for full\n",
551 | "ensemble_train =list(np.logical_or(np.array(p1),np.array(p2), np.array(p3)))\n",
552 | "ensemble_train = int(ensemble_train==1)\n",
553 | "print(1-sum(abs(np.array(y_train)-np.array(ensemble_train)))/float(len(y_train)))\n",
554 | "\n",
555 | "ensemble_validation =list(np.logical_or(np.array(model.predict(X_validation)),np.array(rfc_model.predict(X_validation))))\n",
556 | "ensemble_validation = int(ensemble_validation==1)\n",
557 | "print(1-sum(abs(np.array(y_validation)-np.array(ensemble_validation)))/float(len(y_validation)))"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": null,
563 | "metadata": {
564 | "collapsed": false
565 | },
566 | "outputs": [],
567 | "source": [
568 | "ensemble_predict =np.logical_or(np.array(p1),np.array(p2), np.array(p3)).astype(int)\n",
569 | "len(ensemble_predict)\n"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 53,
575 | "metadata": {
576 | "collapsed": false
577 | },
578 | "outputs": [],
579 | "source": [
580 | "#creates the ouput to be submitted\n",
581 | "output =pd.DataFrame()\n",
582 | "output[\"QuoteNumber\"] = te[\"QuoteNumber\"]\n",
583 | "output[\"QuoteConversion_Flag\"] = predictions\n",
584 | "output.to_csv(\"output_boost_newdate.csv\", index=False)\n",
585 | "\n",
586 | "\n"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": null,
592 | "metadata": {
593 | "collapsed": false
594 | },
595 | "outputs": [],
596 | "source": [
597 | "#attempt at neural network: not working right now\n",
598 | "\n",
599 | "model = Sequential()\n",
600 | "\n",
601 | "# Dense(64) is a fully-connected layer with 64 hidden units.\n",
602 | "# in the first layer, you must specify the expected input data shape:\n",
603 | "# here, 20-dimensional vectors.\n",
604 | "model.add(Dense(64, input_dim=584, init='uniform'))\n",
605 | "#model.add(Activation('tanh'))\n",
606 | "#model.add(Dropout(0.5))\n",
607 | "#model.add(Dense(64, init='uniform'))\n",
608 | "#model.add(Activation('tanh'))\n",
609 | "#model.add(Dropout(0.5))\n",
610 | "#model.add(Dense(2, init='uniform'))\n",
611 | "#model.add(Activation('softmax'))\n",
612 | "\n",
613 | "#sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)\n",
614 | "model.compile(loss='mean_squared_error', optimizer='sgd')\n",
615 | "\n",
616 | "model.fit(X_train1, y_train1)"
617 | ]
618 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark/discussion"
669 | ]
670 | }
707 | ],
708 | "metadata": {
709 | "kernelspec": {
710 | "display_name": "Python 3",
711 | "language": "python",
712 | "name": "python3"
713 | },
714 | "language_info": {
715 | "codemirror_mode": {
716 | "name": "ipython",
717 | "version": 3
718 | },
719 | "file_extension": ".py",
720 | "mimetype": "text/x-python",
721 | "name": "python",
722 | "nbconvert_exporter": "python",
723 | "pygments_lexer": "ipython3",
724 | "version": "3.4.3"
725 | }
726 | },
727 | "nbformat": 4,
728 | "nbformat_minor": 0
729 | }
730 |
--------------------------------------------------------------------------------
/homesite/initial_foray_insurance_grad_boosting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# INITIAL FORAY INTO INSURANCE DATA"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "(260753, 299)\n"
22 | ]
23 | },
24 | {
25 | "data": {
26 | "text/html": [
27 | "\n",
28 | "
\n",
29 | " \n",
30 | " \n",
31 | " | \n",
32 | " QuoteNumber | \n",
33 | " Original_Quote_Date | \n",
34 | " QuoteConversion_Flag | \n",
35 | " Field6 | \n",
36 | " Field7 | \n",
37 | " Field8 | \n",
38 | " Field9 | \n",
39 | " Field10 | \n",
40 | " Field11 | \n",
41 | " Field12 | \n",
42 | " ... | \n",
43 | " GeographicField59A | \n",
44 | " GeographicField59B | \n",
45 | " GeographicField60A | \n",
46 | " GeographicField60B | \n",
47 | " GeographicField61A | \n",
48 | " GeographicField61B | \n",
49 | " GeographicField62A | \n",
50 | " GeographicField62B | \n",
51 | " GeographicField63 | \n",
52 | " GeographicField64 | \n",
53 | "
\n",
54 | " \n",
55 | " \n",
56 | " \n",
57 | " 0 | \n",
58 | " 1 | \n",
59 | " 2013-08-16 | \n",
60 | " 0 | \n",
61 | " B | \n",
62 | " 23 | \n",
63 | " 0.9403 | \n",
64 | " 0.0006 | \n",
65 | " 965 | \n",
66 | " 1.0200 | \n",
67 | " N | \n",
68 | " ... | \n",
69 | " 9 | \n",
70 | " 9 | \n",
71 | " -1 | \n",
72 | " 8 | \n",
73 | " -1 | \n",
74 | " 18 | \n",
75 | " -1 | \n",
76 | " 10 | \n",
77 | " N | \n",
78 | " CA | \n",
79 | "
\n",
80 | " \n",
81 | " 1 | \n",
82 | " 2 | \n",
83 | " 2014-04-22 | \n",
84 | " 0 | \n",
85 | " F | \n",
86 | " 7 | \n",
87 | " 1.0006 | \n",
88 | " 0.0040 | \n",
89 | " 548 | \n",
90 | " 1.2433 | \n",
91 | " N | \n",
92 | " ... | \n",
93 | " 10 | \n",
94 | " 10 | \n",
95 | " -1 | \n",
96 | " 11 | \n",
97 | " -1 | \n",
98 | " 17 | \n",
99 | " -1 | \n",
100 | " 20 | \n",
101 | " N | \n",
102 | " NJ | \n",
103 | "
\n",
104 | " \n",
105 | " 2 | \n",
106 | " 4 | \n",
107 | " 2014-08-25 | \n",
108 | " 0 | \n",
109 | " F | \n",
110 | " 7 | \n",
111 | " 1.0006 | \n",
112 | " 0.0040 | \n",
113 | " 548 | \n",
114 | " 1.2433 | \n",
115 | " N | \n",
116 | " ... | \n",
117 | " 15 | \n",
118 | " 18 | \n",
119 | " -1 | \n",
120 | " 21 | \n",
121 | " -1 | \n",
122 | " 11 | \n",
123 | " -1 | \n",
124 | " 8 | \n",
125 | " N | \n",
126 | " NJ | \n",
127 | "
\n",
128 | " \n",
129 | " 3 | \n",
130 | " 6 | \n",
131 | " 2013-04-15 | \n",
132 | " 0 | \n",
133 | " J | \n",
134 | " 10 | \n",
135 | " 0.9769 | \n",
136 | " 0.0004 | \n",
137 | " 1,165 | \n",
138 | " 1.2665 | \n",
139 | " N | \n",
140 | " ... | \n",
141 | " 6 | \n",
142 | " 5 | \n",
143 | " -1 | \n",
144 | " 10 | \n",
145 | " -1 | \n",
146 | " 9 | \n",
147 | " -1 | \n",
148 | " 21 | \n",
149 | " N | \n",
150 | " TX | \n",
151 | "
\n",
152 | " \n",
153 | " 4 | \n",
154 | " 8 | \n",
155 | " 2014-01-25 | \n",
156 | " 0 | \n",
157 | " E | \n",
158 | " 23 | \n",
159 | " 0.9472 | \n",
160 | " 0.0006 | \n",
161 | " 1,487 | \n",
162 | " 1.3045 | \n",
163 | " N | \n",
164 | " ... | \n",
165 | " 18 | \n",
166 | " 22 | \n",
167 | " -1 | \n",
168 | " 10 | \n",
169 | " -1 | \n",
170 | " 11 | \n",
171 | " -1 | \n",
172 | " 12 | \n",
173 | " N | \n",
174 | " IL | \n",
175 | "
\n",
176 | " \n",
177 | "
\n",
178 | "
5 rows × 299 columns
\n",
179 | "
"
180 | ],
181 | "text/plain": [
182 | " QuoteNumber Original_Quote_Date QuoteConversion_Flag Field6 Field7 \\\n",
183 | "0 1 2013-08-16 0 B 23 \n",
184 | "1 2 2014-04-22 0 F 7 \n",
185 | "2 4 2014-08-25 0 F 7 \n",
186 | "3 6 2013-04-15 0 J 10 \n",
187 | "4 8 2014-01-25 0 E 23 \n",
188 | "\n",
189 | " Field8 Field9 Field10 Field11 Field12 ... \\\n",
190 | "0 0.9403 0.0006 965 1.0200 N ... \n",
191 | "1 1.0006 0.0040 548 1.2433 N ... \n",
192 | "2 1.0006 0.0040 548 1.2433 N ... \n",
193 | "3 0.9769 0.0004 1,165 1.2665 N ... \n",
194 | "4 0.9472 0.0006 1,487 1.3045 N ... \n",
195 | "\n",
196 | " GeographicField59A GeographicField59B GeographicField60A \\\n",
197 | "0 9 9 -1 \n",
198 | "1 10 10 -1 \n",
199 | "2 15 18 -1 \n",
200 | "3 6 5 -1 \n",
201 | "4 18 22 -1 \n",
202 | "\n",
203 | " GeographicField60B GeographicField61A GeographicField61B \\\n",
204 | "0 8 -1 18 \n",
205 | "1 11 -1 17 \n",
206 | "2 21 -1 11 \n",
207 | "3 10 -1 9 \n",
208 | "4 10 -1 11 \n",
209 | "\n",
210 | " GeographicField62A GeographicField62B GeographicField63 \\\n",
211 | "0 -1 10 N \n",
212 | "1 -1 20 N \n",
213 | "2 -1 8 N \n",
214 | "3 -1 21 N \n",
215 | "4 -1 12 N \n",
216 | "\n",
217 | " GeographicField64 \n",
218 | "0 CA \n",
219 | "1 NJ \n",
220 | "2 NJ \n",
221 | "3 TX \n",
222 | "4 IL \n",
223 | "\n",
224 | "[5 rows x 299 columns]"
225 | ]
226 | },
227 | "execution_count": 1,
228 | "metadata": {},
229 | "output_type": "execute_result"
230 | }
231 | ],
232 | "source": [
233 | "#data from this kaggle comp.: https://www.kaggle.com/c/homesite-quote-conversion\n",
234 | "#I NEED TO ADD MORE COMMENTS, I KNOW!\n",
235 | "import numpy as np\n",
236 | "import pandas as pd\n",
237 | "import matplotlib.pyplot as plt\n",
238 | "%matplotlib inline\n",
239 | "pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier\n",
240 | "##check out tutorial here:\n",
241 | "##http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%201%20-%20Reading%20from%20a%20CSV.ipynb\n",
242 | "df_train = pd.read_csv('train.csv')\n",
243 | "print np.shape(df_train)\n",
244 | "df_train.head()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {
251 | "collapsed": false
252 | },
253 | "outputs": [],
254 | "source": [
255 | "##CHOOSE A SUBSET TO WORK WITH INITIALLY\n",
256 | "# df_train = df_train[0:10000]\n",
257 | "df_train.head()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 2,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/html": [
270 | "\n",
271 | "
\n",
272 | " \n",
273 | " \n",
274 | " | \n",
275 | " QuoteNumber | \n",
276 | " Original_Quote_Date | \n",
277 | " Field6 | \n",
278 | " Field7 | \n",
279 | " Field8 | \n",
280 | " Field9 | \n",
281 | " Field10 | \n",
282 | " Field11 | \n",
283 | " Field12 | \n",
284 | " CoverageField1A | \n",
285 | " ... | \n",
286 | " GeographicField59A | \n",
287 | " GeographicField59B | \n",
288 | " GeographicField60A | \n",
289 | " GeographicField60B | \n",
290 | " GeographicField61A | \n",
291 | " GeographicField61B | \n",
292 | " GeographicField62A | \n",
293 | " GeographicField62B | \n",
294 | " GeographicField63 | \n",
295 | " GeographicField64 | \n",
296 | "
\n",
297 | " \n",
298 | " \n",
299 | " \n",
300 | " 0 | \n",
301 | " 3 | \n",
302 | " 2014-08-12 | \n",
303 | " E | \n",
304 | " 16 | \n",
305 | " 0.9364 | \n",
306 | " 0.0006 | \n",
307 | " 1,487 | \n",
308 | " 1.3045 | \n",
309 | " N | \n",
310 | " 4 | \n",
311 | " ... | \n",
312 | " 1 | \n",
313 | " 1 | \n",
314 | " -1 | \n",
315 | " 1 | \n",
316 | " -1 | \n",
317 | " 20 | \n",
318 | " -1 | \n",
319 | " 25 | \n",
320 | " Y | \n",
321 | " IL | \n",
322 | "
\n",
323 | " \n",
324 | " 1 | \n",
325 | " 5 | \n",
326 | " 2013-09-07 | \n",
327 | " F | \n",
328 | " 11 | \n",
329 | " 0.9919 | \n",
330 | " 0.0038 | \n",
331 | " 564 | \n",
332 | " 1.1886 | \n",
333 | " N | \n",
334 | " 8 | \n",
335 | " ... | \n",
336 | " 10 | \n",
337 | " 10 | \n",
338 | " -1 | \n",
339 | " 5 | \n",
340 | " -1 | \n",
341 | " 5 | \n",
342 | " -1 | \n",
343 | " 21 | \n",
344 | " N | \n",
345 | " NJ | \n",
346 | "
\n",
347 | " \n",
348 | " 2 | \n",
349 | " 7 | \n",
350 | " 2013-03-29 | \n",
351 | " F | \n",
352 | " 15 | \n",
353 | " 0.8945 | \n",
354 | " 0.0038 | \n",
355 | " 564 | \n",
356 | " 1.0670 | \n",
357 | " N | \n",
358 | " 11 | \n",
359 | " ... | \n",
360 | " 10 | \n",
361 | " 11 | \n",
362 | " -1 | \n",
363 | " 20 | \n",
364 | " -1 | \n",
365 | " 22 | \n",
366 | " -1 | \n",
367 | " 11 | \n",
368 | " N | \n",
369 | " NJ | \n",
370 | "
\n",
371 | " \n",
372 | " 3 | \n",
373 | " 9 | \n",
374 | " 2015-03-21 | \n",
375 | " K | \n",
376 | " 21 | \n",
377 | " 0.8870 | \n",
378 | " 0.0004 | \n",
379 | " 1,113 | \n",
380 | " 1.2665 | \n",
381 | " Y | \n",
382 | " 14 | \n",
383 | " ... | \n",
384 | " 8 | \n",
385 | " 8 | \n",
386 | " -1 | \n",
387 | " 13 | \n",
388 | " -1 | \n",
389 | " 8 | \n",
390 | " -1 | \n",
391 | " 21 | \n",
392 | " N | \n",
393 | " TX | \n",
394 | "
\n",
395 | " \n",
396 | " 4 | \n",
397 | " 10 | \n",
398 | " 2014-12-10 | \n",
399 | " B | \n",
400 | " 25 | \n",
401 | " 0.9153 | \n",
402 | " 0.0007 | \n",
403 | " 935 | \n",
404 | " 1.0200 | \n",
405 | " N | \n",
406 | " 4 | \n",
407 | " ... | \n",
408 | " 7 | \n",
409 | " 7 | \n",
410 | " -1 | \n",
411 | " 3 | \n",
412 | " -1 | \n",
413 | " 22 | \n",
414 | " -1 | \n",
415 | " 21 | \n",
416 | " N | \n",
417 | " CA | \n",
418 | "
\n",
419 | " \n",
420 | "
\n",
421 | "
5 rows × 298 columns
\n",
422 | "
"
423 | ],
424 | "text/plain": [
425 | " QuoteNumber Original_Quote_Date Field6 Field7 Field8 Field9 Field10 \\\n",
426 | "0 3 2014-08-12 E 16 0.9364 0.0006 1,487 \n",
427 | "1 5 2013-09-07 F 11 0.9919 0.0038 564 \n",
428 | "2 7 2013-03-29 F 15 0.8945 0.0038 564 \n",
429 | "3 9 2015-03-21 K 21 0.8870 0.0004 1,113 \n",
430 | "4 10 2014-12-10 B 25 0.9153 0.0007 935 \n",
431 | "\n",
432 | " Field11 Field12 CoverageField1A ... GeographicField59A \\\n",
433 | "0 1.3045 N 4 ... 1 \n",
434 | "1 1.1886 N 8 ... 10 \n",
435 | "2 1.0670 N 11 ... 10 \n",
436 | "3 1.2665 Y 14 ... 8 \n",
437 | "4 1.0200 N 4 ... 7 \n",
438 | "\n",
439 | " GeographicField59B GeographicField60A GeographicField60B \\\n",
440 | "0 1 -1 1 \n",
441 | "1 10 -1 5 \n",
442 | "2 11 -1 20 \n",
443 | "3 8 -1 13 \n",
444 | "4 7 -1 3 \n",
445 | "\n",
446 | " GeographicField61A GeographicField61B GeographicField62A \\\n",
447 | "0 -1 20 -1 \n",
448 | "1 -1 5 -1 \n",
449 | "2 -1 22 -1 \n",
450 | "3 -1 8 -1 \n",
451 | "4 -1 22 -1 \n",
452 | "\n",
453 | " GeographicField62B GeographicField63 GeographicField64 \n",
454 | "0 25 Y IL \n",
455 | "1 21 N NJ \n",
456 | "2 11 N NJ \n",
457 | "3 21 N TX \n",
458 | "4 21 N CA \n",
459 | "\n",
460 | "[5 rows x 298 columns]"
461 | ]
462 | },
463 | "execution_count": 2,
464 | "metadata": {},
465 | "output_type": "execute_result"
466 | }
467 | ],
468 | "source": [
469 | "X_test = pd.read_csv('test.csv')\n",
470 | "# X_test = X_test[0:5000]\n",
471 | "X_test.head()"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "WE FIRST DEAL WITH MISSING VALUES
"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 3,
484 | "metadata": {
485 | "collapsed": false
486 | },
487 | "outputs": [
488 | {
489 | "data": {
490 | "text/plain": [
491 | "(434589, 1489)"
492 | ]
493 | },
494 | "execution_count": 3,
495 | "metadata": {},
496 | "output_type": "execute_result"
497 | }
498 | ],
499 | "source": [
500 | "from sklearn.decomposition import PCA #import principal component analysis\n",
501 | "from sklearn.preprocessing import Imputer\n",
502 | "from sklearn.preprocessing import scale\n",
503 | "df_train_nt = df_train.drop('QuoteConversion_Flag', 1)\n",
504 | "from sklearn.decomposition import PCA #import principal component analysis\n",
505 | "from sklearn.preprocessing import Imputer\n",
506 | "from sklearn.preprocessing import scale\n",
507 | "# df_train_nt = df_train.drop('QuoteConversion_Flag', 1)\n",
508 | "frames = [df_train_nt , X_test]\n",
509 | "X = pd.concat( frames )\n",
510 | "X_hot = pd.get_dummies( X )\n",
511 | "imp = Imputer(missing_values='NaN', strategy='mean', axis=0)\n",
512 | "imp.fit( X_hot )\n",
513 | "X_hot_imp = imp.transform( X_hot )\n",
514 | "df_all = scale(X_hot_imp) #scaled data\n",
515 | "np.shape(df_all)"
516 | ]
517 | },
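{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Caveat sketch (optional): fitting the imputer/scaler on train+test together lets test\n",
"#statistics leak into the preprocessing. A leak-free variant fits on the training rows\n",
"#only and then transforms everything:\n",
"from sklearn.preprocessing import StandardScaler\n",
"n_tr = len(df_train_nt)\n",
"imp_tr = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(X_hot[:n_tr])\n",
"X_imp = imp_tr.transform(X_hot)\n",
"scaler = StandardScaler().fit(X_imp[:n_tr])\n",
"df_all_noleak = scaler.transform(X_imp)"
]
},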
518 | {
519 | "cell_type": "code",
520 | "execution_count": 31,
521 | "metadata": {
522 | "collapsed": false
523 | },
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "(260753,)\n"
530 | ]
531 | }
532 | ],
533 | "source": [
534 | "# # X_train = df_all[0:10000]\n",
535 | "# X_train = df_all[0:260753]\n",
536 | "# # X_test = df_all[10000:]\n",
537 | "X_test = df_all[260753:]\n",
538 | "# print np.shape(X_train)\n",
539 | "# print np.shape(X_test)\n",
540 | "# # y_train = df_train['QuoteConversion_Flag'][0:10000]\n",
541 | "# y_train = df_train['QuoteConversion_Flag'][0:260753]\n",
542 | "X_train = df_all[0:260753]\n",
543 | "y_train = df_train['QuoteConversion_Flag'][0:260753]\n",
544 | "print np.shape(y_train)"
545 | ]
546 | },
547 | {
548 | "cell_type": "markdown",
549 | "metadata": {
550 | "collapsed": true
551 | },
552 | "source": [
553 | "# LET'S TRY SOME GRADIENT BOOSTING!"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {
560 | "collapsed": false
561 | },
562 | "outputs": [],
563 | "source": [
564 | "# import xgboost as xgb\n",
565 | "\n",
566 | "# #also see here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md\n",
567 | "# # hacking this: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py\n",
568 | "# dtrain = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n",
569 | "# param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'reg:logistic'}\n",
570 | "# num_round = 2\n",
571 | "# print ('running cross validation')\n",
572 | "# bst_cv = xgb.cv(param, dtrain, num_round, nfold=5,\n",
573 | "# metrics={'error'}, seed = 0)\n",
574 | "\n",
575 | "# print 'done'\n",
576 | "\n",
577 | "#you actually wasnt to use GridSearchCV, I think:\n",
578 | "#https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": null,
584 | "metadata": {
585 | "collapsed": false
586 | },
587 | "outputs": [],
588 | "source": [
589 | "# type(bst_cv)\n",
590 | "# print bst_cv\n",
591 | "# # predictions = bst_cv.predict(xgb.DMatrix(X_test))"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 38,
597 | "metadata": {
598 | "collapsed": false
599 | },
600 | "outputs": [],
601 | "source": [
602 | "#Gradient boosting a la BL\n",
603 | "import xgboost as xgb\n",
604 | "model_xgb = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n",
605 | "bst = xgb.train({'max_depth':10 , 'n_estimators': 50 , 'objective':'reg:logistic'},dtrain=model_xgb)\n",
606 | "predictions = bst.predict(xgb.DMatrix(X_test))"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 33,
612 | "metadata": {
613 | "collapsed": false
614 | },
615 | "outputs": [
616 | {
617 | "name": "stdout",
618 | "output_type": "stream",
619 | "text": [
620 | "(173836,)\n",
621 | "[ 0.04766377 0.1459558 0.12043708 ..., 0.45562789 0.04766377\n",
622 | " 0.2594822 ]\n"
623 | ]
624 | }
625 | ],
626 | "source": [
627 | "print np.shape(predictions)\n",
628 | "print predictions"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 39,
634 | "metadata": {
635 | "collapsed": false
636 | },
637 | "outputs": [],
638 | "source": [
639 | "X_test_q = pd.read_csv('test.csv')\n",
640 | "output =pd.DataFrame()\n",
641 | "output[\"QuoteNumber\"] = X_test_q[\"QuoteNumber\"]\n",
642 | "output[\"QuoteConversion_Flag\"] = predictions\n",
643 | "output.to_csv(\"pred_test_6.csv\", index=False)"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 20,
649 | "metadata": {
650 | "collapsed": false
651 | },
652 | "outputs": [
653 | {
654 | "name": "stderr",
655 | "output_type": "stream",
656 | "text": [
657 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
658 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
659 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/grid_search.py:43: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
660 | " DeprecationWarning)\n"
661 | ]
662 | }
663 | ],
664 | "source": [
665 | "from sklearn.cross_validation import KFold, train_test_split\n",
666 | "from sklearn.metrics import confusion_matrix, mean_squared_error\n",
667 | "from sklearn.grid_search import GridSearchCV"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 27,
673 | "metadata": {
674 | "collapsed": false
675 | },
676 | "outputs": [
677 | {
678 | "name": "stdout",
679 | "output_type": "stream",
680 | "text": [
681 | "0.100004890254\n",
682 | "0.0794442980374\n",
683 | "Parameter optimization\n"
684 | ]
685 | }
686 | ],
687 | "source": [
688 | "#taking lead from here: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py\n",
689 | "rng = np.random.RandomState(31337)\n",
690 | "kf = KFold(y_train.shape[0], n_folds=2, shuffle=True, random_state=rng)\n",
691 | "for train_index, test_index in kf:\n",
692 | " model_xgb = xgb.DMatrix(np.array(X_train[train_index]), label=np.array(y_train[train_index]))\n",
693 | " bst = xgb.train({'max_depth':6 , 'n_estimators': 150 , 'objective':'reg:logistic'},dtrain=model_xgb)\n",
694 | " predictions = bst.predict(xgb.DMatrix(X_train[test_index]))\n",
695 | " actuals = y_train[test_index]\n",
696 | " print(mean_squared_error(actuals, predictions))\n",
697 | "\n",
698 | "print(\"Parameter optimization\")"
699 | ]
700 | },
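{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#The competition is scored on AUC, so checking that alongside MSE may be more informative\n",
"#(uses actuals/predictions left over from the last fold of the loop above):\n",
"from sklearn.metrics import roc_auc_score\n",
"print roc_auc_score(actuals, predictions)"
]
},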
701 | {
702 | "cell_type": "code",
703 | "execution_count": 37,
704 | "metadata": {
705 | "collapsed": false
706 | },
707 | "outputs": [
708 | {
709 | "name": "stdout",
710 | "output_type": "stream",
711 | "text": [
712 | "Fitting 3 folds for each of 15 candidates, totalling 45 fits\n",
713 | "0.644338929872\n",
714 | "{'n_estimators': 50, 'max_depth': 10}\n"
715 | ]
716 | },
717 | {
718 | "name": "stderr",
719 | "output_type": "stream",
720 | "text": [
721 | "[Parallel(n_jobs=1)]: Done 45 out of 45 | elapsed: 897.3min finished\n"
722 | ]
723 | }
724 | ],
725 | "source": [
726 | "xgb_model = xgb.XGBRegressor()\n",
727 | "clf = GridSearchCV(xgb_model,\n",
728 | " {'max_depth': [2,4,6,8,10],\n",
729 | " 'n_estimators': [50,100,200]}, verbose=1)\n",
730 | "clf.fit(X_train,y_train)\n",
731 | "print(clf.best_score_)\n",
732 | "print(clf.best_params_)"
733 | ]
734 | }
744 | ],
745 | "metadata": {
746 | "kernelspec": {
747 | "display_name": "Python 2",
748 | "language": "python",
749 | "name": "python2"
750 | },
751 | "language_info": {
752 | "codemirror_mode": {
753 | "name": "ipython",
754 | "version": 2
755 | },
756 | "file_extension": ".py",
757 | "mimetype": "text/x-python",
758 | "name": "python",
759 | "nbconvert_exporter": "python",
760 | "pygments_lexer": "ipython2",
761 | "version": "2.7.10"
762 | }
763 | },
764 | "nbformat": 4,
765 | "nbformat_minor": 0
766 | }
767 |
--------------------------------------------------------------------------------
/paribas/README.md:
--------------------------------------------------------------------------------
1 | for this kaggle comp.: https://www.kaggle.com/c/bnp-paribas-cardif-claims-management
--------------------------------------------------------------------------------
/paribas/boosting_in_barbados.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.\n",
12 | "#Import libraries:\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "import xgboost as xgb\n",
16 | "import time\n",
17 | "#load data:\n",
18 | "train = pd.read_csv(\"train.csv\")\n",
19 | "target = train['target']\n",
20 | "#drop targets & (unique row) IDs from training data\n",
21 | "train = train.drop(['ID','target'],axis=1)\n",
22 | "test = pd.read_csv(\"test.csv\")\n",
23 | "IDs = test['ID'].values\n",
24 | "test = test.drop(['ID'],axis=1)"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "# PREPROCESSING"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "#impute both numerical & categorical features a la\n",
43 | "#http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn\n",
44 | "\n",
45 | "from sklearn.base import TransformerMixin\n",
46 | "\n",
47 | "class DataFrameImputer(TransformerMixin):\n",
48 | "\n",
49 | " def __init__(self):\n",
50 | " \"\"\"Impute missing values.\n",
51 | "\n",
52 | " Columns of dtype object are imputed with the most frequent value \n",
53 | " in column.\n",
54 | "\n",
55 | " Columns of other types are imputed with mean of column.\n",
56 | "\n",
57 | " \"\"\"\n",
58 | " def fit(self, X, y=None):\n",
59 | "\n",
60 | " self.fill = pd.Series([X[c].value_counts().index[0]\n",
61 | " if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],\n",
62 | " index=X.columns)\n",
63 | "\n",
64 | " return self\n",
65 | "\n",
66 | " def transform(self, X, y=None):\n",
67 | " return X.fillna(self.fill)\n",
68 | " \n",
69 | "xtrain = DataFrameImputer().fit_transform( train )\n",
70 | "xtest = DataFrameImputer().fit_transform( test )"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 6,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "#factorize categorical columns:\n",
82 | "for column in xtrain:\n",
83 | " if xtrain[column].dtype == 'O':\n",
84 | "# print pd.factorize(xtrain[column])\n",
85 | " xtrain[column] = pd.factorize(xtrain[column])[0]\n",
86 | " \n",
87 | "for column in xtest:\n",
88 | " if xtest[column].dtype == 'O':\n",
89 | "# print pd.factorize(xtrain[column])\n",
90 | " xtest[column] = pd.factorize(xtest[column])[0]"
91 | ]
92 | },
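{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Caveat: factorizing train and test separately can map the same category to different\n",
"#integers. A consistent sketch (to run instead of the cell above, not after it) encodes\n",
"#test with the categories learned from train; unseen test categories become -1:\n",
"for column in xtrain:\n",
"    if xtrain[column].dtype == 'O':\n",
"        codes, uniques = pd.factorize(xtrain[column])\n",
"        xtrain[column] = codes\n",
"        xtest[column] = uniques.get_indexer(xtest[column])"
]
},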
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "Next up: scaling/transforms/get_dummies/dimensionality reduction"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "# GRADIENT BOOSTING & CROSS VALIDATION"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 8,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stderr",
116 | "output_type": "stream",
117 | "text": [
118 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
119 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "#check this out: http://xgboost.readthedocs.org/en/latest/model.html\n",
125 | "from sklearn.cross_validation import KFold, train_test_split\n",
126 | "X = xtrain.values\n",
127 | "y = target.values\n",
128 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1 , random_state=0)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 9,
134 | "metadata": {
135 | "collapsed": true
136 | },
137 | "outputs": [
138 | {
139 | "name": "stderr",
140 | "output_type": "stream",
141 | "text": [
142 | "Will train until validation_0 error hasn't decreased in 50 rounds.\n",
143 | "[0]\tvalidation_0-logloss:0.660512\n",
144 | "[1]\tvalidation_0-logloss:0.633833\n",
145 | "[2]\tvalidation_0-logloss:0.611452\n",
146 | "[3]\tvalidation_0-logloss:0.592876\n",
147 | "[4]\tvalidation_0-logloss:0.577408\n",
148 | "[5]\tvalidation_0-logloss:0.564382\n",
149 | "[6]\tvalidation_0-logloss:0.553405\n",
150 | "[7]\tvalidation_0-logloss:0.543860\n",
151 | "[8]\tvalidation_0-logloss:0.536023\n",
152 | "[9]\tvalidation_0-logloss:0.528992\n",
153 | "[10]\tvalidation_0-logloss:0.523194\n",
154 | "[11]\tvalidation_0-logloss:0.518142\n",
155 | "[12]\tvalidation_0-logloss:0.514029\n",
156 | "[13]\tvalidation_0-logloss:0.510511\n",
157 | "[14]\tvalidation_0-logloss:0.507166\n",
158 | "[15]\tvalidation_0-logloss:0.504451\n",
159 | "[16]\tvalidation_0-logloss:0.502286\n",
160 | "[17]\tvalidation_0-logloss:0.500022\n",
161 | "[18]\tvalidation_0-logloss:0.498165\n",
162 | "[19]\tvalidation_0-logloss:0.496556\n",
163 | "[20]\tvalidation_0-logloss:0.495312\n",
164 | "[21]\tvalidation_0-logloss:0.493927\n",
165 | "[22]\tvalidation_0-logloss:0.492970\n",
166 | "[23]\tvalidation_0-logloss:0.491965\n",
167 | "[24]\tvalidation_0-logloss:0.491137\n",
168 | "[25]\tvalidation_0-logloss:0.490338\n",
169 | "[26]\tvalidation_0-logloss:0.489665\n",
170 | "[27]\tvalidation_0-logloss:0.489089\n",
171 | "[28]\tvalidation_0-logloss:0.488625\n",
172 | "[29]\tvalidation_0-logloss:0.488127\n",
173 | "[30]\tvalidation_0-logloss:0.487602\n",
174 | "[31]\tvalidation_0-logloss:0.487334\n",
175 | "[32]\tvalidation_0-logloss:0.486997\n",
176 | "[33]\tvalidation_0-logloss:0.486487\n",
177 | "[34]\tvalidation_0-logloss:0.486237\n",
178 | "[35]\tvalidation_0-logloss:0.485890\n",
179 | "[36]\tvalidation_0-logloss:0.485579\n",
180 | "[37]\tvalidation_0-logloss:0.485430\n",
181 | "[38]\tvalidation_0-logloss:0.485202\n",
182 | "[39]\tvalidation_0-logloss:0.484802\n",
183 | "[40]\tvalidation_0-logloss:0.484583\n",
184 | "[41]\tvalidation_0-logloss:0.484348\n",
185 | "[42]\tvalidation_0-logloss:0.484242\n",
186 | "[43]\tvalidation_0-logloss:0.483940\n",
187 | "[44]\tvalidation_0-logloss:0.483843\n",
188 | "[45]\tvalidation_0-logloss:0.483686\n",
189 | "[46]\tvalidation_0-logloss:0.483566\n",
190 | "[47]\tvalidation_0-logloss:0.483306\n",
191 | "[48]\tvalidation_0-logloss:0.482803\n",
192 | "[49]\tvalidation_0-logloss:0.482678\n",
193 | "[50]\tvalidation_0-logloss:0.482645\n",
194 | "[51]\tvalidation_0-logloss:0.482599\n",
195 | "[52]\tvalidation_0-logloss:0.482483\n",
196 | "[53]\tvalidation_0-logloss:0.482222\n",
197 | "[54]\tvalidation_0-logloss:0.482145\n",
198 | "[55]\tvalidation_0-logloss:0.482033\n",
199 | "[56]\tvalidation_0-logloss:0.481891\n",
200 | "[57]\tvalidation_0-logloss:0.481790\n",
201 | "[58]\tvalidation_0-logloss:0.481366\n",
202 | "[59]\tvalidation_0-logloss:0.481314\n",
203 | "[60]\tvalidation_0-logloss:0.481271\n",
204 | "[61]\tvalidation_0-logloss:0.481124\n",
205 | "[62]\tvalidation_0-logloss:0.481000\n",
206 | "[63]\tvalidation_0-logloss:0.480625\n",
207 | "[64]\tvalidation_0-logloss:0.480605\n",
208 | "[65]\tvalidation_0-logloss:0.480526\n",
209 | "[66]\tvalidation_0-logloss:0.480480\n",
210 | "[67]\tvalidation_0-logloss:0.480338\n",
211 | "[68]\tvalidation_0-logloss:0.480286\n",
212 | "[69]\tvalidation_0-logloss:0.480209\n",
213 | "[70]\tvalidation_0-logloss:0.480185\n",
214 | "[71]\tvalidation_0-logloss:0.480126\n",
215 | "[72]\tvalidation_0-logloss:0.480102\n",
216 | "[73]\tvalidation_0-logloss:0.479896\n",
217 | "[74]\tvalidation_0-logloss:0.479858\n",
218 | "[75]\tvalidation_0-logloss:0.479808\n",
219 | "[76]\tvalidation_0-logloss:0.479728\n",
220 | "[77]\tvalidation_0-logloss:0.479648\n",
221 | "[78]\tvalidation_0-logloss:0.479611\n",
222 | "[79]\tvalidation_0-logloss:0.479599\n",
223 | "[80]\tvalidation_0-logloss:0.479577\n",
224 | "[81]\tvalidation_0-logloss:0.479537\n",
225 | "[82]\tvalidation_0-logloss:0.479479\n",
226 | "[83]\tvalidation_0-logloss:0.479466\n",
227 | "[84]\tvalidation_0-logloss:0.479452\n",
228 | "[85]\tvalidation_0-logloss:0.479426\n",
229 | "[86]\tvalidation_0-logloss:0.479434\n",
230 | "[87]\tvalidation_0-logloss:0.479411\n",
231 | "[88]\tvalidation_0-logloss:0.479377\n",
232 | "[89]\tvalidation_0-logloss:0.479222\n",
233 | "[90]\tvalidation_0-logloss:0.479141\n",
234 | "[91]\tvalidation_0-logloss:0.479086\n",
235 | "[92]\tvalidation_0-logloss:0.479091\n",
236 | "[93]\tvalidation_0-logloss:0.479068\n",
237 | "[94]\tvalidation_0-logloss:0.479094\n",
238 | "[95]\tvalidation_0-logloss:0.479089\n",
239 | "[96]\tvalidation_0-logloss:0.479038\n",
240 | "[97]\tvalidation_0-logloss:0.479036\n",
241 | "[98]\tvalidation_0-logloss:0.478962\n",
242 | "[99]\tvalidation_0-logloss:0.478884\n"
243 | ]
244 | },
245 | {
246 | "data": {
247 | "text/plain": [
248 | "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n",
249 | " gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
250 | " min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n",
251 | " objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n",
252 | " scale_pos_weight=1, seed=0, silent=True, subsample=1)"
253 | ]
254 | },
255 | "execution_count": 9,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "# Early-stopping\n",
262 | "#http://xgboost.readthedocs.org/en/latest/python/python_intro.html#early-stopping\n",
263 | "#Also see https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py (Jamie Hall et al.)\n",
264 | "clf = xgb.XGBClassifier()\n",
265 | "clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric=\"logloss\",\n",
266 | " eval_set=[(X_test, y_test)])"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 14,
272 | "metadata": {
273 | "collapsed": false
274 | },
275 | "outputs": [],
276 | "source": [
277 | "preds = clf.predict_proba(xtest.values, ntree_limit=clf.best_iteration)[:,1]"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 16,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [],
287 | "source": [
288 | "import csv\n",
289 | "predictions_file = open(\"xgboost_predictions.csv\", \"w\")\n",
290 | "open_file_object = csv.writer(predictions_file)\n",
291 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n",
292 | "open_file_object.writerows(zip(IDs, preds))\n",
293 | "predictions_file.close()"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {
299 | "collapsed": true
300 | },
301 | "source": [
302 | "This above performed okay: logloss = -0.5252. But I think we need to increase num_rounds and at least try to change preprocessing:"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "# TESTING ANOTHER APPROACH"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "Loading & preprocessing:"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 53,
322 | "metadata": {
323 | "collapsed": false
324 | },
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "Load data...\n",
331 | "Clearing...\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n",
337 | "print('Load data...')\n",
338 | "train = pd.read_csv(\"train.csv\")\n",
339 | "target = train['target']\n",
340 | "train = train.drop(['ID','target'],axis=1)\n",
341 | "test = pd.read_csv(\"test.csv\")\n",
342 | "ids = test['ID'].values\n",
343 | "test = test.drop(['ID'],axis=1)\n",
344 | "#\n",
345 | "print('Clearing...')\n",
346 | "for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):\n",
347 | " if train_series.dtype == 'O':\n",
348 | " #for objects: factorize\n",
349 | " train[train_name], tmp_indexer = pd.factorize(train[train_name])\n",
350 | " test[test_name] = tmp_indexer.get_indexer(test[test_name])\n",
351 | " #but now we have -1 values (NaN)\n",
352 | " else:\n",
353 | " #for int or float: fill NaN\n",
354 | " tmp_len = len(train[train_series.isnull()])\n",
355 | " if tmp_len>0:\n",
356 | " train.loc[train_series.isnull(), train_name] = train_series.mean()\n",
357 | " #and Test\n",
358 | " tmp_len = len(test[test_series.isnull()])\n",
359 | " if tmp_len>0:\n",
360 | " test.loc[test_series.isnull(), test_name] = train_series.mean() #TODO"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "A little function to report best scores (from cross validation):"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 60,
373 | "metadata": {
374 | "collapsed": true
375 | },
376 | "outputs": [],
377 | "source": [
378 | "from operator import itemgetter\n",
379 | "# Utility function to report best scores\n",
380 | "def report(grid_scores, n_top=3):\n",
381 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n",
382 | " for i, score in enumerate(top_scores):\n",
383 | " print(\"Model with rank: {0}\".format(i + 1))\n",
384 | " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n",
385 | " score.mean_validation_score,\n",
386 | " np.std(score.cv_validation_scores)))\n",
387 | " print(\"Parameters: {0}\".format(score.parameters))\n",
388 | " print(\"\")"
389 | ]
390 | },
391 | {
392 | "cell_type": "markdown",
393 | "metadata": {},
394 | "source": [
395 | "Now we perform a randomizedsearchCV over a number of parameters (using xgb.XGBClassifier()) --\n",
396 | "I do this because I don't know how to do it with xgb.train() --\n",
397 | "Important question: what is the relation between these two xgb.train() & xgb.XGBClassifier()?\n",
398 | "This is important because I can only do hyperparameter tuning on the latter AND I can only alter num_rounds on the former (which is necessary for a good model, it seems). Any thoughts?"
399 | ]
400 | },
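{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#My current understanding, as a sketch (worth verifying against the xgboost docs):\n",
"#xgb.XGBClassifier is the scikit-learn wrapper around the same booster that xgb.train\n",
"#builds, and its n_estimators plays the role of xgb.train's num_boost_round. So the\n",
"#wrapper can both be tuned with sklearn search tools AND be given many rounds.\n",
"params = {'max_depth': 6, 'learning_rate': 0.1, 'objective': 'binary:logistic'}\n",
"n_rounds = 150\n",
"\n",
"#native API: rounds are a separate argument\n",
"bst_native = xgb.train(params, xgb.DMatrix(train.values, target.values), num_boost_round=n_rounds)\n",
"\n",
"#sklearn wrapper: rounds are the n_estimators hyperparameter\n",
"clf_wrapper = xgb.XGBClassifier(n_estimators=n_rounds, max_depth=6, learning_rate=0.1)\n",
"clf_wrapper.fit(train.values, target.values)"
]
},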
401 | {
402 | "cell_type": "code",
403 | "execution_count": 65,
404 | "metadata": {
405 | "collapsed": false
406 | },
407 | "outputs": [
408 | {
409 | "name": "stdout",
410 | "output_type": "stream",
411 | "text": [
412 | "5031.24653196\n",
413 | "Model with rank: 1\n",
414 | "Mean validation score: -0.515 (std: 0.000)\n",
415 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.80000000000000004, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 11}\n",
416 | "\n",
417 | "Model with rank: 2\n",
418 | "Mean validation score: -0.516 (std: 0.000)\n",
419 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.40000000000000002, 'learning_rate': 0.01, 'colsample_bytree': 1.0, 'max_depth': 12}\n",
420 | "\n",
421 | "Model with rank: 3\n",
422 | "Mean validation score: -0.516 (std: 0.000)\n",
423 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.5, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 13}\n",
424 | "\n",
425 | "None\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "# X = train.values\n",
431 | "# y = target.values\n",
432 | "#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/16627/help-with-xgboost-sklearn-randomized-grid-search\n",
433 | "# -*- coding: utf-8 -*-\n",
434 | "\"\"\"\n",
435 | "\"\"\"\n",
436 | "t0 = time.time()\n",
437 | "#http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html\n",
438 | "#http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html\n",
439 | "param_grid = {'max_depth': range(4,15),\n",
440 | "# 'min_child_weight': [1,40],\n",
441 | " 'objective':['binary:logistic'],\n",
442 | "# 'n_estimators':[5],\n",
443 | " 'learning_rate':[0.01], #this is same as eta\n",
444 | " 'subsample': np.arange(0.1,1.1,0.1),\n",
445 | " 'colsample_bytree': np.arange(0.1,1.1,0.1),\n",
446 | " #'scale_pos_weight': [0.5, 1]\n",
447 | " #'model__eta':[0.01,0.02],\n",
448 | " #'model__scale_pos_weight':[0.8,1.0]\n",
449 | " #'model__silent':[1],\n",
450 | " }\n",
451 | "\n",
452 | "\n",
453 | "from sklearn.grid_search import GridSearchCV, RandomizedSearchCV\n",
454 | "from sklearn import metrics\n",
455 | "\n",
456 | "xgb_model = xgb.XGBClassifier()\n",
457 | "n_iter_search=20\n",
458 | "random_search = RandomizedSearchCV(xgb_model, param_distributions=param_grid,\n",
459 | " n_iter=n_iter_search, scoring =\"log_loss\")\n",
460 | "\n",
461 | "# start = time()\n",
462 | "# training and y_training are \n",
463 | "# small dataset and target variable that I generated from the training dataset\n",
464 | "random_search.fit(train, target) \n",
465 | "t1 = time.time()\n",
466 | "total_time = t1 - t0\n",
467 | "print total_time\n",
468 | "\n",
469 | "print report(random_search.grid_scores_)\n",
470 | "xgb_model_best = xgb.XGBClassifier()\n",
471 | "xgb_model_best.set_params(**random_search.best_params_)\n",
472 | "#http://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python\n",
473 | "xgb_model_best.fit(X , y)\n",
474 | "preds = xgb_model_best.predict_proba(xtest.values)[:,1]\n",
475 | "#also see this! https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings/forums/t/18494/gridsearchcv-on-xgboost/105272"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 66,
481 | "metadata": {
482 | "collapsed": true
483 | },
484 | "outputs": [],
485 | "source": [
486 | "import csv\n",
487 | "predictions_file = open(\"xgb_rgs_larger_predictions.csv\", \"w\")\n",
488 | "open_file_object = csv.writer(predictions_file)\n",
489 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n",
490 | "open_file_object.writerows(zip(IDs, preds))\n",
491 | "predictions_file.close()"
492 | ]
493 | },
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 | "This above performed ok (logloss = -.53791) but not as well as other people's xgbtrain() w/ a large num_rounds. For example, see:\n",
499 | "https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 67,
505 | "metadata": {
506 | "collapsed": true
507 | },
508 | "outputs": [],
509 | "source": [
510 | "#https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark\n",
511 | "#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/17089/beating-the-benchmark/96855\n",
512 | "#https://github.com/lenguyenthedat/kaggle-for-fun/blob/master/springleaf-marketing-response/springleaf-xgb-native.py"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "So now I'll try using the best parameters for xgb.XGBClassifier() in xgb.train() AND make num_boost_round = 200."
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "collapsed": true
527 | },
528 | "outputs": [],
529 | "source": [
530 | "#cf https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n",
531 | "t0 = time.time()\n",
532 | "xgtrain = xgb.DMatrix(train.values, target.values)\n",
533 | "xgtest = xgb.DMatrix(test.values)\n",
534 | "\n",
535 | "#Now let's fit the model\n",
536 | "print('Fit the model...')\n",
537 | "boost_round = 2000 #1800 CHANGE THIS BEFORE START\n",
538 | "clf = xgb.train(random_search.best_params_,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)\n",
539 | "\n",
540 | "#Make predict\n",
541 | "print('Predict...')\n",
542 | "preds = clf.predict(xgtest, ntree_limit=clf.best_iteration )\n",
543 | "##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py\n",
544 | "t1 = time.time()\n",
545 | "total_time = t1 - t0\n",
546 | "print total_time"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": null,
552 | "metadata": {
553 | "collapsed": true
554 | },
555 | "outputs": [],
556 | "source": [
557 | "import csv\n",
558 | "predictions_file = open(\"xgb_rgs_more_rounds_predictions.csv\", \"w\")\n",
559 | "open_file_object = csv.writer(predictions_file)\n",
560 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n",
561 | "open_file_object.writerows(zip(IDs, preds))\n",
562 | "predictions_file.close()"
563 | ]
564 | },
565 | {
566 | "cell_type": "markdown",
567 | "metadata": {},
568 | "source": [
569 | "This performed well: logloss = -0.45991 . "
570 | ]
571 | }
572 | ],
573 | "metadata": {
574 | "kernelspec": {
575 | "display_name": "Python 2",
576 | "language": "python",
577 | "name": "python2"
578 | },
579 | "language_info": {
580 | "codemirror_mode": {
581 | "name": "ipython",
582 | "version": 2
583 | },
584 | "file_extension": ".py",
585 | "mimetype": "text/x-python",
586 | "name": "python",
587 | "nbconvert_exporter": "python",
588 | "pygments_lexer": "ipython2",
589 | "version": "2.7.11"
590 | }
591 | },
592 | "nbformat": 4,
593 | "nbformat_minor": 0
594 | }
595 |
--------------------------------------------------------------------------------
/paribas/stratified_CV_with_xgboost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.\n",
12 | "#Import libraries:\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "import xgboost as xgb\n",
16 | "import time"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Stratified CV w/ XGBoost"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "Loading & preprocessing:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "Load data...\n",
45 | "Clearing...\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n",
51 | "print('Load data...')\n",
52 | "train = pd.read_csv(\"train.csv\")\n",
53 | "target = train['target']\n",
54 | "train = train.drop(['ID','target'],axis=1)\n",
55 | "test = pd.read_csv(\"test.csv\")\n",
56 | "ids = test['ID'].values\n",
57 | "test = test.drop(['ID'],axis=1)\n",
58 | "#\n",
59 | "print('Clearing...')\n",
60 | "for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):\n",
61 | " if train_series.dtype == 'O':\n",
62 | " #for objects: factorize\n",
63 | " train[train_name], tmp_indexer = pd.factorize(train[train_name])\n",
64 | " test[test_name] = tmp_indexer.get_indexer(test[test_name])\n",
65 | " #but now we have -1 values (NaN)\n",
66 | " else:\n",
67 | " #for int or float: fill NaN\n",
68 | " tmp_len = len(train[train_series.isnull()])\n",
69 | " if tmp_len>0:\n",
70 | " train.loc[train_series.isnull(), train_name] = train_series.mean()\n",
71 | " #and Test\n",
72 | " tmp_len = len(test[test_series.isnull()])\n",
73 | " if tmp_len>0:\n",
74 | " test.loc[test_series.isnull(), test_name] = train_series.mean() #TODO"
75 | ]
76 | },
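77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "Aside: what the factorize/get_indexer pair above does, on a toy example -- factorize learns integer codes on train, get_indexer applies them to test, and unseen labels (and NaN) map to -1:"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {
88 | "collapsed": true
89 | },
90 | "outputs": [],
91 | "source": [
92 | "codes, index = pd.factorize(pd.Series(['a', 'b', 'a', None]))\n",
93 | "print(codes) #[ 0  1  0 -1]\n",
94 | "print(index.get_indexer(['b', 'a', 'c'])) #[ 1  0 -1]"
95 | ]
96 | },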
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "collapsed": false
82 | },
83 | "outputs": [],
84 | "source": [
85 | "#https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19083/best-practices-for-parameter-tuning-on-models/\n",
86 | "#https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 48,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "Fit the model...\n",
101 | "408.922273874\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "t0 = time.time()\n",
107 | "xgtrain = xgb.DMatrix(train.values, target.values)\n",
108 | "xgtest = xgb.DMatrix(test.values)\n",
109 | "\n",
110 | "params = {'objective': 'binary:logistic', \n",
111 | " 'subsample': 1, \n",
112 | " 'eta': 0.1, \n",
113 | " 'colsample_bytree': 0.9, \n",
114 | " 'max_depth': 10,\n",
115 | " 'min_child_weight' : 5,\n",
116 | " 'silent':1}\n",
117 | "\n",
118 | "#Now let's fit the model\n",
119 | "print('Fit the model...')\n",
120 | "num_round = 50 #1800 CHANGE THIS BEFORE START\n",
121 | "clf = xgb.cv(params,xgtrain,num_boost_round=num_round,metrics={'logloss'}, nfold = 5 ,\n",
122 | " seed = 0 ,maximize=False)\n",
123 | "\n",
124 | "#i have attempted this with argument stratified = 1 and get the following error:\n",
125 | "#TypeError: cv() got an unexpected keyword argument 'stratified'\n",
126 | "\n",
127 | "\n",
128 | "#Make predict\n",
129 | "# print('Predict...')\n",
130 | "##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py\n",
131 | "t1 = time.time()\n",
132 | "total_time = t1 - t0\n",
133 | "print total_time"
134 | ]
135 | },
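136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Aside: the stratified keyword only landed in later xgboost releases. On a version without it, stratified folds can be rolled by hand with sklearn's StratifiedKFold; a sketch (each fold trains for num_round rounds with the params above):"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "collapsed": true
148 | },
149 | "outputs": [],
150 | "source": [
151 | "from sklearn.cross_validation import StratifiedKFold\n",
152 | "from sklearn.metrics import log_loss\n",
153 | "\n",
154 | "fold_scores = []\n",
155 | "for tr_idx, va_idx in StratifiedKFold(target.values, n_folds=5, shuffle=True, random_state=0):\n",
156 | "    dtr = xgb.DMatrix(train.values[tr_idx], target.values[tr_idx])\n",
157 | "    dva = xgb.DMatrix(train.values[va_idx])\n",
158 | "    bst = xgb.train(params, dtr, num_boost_round=num_round)\n",
159 | "    fold_scores.append(log_loss(target.values[va_idx], bst.predict(dva)))\n",
160 | "print(np.mean(fold_scores))"
161 | ]
162 | },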
136 | {
137 | "cell_type": "code",
138 | "execution_count": 49,
139 | "metadata": {
140 | "collapsed": true
141 | },
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/html": [
146 | "\n",
147 | "
\n",
148 | " \n",
149 | " \n",
150 | " | \n",
151 | " test-logloss-mean | \n",
152 | " test-logloss-std | \n",
153 | " train-logloss-mean | \n",
154 | " train-logloss-std | \n",
155 | "
\n",
156 | " \n",
157 | " \n",
158 | " \n",
159 | " 0 | \n",
160 | " 0.657933 | \n",
161 | " 0.000591 | \n",
162 | " 0.655429 | \n",
163 | " 0.000550 | \n",
164 | "
\n",
165 | " \n",
166 | " 1 | \n",
167 | " 0.628350 | \n",
168 | " 0.000739 | \n",
169 | " 0.623086 | \n",
170 | " 0.000943 | \n",
171 | "
\n",
172 | " \n",
173 | " 2 | \n",
174 | " 0.603884 | \n",
175 | " 0.001104 | \n",
176 | " 0.595802 | \n",
177 | " 0.001063 | \n",
178 | "
\n",
179 | " \n",
180 | " 3 | \n",
181 | " 0.583370 | \n",
182 | " 0.001012 | \n",
183 | " 0.572541 | \n",
184 | " 0.000836 | \n",
185 | "
\n",
186 | " \n",
187 | " 4 | \n",
188 | " 0.566755 | \n",
189 | " 0.000924 | \n",
190 | " 0.553194 | \n",
191 | " 0.000422 | \n",
192 | "
\n",
193 | " \n",
194 | " 5 | \n",
195 | " 0.552614 | \n",
196 | " 0.000855 | \n",
197 | " 0.536283 | \n",
198 | " 0.000151 | \n",
199 | "
\n",
200 | " \n",
201 | " 6 | \n",
202 | " 0.540211 | \n",
203 | " 0.000855 | \n",
204 | " 0.521392 | \n",
205 | " 0.000222 | \n",
206 | "
\n",
207 | " \n",
208 | " 7 | \n",
209 | " 0.529721 | \n",
210 | " 0.000880 | \n",
211 | " 0.508403 | \n",
212 | " 0.000270 | \n",
213 | "
\n",
214 | " \n",
215 | " 8 | \n",
216 | " 0.520790 | \n",
217 | " 0.000867 | \n",
218 | " 0.497102 | \n",
219 | " 0.000514 | \n",
220 | "
\n",
221 | " \n",
222 | " 9 | \n",
223 | " 0.513212 | \n",
224 | " 0.000905 | \n",
225 | " 0.487175 | \n",
226 | " 0.000640 | \n",
227 | "
\n",
228 | " \n",
229 | " 10 | \n",
230 | " 0.506994 | \n",
231 | " 0.000959 | \n",
232 | " 0.478506 | \n",
233 | " 0.000730 | \n",
234 | "
\n",
235 | " \n",
236 | " 11 | \n",
237 | " 0.501447 | \n",
238 | " 0.001094 | \n",
239 | " 0.470861 | \n",
240 | " 0.000729 | \n",
241 | "
\n",
242 | " \n",
243 | " 12 | \n",
244 | " 0.496789 | \n",
245 | " 0.001237 | \n",
246 | " 0.464108 | \n",
247 | " 0.000729 | \n",
248 | "
\n",
249 | " \n",
250 | " 13 | \n",
251 | " 0.492689 | \n",
252 | " 0.001184 | \n",
253 | " 0.457947 | \n",
254 | " 0.000677 | \n",
255 | "
\n",
256 | " \n",
257 | " 14 | \n",
258 | " 0.489190 | \n",
259 | " 0.001212 | \n",
260 | " 0.452237 | \n",
261 | " 0.000786 | \n",
262 | "
\n",
263 | " \n",
264 | " 15 | \n",
265 | " 0.486248 | \n",
266 | " 0.001257 | \n",
267 | " 0.447101 | \n",
268 | " 0.000871 | \n",
269 | "
\n",
270 | " \n",
271 | " 16 | \n",
272 | " 0.483681 | \n",
273 | " 0.001239 | \n",
274 | " 0.442427 | \n",
275 | " 0.000849 | \n",
276 | "
\n",
277 | " \n",
278 | " 17 | \n",
279 | " 0.481431 | \n",
280 | " 0.001276 | \n",
281 | " 0.438261 | \n",
282 | " 0.000909 | \n",
283 | "
\n",
284 | " \n",
285 | " 18 | \n",
286 | " 0.479627 | \n",
287 | " 0.001380 | \n",
288 | " 0.434358 | \n",
289 | " 0.000855 | \n",
290 | "
\n",
291 | " \n",
292 | " 19 | \n",
293 | " 0.477896 | \n",
294 | " 0.001453 | \n",
295 | " 0.430595 | \n",
296 | " 0.000764 | \n",
297 | "
\n",
298 | " \n",
299 | " 20 | \n",
300 | " 0.476538 | \n",
301 | " 0.001381 | \n",
302 | " 0.427351 | \n",
303 | " 0.000817 | \n",
304 | "
\n",
305 | " \n",
306 | " 21 | \n",
307 | " 0.475284 | \n",
308 | " 0.001424 | \n",
309 | " 0.424263 | \n",
310 | " 0.000826 | \n",
311 | "
\n",
312 | " \n",
313 | " 22 | \n",
314 | " 0.474239 | \n",
315 | " 0.001395 | \n",
316 | " 0.421472 | \n",
317 | " 0.000912 | \n",
318 | "
\n",
319 | " \n",
320 | " 23 | \n",
321 | " 0.473294 | \n",
322 | " 0.001422 | \n",
323 | " 0.418848 | \n",
324 | " 0.000894 | \n",
325 | "
\n",
326 | " \n",
327 | " 24 | \n",
328 | " 0.472517 | \n",
329 | " 0.001478 | \n",
330 | " 0.416437 | \n",
331 | " 0.000846 | \n",
332 | "
\n",
333 | " \n",
334 | " 25 | \n",
335 | " 0.471818 | \n",
336 | " 0.001503 | \n",
337 | " 0.414173 | \n",
338 | " 0.000894 | \n",
339 | "
\n",
340 | " \n",
341 | " 26 | \n",
342 | " 0.471244 | \n",
343 | " 0.001540 | \n",
344 | " 0.412018 | \n",
345 | " 0.000970 | \n",
346 | "
\n",
347 | " \n",
348 | " 27 | \n",
349 | " 0.470718 | \n",
350 | " 0.001583 | \n",
351 | " 0.410244 | \n",
352 | " 0.001135 | \n",
353 | "
\n",
354 | " \n",
355 | " 28 | \n",
356 | " 0.470290 | \n",
357 | " 0.001598 | \n",
358 | " 0.408401 | \n",
359 | " 0.001012 | \n",
360 | "
\n",
361 | " \n",
362 | " 29 | \n",
363 | " 0.469917 | \n",
364 | " 0.001631 | \n",
365 | " 0.406792 | \n",
366 | " 0.001054 | \n",
367 | "
\n",
368 | " \n",
369 | " 30 | \n",
370 | " 0.469591 | \n",
371 | " 0.001632 | \n",
372 | " 0.405117 | \n",
373 | " 0.001180 | \n",
374 | "
\n",
375 | " \n",
376 | " 31 | \n",
377 | " 0.469236 | \n",
378 | " 0.001656 | \n",
379 | " 0.403349 | \n",
380 | " 0.000978 | \n",
381 | "
\n",
382 | " \n",
383 | " 32 | \n",
384 | " 0.468996 | \n",
385 | " 0.001689 | \n",
386 | " 0.401859 | \n",
387 | " 0.000776 | \n",
388 | "
\n",
389 | " \n",
390 | " 33 | \n",
391 | " 0.468792 | \n",
392 | " 0.001664 | \n",
393 | " 0.400643 | \n",
394 | " 0.000672 | \n",
395 | "
\n",
396 | " \n",
397 | " 34 | \n",
398 | " 0.468562 | \n",
399 | " 0.001643 | \n",
400 | " 0.399290 | \n",
401 | " 0.000633 | \n",
402 | "
\n",
403 | " \n",
404 | " 35 | \n",
405 | " 0.468300 | \n",
406 | " 0.001700 | \n",
407 | " 0.397969 | \n",
408 | " 0.000679 | \n",
409 | "
\n",
410 | " \n",
411 | " 36 | \n",
412 | " 0.468079 | \n",
413 | " 0.001699 | \n",
414 | " 0.396654 | \n",
415 | " 0.000702 | \n",
416 | "
\n",
417 | " \n",
418 | " 37 | \n",
419 | " 0.467950 | \n",
420 | " 0.001706 | \n",
421 | " 0.395548 | \n",
422 | " 0.000823 | \n",
423 | "
\n",
424 | " \n",
425 | " 38 | \n",
426 | " 0.467791 | \n",
427 | " 0.001684 | \n",
428 | " 0.394453 | \n",
429 | " 0.000800 | \n",
430 | "
\n",
431 | " \n",
432 | " 39 | \n",
433 | " 0.467616 | \n",
434 | " 0.001657 | \n",
435 | " 0.393437 | \n",
436 | " 0.000796 | \n",
437 | "
\n",
438 | " \n",
439 | " 40 | \n",
440 | " 0.467466 | \n",
441 | " 0.001650 | \n",
442 | " 0.392171 | \n",
443 | " 0.000925 | \n",
444 | "
\n",
445 | " \n",
446 | " 41 | \n",
447 | " 0.467381 | \n",
448 | " 0.001702 | \n",
449 | " 0.391102 | \n",
450 | " 0.001146 | \n",
451 | "
\n",
452 | " \n",
453 | " 42 | \n",
454 | " 0.467253 | \n",
455 | " 0.001774 | \n",
456 | " 0.389948 | \n",
457 | " 0.000796 | \n",
458 | "
\n",
459 | " \n",
460 | " 43 | \n",
461 | " 0.467122 | \n",
462 | " 0.001733 | \n",
463 | " 0.389037 | \n",
464 | " 0.000645 | \n",
465 | "
\n",
466 | " \n",
467 | " 44 | \n",
468 | " 0.467043 | \n",
469 | " 0.001747 | \n",
470 | " 0.387892 | \n",
471 | " 0.000712 | \n",
472 | "
\n",
473 | " \n",
474 | " 45 | \n",
475 | " 0.466965 | \n",
476 | " 0.001757 | \n",
477 | " 0.386938 | \n",
478 | " 0.000627 | \n",
479 | "
\n",
480 | " \n",
481 | " 46 | \n",
482 | " 0.466865 | \n",
483 | " 0.001787 | \n",
484 | " 0.385881 | \n",
485 | " 0.000904 | \n",
486 | "
\n",
487 | " \n",
488 | " 47 | \n",
489 | " 0.466820 | \n",
490 | " 0.001841 | \n",
491 | " 0.384970 | \n",
492 | " 0.000891 | \n",
493 | "
\n",
494 | " \n",
495 | " 48 | \n",
496 | " 0.466724 | \n",
497 | " 0.001895 | \n",
498 | " 0.384210 | \n",
499 | " 0.000894 | \n",
500 | "
\n",
501 | " \n",
502 | " 49 | \n",
503 | " 0.466667 | \n",
504 | " 0.001911 | \n",
505 | " 0.383509 | \n",
506 | " 0.000787 | \n",
507 | "
\n",
508 | " \n",
509 | "
\n",
510 | "
"
511 | ],
512 | "text/plain": [
513 | " test-logloss-mean test-logloss-std train-logloss-mean train-logloss-std\n",
514 | "0 0.657933 0.000591 0.655429 0.000550\n",
515 | "1 0.628350 0.000739 0.623086 0.000943\n",
516 | "2 0.603884 0.001104 0.595802 0.001063\n",
517 | "3 0.583370 0.001012 0.572541 0.000836\n",
518 | "4 0.566755 0.000924 0.553194 0.000422\n",
519 | "5 0.552614 0.000855 0.536283 0.000151\n",
520 | "6 0.540211 0.000855 0.521392 0.000222\n",
521 | "7 0.529721 0.000880 0.508403 0.000270\n",
522 | "8 0.520790 0.000867 0.497102 0.000514\n",
523 | "9 0.513212 0.000905 0.487175 0.000640\n",
524 | "10 0.506994 0.000959 0.478506 0.000730\n",
525 | "11 0.501447 0.001094 0.470861 0.000729\n",
526 | "12 0.496789 0.001237 0.464108 0.000729\n",
527 | "13 0.492689 0.001184 0.457947 0.000677\n",
528 | "14 0.489190 0.001212 0.452237 0.000786\n",
529 | "15 0.486248 0.001257 0.447101 0.000871\n",
530 | "16 0.483681 0.001239 0.442427 0.000849\n",
531 | "17 0.481431 0.001276 0.438261 0.000909\n",
532 | "18 0.479627 0.001380 0.434358 0.000855\n",
533 | "19 0.477896 0.001453 0.430595 0.000764\n",
534 | "20 0.476538 0.001381 0.427351 0.000817\n",
535 | "21 0.475284 0.001424 0.424263 0.000826\n",
536 | "22 0.474239 0.001395 0.421472 0.000912\n",
537 | "23 0.473294 0.001422 0.418848 0.000894\n",
538 | "24 0.472517 0.001478 0.416437 0.000846\n",
539 | "25 0.471818 0.001503 0.414173 0.000894\n",
540 | "26 0.471244 0.001540 0.412018 0.000970\n",
541 | "27 0.470718 0.001583 0.410244 0.001135\n",
542 | "28 0.470290 0.001598 0.408401 0.001012\n",
543 | "29 0.469917 0.001631 0.406792 0.001054\n",
544 | "30 0.469591 0.001632 0.405117 0.001180\n",
545 | "31 0.469236 0.001656 0.403349 0.000978\n",
546 | "32 0.468996 0.001689 0.401859 0.000776\n",
547 | "33 0.468792 0.001664 0.400643 0.000672\n",
548 | "34 0.468562 0.001643 0.399290 0.000633\n",
549 | "35 0.468300 0.001700 0.397969 0.000679\n",
550 | "36 0.468079 0.001699 0.396654 0.000702\n",
551 | "37 0.467950 0.001706 0.395548 0.000823\n",
552 | "38 0.467791 0.001684 0.394453 0.000800\n",
553 | "39 0.467616 0.001657 0.393437 0.000796\n",
554 | "40 0.467466 0.001650 0.392171 0.000925\n",
555 | "41 0.467381 0.001702 0.391102 0.001146\n",
556 | "42 0.467253 0.001774 0.389948 0.000796\n",
557 | "43 0.467122 0.001733 0.389037 0.000645\n",
558 | "44 0.467043 0.001747 0.387892 0.000712\n",
559 | "45 0.466965 0.001757 0.386938 0.000627\n",
560 | "46 0.466865 0.001787 0.385881 0.000904\n",
561 | "47 0.466820 0.001841 0.384970 0.000891\n",
562 | "48 0.466724 0.001895 0.384210 0.000894\n",
563 | "49 0.466667 0.001911 0.383509 0.000787"
564 | ]
565 | },
566 | "execution_count": 49,
567 | "metadata": {},
568 | "output_type": "execute_result"
569 | }
570 | ],
571 | "source": [
572 | "clf"
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "# Some notes on xgb.train() "
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 89,
585 | "metadata": {
586 | "collapsed": false
587 | },
588 | "outputs": [],
589 | "source": [
590 | "from sklearn.cross_validation import KFold, train_test_split\n",
591 | "X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.05 ,random_state=0)"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 90,
597 | "metadata": {
598 | "collapsed": true
599 | },
600 | "outputs": [],
601 | "source": [
602 | "xgtrains = xgb.DMatrix(X_train.values, y_train.values)\n",
603 | "xgtest = xgb.DMatrix(X_test.values, y_test.values)\n",
604 | "# xgtest = xgb.DMatrix(test.values)\n",
605 | "params = {'objective': 'binary:logistic', \n",
606 | " 'subsample': 1, \n",
607 | " 'eta': 0.1, \n",
608 | " 'colsample_bytree': 0.9, \n",
609 | " 'max_depth': 10,\n",
610 | " 'min_child_weight' : 5,\n",
611 | " 'silent':1}"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 91,
617 | "metadata": {
618 | "collapsed": true
619 | },
620 | "outputs": [
621 | {
622 | "name": "stderr",
623 | "output_type": "stream",
624 | "text": [
625 | "Will train until logloss error hasn't decreased in 10 rounds.\n",
626 | "[0]\tlogloss-error:0.235963\n",
627 | "[1]\tlogloss-error:0.232639\n",
628 | "[2]\tlogloss-error:0.232290\n",
629 | "[3]\tlogloss-error:0.231065\n",
630 | "[4]\tlogloss-error:0.229491\n",
631 | "[5]\tlogloss-error:0.230191\n",
632 | "[6]\tlogloss-error:0.227567\n",
633 | "[7]\tlogloss-error:0.226517\n",
634 | "[8]\tlogloss-error:0.225993\n",
635 | "[9]\tlogloss-error:0.226867\n",
636 | "[10]\tlogloss-error:0.226517\n",
637 | "[11]\tlogloss-error:0.227917\n",
638 | "[12]\tlogloss-error:0.226168\n",
639 | "[13]\tlogloss-error:0.227567\n",
640 | "[14]\tlogloss-error:0.226692\n",
641 | "[15]\tlogloss-error:0.227042\n",
642 | "[16]\tlogloss-error:0.227742\n",
643 | "[17]\tlogloss-error:0.227567\n",
644 | "[18]\tlogloss-error:0.226692\n",
645 | "Stopping. Best iteration:\n",
646 | "[8]\tlogloss-error:0.225993\n",
647 | "\n"
648 | ]
649 | }
650 | ],
651 | "source": [
652 | "clft = xgb.train(params,xgtrains,num_boost_round=num_round,\n",
653 | " evals= [(xgtest,'logloss')] , early_stopping_rounds = 10,\n",
654 | " verbose_eval=True)"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 92,
660 | "metadata": {
661 | "collapsed": true
662 | },
663 | "outputs": [],
664 | "source": [
665 | "#see here:\n",
666 | "#https://www.kaggle.com/ashhafez/springleaf-marketing-response/xgb-learning-rate-eta-decay/run/78945/code\n",
667 | "#http://discuss.analyticsvidhya.com/t/how-to-predict-class-labels-using-xgboost-in-python-when-objective-function-is-binary-logistic/7809"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": null,
673 | "metadata": {
674 | "collapsed": true
675 | },
676 | "outputs": [],
677 | "source": []
678 | }
679 | ],
680 | "metadata": {
681 | "kernelspec": {
682 | "display_name": "Python 2",
683 | "language": "python",
684 | "name": "python2"
685 | },
686 | "language_info": {
687 | "codemirror_mode": {
688 | "name": "ipython",
689 | "version": 2
690 | },
691 | "file_extension": ".py",
692 | "mimetype": "text/x-python",
693 | "name": "python",
694 | "nbconvert_exporter": "python",
695 | "pygments_lexer": "ipython2",
696 | "version": "2.7.11"
697 | }
698 | },
699 | "nbformat": 4,
700 | "nbformat_minor": 0
701 | }
702 |
--------------------------------------------------------------------------------
/wine_quality/README.txt:
--------------------------------------------------------------------------------
1 | #dataset is from here: http://archive.ics.uci.edu/ml/datasets/Wine+Quality
2 | #there are many more great data sets there!
--------------------------------------------------------------------------------
/wine_quality/ipython_notebooks/box_cox.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sept 29 2015
4 |
5 | @author: hugobowne-anderson
6 | @email: hugobowne at gmail dot com
7 | """
8 |
9 | from scipy import stats
10 | import pandas as pd
11 |
12 | def box_cox(df, lmbda=None, alpha=None):
13 | """
14 | Performs a Box-Cox Transformation on all columns (features) of a pandas
15 | dataframe. Currently, there is some ambiguity as to how to deal with
16 | non-positive values & I need to check this out: at the moment, I just centre
17 | the data so that min(value) > 0, for all features, as necessitated by
18 | the very nature of the Box-Cox Transformation.
19 | """
20 | df_tr = pd.DataFrame(columns=df.columns) #initialize empty data frame with same features as df
21 | for val in list(df.columns):
22 | df_tr[val] = stats.boxcox(df[val] - min(df[val]) + 0.1,lmbda, alpha)[0] #populate dataframe with transformed data
23 | return df_tr
24 |
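26 | if __name__ == '__main__':
27 |     #Aside / usage sketch (assumption: the UCI red-wine csv sits one directory up, as in wine_main.py)
28 |     wine = pd.read_csv('../winequality-red.csv', sep=';')
29 |     print(box_cox(wine.drop('quality', axis=1)).head())
30 |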
--------------------------------------------------------------------------------
/wine_quality/ipython_notebooks/yeo_johnson.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sept 29 2015
4 |
5 | @author: hugobowne-anderson
6 | @email: hugobowne at gmail dot com
7 | """
8 |
9 | from scipy import stats
10 | import numpy as np
11 | import pandas as pd
12 | import math as math
13 |
14 | def yeo_johnson(x, lmbda=0):
15 |     """
16 |     Performs a Yeo-Johnson Transformation on a numpy array.
17 |     Arguments:
18 |     x : input array. Should be 1-dimensional.
19 |     lmbda : {scalar}, optional.
20 |     Note: a RuntimeWarning may be thrown during execution (both branches of the
21 |     piecewise definition are evaluated before the invalid one is masked out),
22 |     but the result is not affected. The transformation follows the original paper:
23 |     Yeo, In-Kwon and Johnson, Richard (2000). A new family of power transformations
24 |     to improve normality or symmetry. Biometrika, 87, 954-959.
25 |     """
26 | #The Yeo-Johnson Transform is defined differently for differing values of lambda
27 | if lmbda == 0:
28 | #as transform is defined piecewise, I compute it using the sum of relational
29 | #operators: for this reason, I 1st define the 2 functions
30 | A1 = np.log(abs(x+1))
31 | A1[A1 == -np.inf] = 0 #subtlety: if value = -inf , then term will not be used
32 | # BUT I do need to set it to 0 so that it IS unused below
33 | A2 = (np.power(1-x , 2) - 1)/2
34 | A2[np.isnan(A2)] = 0#subtlety: if value = NaN , then term will not be used
35 | # BUT I do need to set it to 0 so that it IS unused below
36 | x_yj = (x>=0)*A1 - (x<0)*A2
37 | elif lmbda == 2:
38 | #as transform is defined piecewise, I compute it using the sum of relational
39 | #operators: for this reason, I 1st define the 2 functions
40 | B1 = (np.power(x+1 , 2) - 1)/2
41 | B1[np.isnan(B1)] = 0#subtlety: if value = NaN , then term will not be used
42 | # BUT I do need to set it to 0 so that it IS unused below
43 | B2 = np.log(abs(1-x))
44 | B2[B2==-np.inf] = 0#subtlety: if value = -inf , then term will not be used
45 | # BUT I do need to set it to 0 so that it IS unused below
46 | x_yj = (x>=0)*B1 - (x<0)*B2
47 | else:
48 | #as transform is defined piecewise, I compute it using the sum of relational
49 | #operators: for this reason, I 1st define the 2 functions
50 | C1 = (np.power(x+1 , lmbda) - 1)/lmbda
51 | C1[np.isnan(C1)] = 0#subtlety: if value = NaN , then term will not be used
52 | # BUT I do need to set it to 0 so that it IS unused below
53 | C2 = (np.power(1-x , 2-lmbda) - 1)/(2 - lmbda)
54 | C2[np.isnan(C2)] = 0#subtlety: if value = NaN , then term will not be used
55 | # BUT I do need to set it to 0 so that it IS unused below
56 |         x_yj = (x>=0)*C1 - (x<0)*C2 #minus sign: the x < 0 branch of the transform is negated, as in the lmbda == 0 case above
57 |
58 | return x_yj
59 |
60 | def dfyeo_johnson(df, lmbda=0 ):
61 | """
62 | Performs a Yeo-Johnson Transformation on all columns (features)of a dataframe
63 | """
64 | df_yj = pd.DataFrame(columns=df.columns) #initialize empty data frame with same features as df
65 | for val in list(df.columns):
66 |         df_yj[val] = yeo_johnson(df[val], lmbda) #populate dataframe with transformed data (forwarding lmbda)
67 | return df_yj
68 |
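69 | if __name__ == '__main__':
70 |     #Aside / sanity-check sketch: lmbda = 1 is the identity transform
71 |     #(x >= 0: ((x+1)**1 - 1)/1 = x;  x < 0: -(((1-x)**1 - 1)/1) = x)
72 |     s = np.arange(10, dtype=float) - 5
73 |     print(np.allclose(yeo_johnson(s, lmbda=1), s)) #expect: True
74 |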
--------------------------------------------------------------------------------
/wine_quality/ipython_notebooks/yjscratch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sep 29 13:53:59 2015
4 |
5 | @author: hugobowne-anderson
6 | """
7 |
8 | import numpy as np
9 | import math as math
10 | import yeo_johnson as yj
11 |
12 | s = np.arange(10, dtype=float) - 5 #float dtype avoids silent integer division in the transform (Python 2)
13 |
14 | math.log(math.exp(1)) #sanity check: 1.0
15 |
16 | print(yj.yeo_johnson(s, lmbda=2)) #call via the module alias; a bare yeo_johnson was a NameError
17 |
--------------------------------------------------------------------------------
/wine_quality/python/wine_classifier.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | from sklearn.grid_search import GridSearchCV
5 | from sklearn.cross_validation import cross_val_score
6 | from sklearn.metrics import classification_report
7 |
8 | class WineClassifier(object):
9 | """
10 | Use classification (not regression) for wine quality.
11 | """
12 | def __init__(self, X_train, y_train, X_valid, y_valid, pipeline, param_grid):
13 | """
14 | Set the data sets.
15 | """
16 | self.X_train = X_train
17 | self.y_train = y_train
18 | self.X_valid = X_valid
19 | self.y_valid = y_valid
20 | self.pipeline = pipeline
21 | self.param_grid = param_grid
22 |
23 | def train(self, verbose=1, n_jobs=-1, scoring='accuracy', cv=10):
24 | """
25 | Train the classifier by grid search
26 | """
27 | if len(self.param_grid) != 0:
28 | grid_search = GridSearchCV(self.pipeline, param_grid=self.param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs, scoring=scoring)
29 | grid_search.fit(self.X_train, self.y_train)
30 | if verbose > 1:
31 | print( ('Best score %s with parameters %s') % (grid_search.best_score_, grid_search.best_params_))
32 | self.pipeline = grid_search.best_estimator_
33 | else:
34 | if verbose > 1:
35 | scores = cross_val_score(self.pipeline, self.X_train, self.y_train, cv=cv)
36 | print(('Best score %s') % (scores.mean()))
37 | self.pipeline.fit(self.X_train, self.y_train)
38 |
39 | def classification_report(self, print_stdout=True):
40 | """
41 |         Evaluate the classifier on the validation set.
42 | """
43 | report = classification_report(self.y_valid, self.pipeline.predict(self.X_valid))
44 | if print_stdout: print(report)
45 | return(report)
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/wine_quality/python/wine_data.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | import pandas as pd
5 | import numpy as np
6 |
7 | class WineData(object):
8 | """docstring for WineData"""
9 | def __init__(self, path_to_red, path_to_white):
10 | self.path_to_red = path_to_red
11 | self.path_to_white = path_to_white
12 |
13 | def _load(self, path_to_data):
14 | """
15 | Loads the data from data
16 | """
17 | data = np.array(pd.read_csv(path_to_data, header=0, sep=';'))
18 | X = data[:,:-1]
19 | y = data[:,-1]
20 | return X, y
21 |
22 | def load_red(self):
23 | """
24 | Loads the red wine data
25 | """
26 | return self._load(self.path_to_red)
27 |
28 | def load_white(self):
29 | """
30 | Loads the white wine data
31 | """
32 | return self._load(self.path_to_white)
--------------------------------------------------------------------------------
/wine_quality/python/wine_explore.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | from matplotlib import pyplot as plt
5 | from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
6 | from sklearn.decomposition import PCA
7 | from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding, SpectralEmbedding, MDS
8 |
9 | import pandas as pd
10 |
11 | def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
12 |     """
13 |     Plot data transformed into two dimensions by the chosen embedding
14 |     (pca, isomap, lle, tsne, spectral, mds, or 'gallery' for all of them).
15 |     PCA, for instance, picks a new basis whose first dimension carries the
16 |     maximal variance and each following dimension the maximal remaining
17 |     variance, which should spread the observed n-dimensional data maximally.
18 |     These embeddings are unsupervised and do not consider the target values.
19 |     """
20 | if (scale):
21 | scaler = StandardScaler()
22 | X = scaler.fit_transform(X)
23 |
24 | if (normalize):
25 | normalizer = Normalizer(norm='l2')
26 | X = normalizer.fit_transform(X)
27 |
28 |     if (embedding == 'pca'): #use == for string comparison ('is' tests identity)
29 | pca = PCA(n_components=2)
30 | X_transformed = pca.fit_transform(X)
31 |     elif (embedding == 'isomap'):
32 | isomap = Isomap(n_components=2, n_neighbors=20)
33 | X_transformed = isomap.fit_transform(X)
34 |     elif (embedding == 'lle'):
35 | lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
36 | X_transformed = lle.fit_transform(X)
37 |     elif (embedding == 'tsne'):
38 | t_sne = TSNE(n_components=2)
39 | X_transformed = t_sne.fit_transform(X)
40 |     elif (embedding == 'spectral'):
41 | se = SpectralEmbedding(n_components=2)
42 | X_transformed = se.fit_transform(X)
43 |     elif (embedding == 'mds'):
44 | mds = MDS(n_components=2)
45 | X_transformed = mds.fit_transform(X)
46 |     elif (embedding == 'gallery'):
47 | plt.figure(1)
48 |
49 | plt.subplot(231)
50 | plt.title('pca')
51 | X_t = PCA(n_components=2).fit_transform(X)
52 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
53 |
54 | plt.subplot(232)
55 | plt.title('isomap')
56 | X_t = Isomap(n_neighbors=20).fit_transform(X)
57 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
58 |
59 | plt.subplot(233)
60 | plt.title('lle')
61 | X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
62 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
63 |
64 | plt.subplot(234)
65 | plt.title('tsne')
66 | X_t = TSNE().fit_transform(X)
67 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
68 |
69 | plt.subplot(235)
70 | plt.title('spectral')
71 | X_t = SpectralEmbedding().fit_transform(X)
72 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
73 |
74 | plt.subplot(236)
75 | plt.title('mds')
76 | X_t = MDS().fit_transform(X)
77 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
78 |
79 | plt.suptitle('Gallery transforms ' + title)
80 |
81 | return plt
82 | else:
83 |         raise ValueError("Choose between pca, isomap, lle, tsne, spectral, mds and gallery")
84 |
85 | plt.title(title + ' ' + embedding + ' plot')
86 | sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
87 | plt.colorbar(sc)
88 | return plt
89 |
90 | def pairs(X, y, title):
91 | """
92 | Quick and dirty version of pairs.
93 | """
94 | df = pd.DataFrame(X)
95 | df[df.shape[1]] = y
96 | plt.title(title + ' Pairwise plot')
97 | axes = pd.tools.plotting.scatter_matrix(df, alpha=0.2)
98 | return plt
99 |
--------------------------------------------------------------------------------
/wine_quality/python/wine_main.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | from wine_data import WineData
5 | from wine_preprocesser import WinePreprocesser
6 | from wine_explore import plot2d, pairs
7 | from wine_classifier import WineClassifier
8 |
9 | from time import time
10 | import numpy as np
11 |
12 | from sklearn.pipeline import Pipeline
13 | from sklearn.cross_validation import train_test_split
14 | from sklearn.preprocessing import StandardScaler
15 | from sklearn.naive_bayes import GaussianNB
16 | from sklearn.neighbors import KNeighborsClassifier
17 | from sklearn.svm import LinearSVC
18 | from sklearn.linear_model import LogisticRegression
19 | from sklearn.decomposition import PCA
20 |
21 | # Load data and preprocess (everything you don't put in the pipeline)
22 | data = WineData('../winequality-red.csv', '../winequality-white.csv')
23 |
24 | print('Preprocessing.')
25 | t0 = time()
26 | wp = WinePreprocesser(data)
27 | #wp.add_divided_features(replace_inf_with_absmax=True)
28 | wp.polynomial_expansion(rank=2)
29 | wp.remove_low_variance_features(variance_threshold=0)
30 | X_red, y_red = wp.get_red()
31 | X_white, y_white = wp.get_white()
32 | print('Preprocessing. Done in %fs' % (time()-t0))
33 | ###############################
34 | # Explore data
35 | # 1. Plot in 2d, color code classes:
36 | # -> no simple low dimension linear separation
37 | # 2. Plot pairs
38 | # -> correlation: transform data or use regularized methods
39 | # -> non-normally distributed features: Box-Cox transform
40 | ###############################
41 | do_plot = False
42 | if (do_plot):
43 | plot2d(X_red, y_red, embedding='gallery', title='Red wine').show()#.savefig('../data/red_whine_2d_gallery.png')
44 | plot2d(X_white, y_white, embedding='gallery', title='White wine').show()#.savefig('../data/white_whine_2d_gallery.png')
45 | pairs(X_red, y_red, 'Red wine')
46 | pairs(X_white, y_white, 'White wine')
47 |
48 | ###############################
49 | # Classification
50 | # Prepare data
51 | ###############################
52 | #X = X_white
53 | #y = y_white
54 | X = X_red
55 | y = y_red
56 | X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state=23, test_size=0.2)
57 |
58 | ###############################
59 | # Classify on transformed dataset.
60 | ###############################
61 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('cls', GaussianNB())])
62 | cls_nb = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10)})
63 | cls_nb.train(verbose=1, n_jobs=-1, scoring='f1_micro')
64 | cls_nb.classification_report()
65 |
66 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('nn', KNeighborsClassifier())])
67 | cls_nn = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10), 'nn__n_neighbors': [1, 2, 4, 8, 32, 64]})
68 | cls_nn.train(verbose=1, n_jobs=1, scoring='f1_micro') # crashes with n_jobs > 1
69 | cls_nn.classification_report()
70 |
71 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('svc', LinearSVC())])
72 | cls_svc = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10), 'svc__C': 10. ** np.arange(-3, 4)})
73 | cls_svc.train(verbose=1, n_jobs=1, scoring='f1_micro')
74 | cls_svc.classification_report()
75 |
76 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('logistic', LogisticRegression(multi_class='multinomial', solver='lbfgs'))])
77 | cls_log = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10), 'logistic__C': 10. ** np.arange(-3, 4)})
78 | cls_log.train(verbose=1, n_jobs=1, scoring='f1_micro') # Not sure why, but multi_class logistic regression crashes with multithreading.
79 | cls_log.classification_report()
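80 |
81 | #Aside (added sketch, hypothetical, not part of the original runs): the same
82 | #pattern extends to other estimators, e.g. a random forest baseline.
83 | from sklearn.ensemble import RandomForestClassifier
84 | pipeline = Pipeline([('scale', StandardScaler()), ('cls', RandomForestClassifier(random_state=23))])
85 | cls_rf = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'cls__n_estimators': [100, 300]})
86 | cls_rf.train(verbose=1, n_jobs=1, scoring='f1_micro')
87 | cls_rf.classification_report()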
--------------------------------------------------------------------------------
/wine_quality/python/wine_preprocesser.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | import numpy as np
5 | from sklearn.preprocessing import PolynomialFeatures
6 |
7 | class WinePreprocesser(object):
8 | """docstring for WinePreprocesser"""
9 | def __init__(self, wine_data):
10 | self.X_red, self.y_red = wine_data.load_red()
11 | self.X_white, self.y_white = wine_data.load_white()
12 |
13 | def _divide_features(self, X, replace_inf_with_absmax):
14 | """
15 | Divide 1 by feature value.
16 | """
17 | # Do the division
18 | nf = np.divide(1, X)
19 | # Replace inf by nan or by the maximal absolute value
20 |         for i in np.arange(nf.shape[1]):
21 |             if np.isinf(nf[:,i]).any(): #also catches -inf, unlike `np.inf in ...`
22 |                 a = nf[:,i]
23 |                 if replace_inf_with_absmax:
24 |                     a[np.isinf(a)] = a[np.isfinite(a)][np.argmax(abs(a[np.isfinite(a)]))] #largest-magnitude finite value
25 |                 else:
26 |                     a[np.isinf(a)] = np.nan
27 |                 nf[:,i] = a
28 |         return(nf)
29 |
30 |
31 | def add_divided_features(self, replace_inf_with_absmax=True):
32 | """
33 | For each feature y_i add 1/y_i
34 | """
35 | X_red_divided = self._divide_features(X=self.X_red, replace_inf_with_absmax=replace_inf_with_absmax)
36 | self.X_red = np.concatenate((self.X_red, X_red_divided), axis=1)
37 | X_white_divided = self._divide_features(X=self.X_white, replace_inf_with_absmax=replace_inf_with_absmax)
38 | self.X_white = np.concatenate((self.X_white, X_white_divided), axis=1)
39 |
40 |     def polynomial_expansion(self, rank=2):
41 |         """
42 |         Expand the features with polynomials of degree `rank`.
43 |         """
44 |         pf = PolynomialFeatures(degree=rank)
45 |         self.X_red = pf.fit_transform(self.X_red)
46 |         self.X_white = pf.fit_transform(self.X_white)
47 |
48 | def _remove_low_var(self, X, variance_threshold):
49 | """
50 | Remove features with variance below threshold.
51 | """
52 | remove_index = []
53 | for col in range(X.shape[1]):
54 | if np.var(X[:,col]) < variance_threshold:
55 | remove_index.append(col)
56 | return(np.delete(X, remove_index, 1))
57 |
58 | def remove_low_variance_features(self, variance_threshold=0):
59 | """
60 | Remove features with variance below threshold.
61 | """
62 | self.X_red = self._remove_low_var(self.X_red, variance_threshold)
63 | self.X_white = self._remove_low_var(self.X_white, variance_threshold)
64 |
65 | def yeo_johnson_transform(self):
66 | """
67 | Implement yeo johnson transform
68 | """
69 | raise NotImplementedError
70 |
71 | def get_red(self):
72 | """
73 | Returns X, y of red wine data
74 | """
75 | return self.X_red, self.y_red
76 |
77 | def get_white(self):
78 | """
79 | Returns X, y of white wine data
80 | """
81 | return self.X_white, self.y_white
82 |
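83 | #Aside (added sketch, not part of the original design): the unimplemented
84 | #yeo_johnson_transform above could delegate to ipython_notebooks/yeo_johnson.py,
85 | #assuming that module is on the import path:
86 | def yeo_johnson_transform_via_module(preprocesser, lmbda=0):
87 |     import pandas as pd
88 |     from yeo_johnson import dfyeo_johnson
89 |     preprocesser.X_red = dfyeo_johnson(pd.DataFrame(preprocesser.X_red), lmbda).values
90 |     preprocesser.X_white = dfyeo_johnson(pd.DataFrame(preprocesser.X_white), lmbda).values
91 |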
--------------------------------------------------------------------------------