├── .gitignore
├── POTUS_aka_El_Presidente
│   ├── README.txt
│   ├── best_model.txt
│   ├── model_selection.R
│   ├── notes.txt
│   ├── predict.R
│   ├── python_code
│   │   ├── python_POTUS.ipynb
│   │   ├── python_POTUS_notes.ipynb
│   │   └── python_POTUS_pipeline.ipynb
│   ├── test.csv
│   └── train.csv
├── Preprocessing_note.html
├── Preprocessing_note.ipynb
├── digit_recoginition
│   ├── digit_recog_classifier_test_data.py
│   └── digit_recog_grid_search.py
├── expedia
│   └── EDA_1st_model.ipynb
├── homesite
│   ├── Boris_gradient_boost.ipynb
│   ├── initial_foray_insurance.ipynb
│   └── initial_foray_insurance_grad_boosting.ipynb
├── notes_on_ML
│   ├── K-NN_and_preprocessing.html
│   ├── K-NN_and_preprocessing.ipynb
│   ├── Logistic_regression_and_preprocessing.ipynb
│   └── Scaling_synthesized_data.ipynb
├── paribas
│   ├── README.md
│   ├── boosting_in_barbados.ipynb
│   ├── exploratory_analysis.ipynb
│   ├── extra_trees_classifier.ipynb
│   ├── paribas_I.ipynb
│   └── stratified_CV_with_xgboost.ipynb
└── wine_quality
    ├── README.txt
    ├── ipython_notebooks
    │   ├── Predicting_Wine_Quality.ipynb
    │   ├── Testing Box_Cox.ipynb
    │   ├── box_cox.py
    │   ├── explore_wine_data.ipynb
    │   ├── yeo_johnson.py
    │   └── yjscratch.py
    ├── python
    │   ├── .ipynb_checkpoints
    │   │   └── wine_notebook-checkpoint.ipynb
    │   ├── wine_classifier.py
    │   ├── wine_data.py
    │   ├── wine_explore.py
    │   ├── wine_main.py
    │   ├── wine_notebook.ipynb
    │   └── wine_preprocesser.py
    ├── winequality-red.csv
    └── winequality-white.csv

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .RData
3 | .Rhistory
4 | .project
5 | .pydevproject
6 | *.pickle
7 | *.pyc
8 | data/
9 | __init__.py
10 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/README.txt:
--------------------------------------------------------------------------------
1 | Predicting Presidential Votes Across Counties
2 |
3 | This is a typical example of a job interview challenge (usually given as a homework assignment to complete in a couple of hours): they will provide you with a training .csv, ask you to build some predictive models and choose the best one; then to apply it to their test data .csv and send them the results, along with your code.
4 | Herein find:
5 | -an example training set train.csv, which contains metadata about counties and who won in that county (Obama/Romney), and a test data set test.csv;
6 | -an R script model_selection.R that builds a variety of predictive models for the problem and chooses the one that performs best on train.csv (using repeated 10-fold cross validation);
7 | -an R script predict.R that predicts the election outcome for the counties (rows) in test.csv;
8 | -notes.txt, that explains the model building process;
9 | -best_model.txt, that describes the performance of the best model.
10 |
11 | Enjoy!
12 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/best_model.txt:
--------------------------------------------------------------------------------
1 | Stochastic Gradient Boosting
2 |
3 | 1213 samples
4 |    9 predictor
5 |    2 classes: 'Barack Obama', 'Mitt Romney'
6 |
7 | Pre-processing: centered, scaled, principal component signal extraction
8 | Resampling: Cross-Validated (10 fold, repeated 10 times)
9 |
10 | Summary of sample sizes: 1092, 1092, 1091, 1092, 1091, 1092, ...
11 |
12 | Resampling results across tuning parameters:
13 |
14 |   interaction.depth  n.trees  Accuracy  Kappa  Accuracy SD  Kappa SD
15 |   1                   50      0.843     0.426  0.0243       0.1026
16 |   1                  100      0.847     0.467  0.0258       0.0988
17 |   1                  150      0.847     0.471  0.0274       0.1019
18 |   2                   50      0.848     0.469  0.0248       0.0976
19 |   2                  100      0.844     0.469  0.0263       0.0966
20 |   2                  150      0.843     0.468  0.0262       0.0960
21 |   3                   50      0.847     0.474  0.0256       0.0979
22 |   3                  100      0.844     0.473  0.0280       0.0997
23 |   3                  150      0.841     0.466  0.0280       0.0966
24 |
25 | Tuning parameter 'shrinkage' was held constant at a value of 0.1
26 | Accuracy was used to select the optimal model using the largest value.
27 | The final values used for the model were n.trees = 50, interaction.depth = 2 and shrinkage = 0.1.
28 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/model_selection.R:
--------------------------------------------------------------------------------
1 | #################################################################################
2 | ###THE SET UP
3 | #################################################################################
4 | library( glmnet )
5 | library( ggplot2 )
6 | library( caret )
7 | library( kernlab )
8 | library( klaR )
9 | library(doMC)
10 |
11 |
12 | ###HERE I TAKE ADVANTAGE OF MULTITHREADING
13 | ###Using multithreading with my Dual Core 2.8 GHz Intel Core i7 processor,
14 | ###the code below takes ~6 minutes to run
15 | nc <- detectCores()
16 | registerDoMC(cores = nc)
17 |
18 |
19 | rm(list=ls(all=TRUE))
20 | setwd("~/Documents/ML/")#SET YOUR WORKING DIRECTORY HERE
21 | data <- read.csv("train_potus_by_county.csv", header = TRUE )
22 |
23 |
24 | #################################################################################
25 | ###INITIAL DATA DIVE
26 | #################################################################################
27 | #HISTOGRAM OF RESPONSE VARIABLE TO CHECK FOR CLASS IMBALANCE
28 | q <- ggplot( data , aes(x=Winner))
29 | q + geom_histogram() ##note a class imbalance!
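##AN ILLUSTRATIVE SKETCH (BASE R): QUANTIFY THE IMBALANCE NUMERICALLY AS WELL.
##notes.txt REPORTS ROUGHLY 1/4 OF COUNTIES GOING TO "Barack Obama".
table(data$Winner)                ##raw counts per class
prop.table(table(data$Winner))    ##class proportions: expect approx. 0.25/0.75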
30 | ##CHECK FOR FEATURES WITH NEAR-ZERO VARIANCE (MAY THROW OFF MODELS)
31 | nzv <- nearZeroVar( data , saveMetrics=TRUE )
32 | #View(nzv) ##no variables with near-zero variance
33 | #VISUALIZE ALL VARIABLES AND THEIR RELATIONSHIPS
34 | #ggpairs( data ) #THIS FUNCTION IS COMPUTATIONALLY INTENSIVE AND NOT ESSENTIAL FOR WHAT FOLLOWS
35 |
36 | #################################################################################
37 | ###FEATURE SELECTION: I USE LASSO REGRESSION TO SELECT THE MOST IMPORTANT
38 | ###FEATURES IN DETERMINING THE WINNER
39 | ###(YOU COULD ALSO USE A NONLINEAR ALGORITHM, SUCH AS A RANDOM FOREST,
40 | ###TO SELECT FEATURES: AN ADVANTAGE OF LASSO REGRESSION IS THAT IT
41 | ###SELECTS FEATURES AND TELLS YOU WHETHER THEY ARE +VELY OR -VELY
42 | ###CORRELATED WITH THE TARGET VARIABLE)
43 | #################################################################################
44 | ###SETUP INPUTS TO MODEL
45 | n <- length( data )
46 | x <- as.matrix(data[,-n])
47 | y <- as.matrix(data$Winner)
48 | ###RUN THE MODEL
49 | cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class",
50 |                   nfolds = 20 , nlambda = 1000 , alpha = 1)
51 | ##VARIABLES WITH NONZERO COEFFICIENTS ARE THE IMPORTANT VARIABLES
52 | coef(cvfit$glmnet.fit,s=cvfit$lambda.1se)
53 |
54 | ###KEEP IMPORTANT FEATURES AND RESPONSE VARIABLE
55 | keep <- c("Median.age","X..BachelorsDeg.or.higher","Unemployment.rate",
56 |           "Total.households","X..Owner.occupied.housing","X..Renter.occupied.housing",
57 |           "Median.home.value","Population.growth", "Per.capita.income.growth",
58 |           "Winner")
59 | data <- data[,keep] ##KEEP ONLY THE MOST IMPORTANT FEATURES & RESPONSE VARIABLES
60 |
61 | #################################################################################
62 | ###IN WHICH I BUILD A NUMBER OF MODELS TO PREDICT THE RESPONSE VARIABLE
63 | ###I TRY LOGISTIC REGRESSION, SVMs, NEURAL NETWORKS, RANDOM FORESTS,
64 | ###GENERALIZED BOOSTED MODELS AND NAIVE BAYES.
65 | ###NOTE: PREPROCESSING OCCURS WITHIN EACH TRAINING METHOD.
66 | #################################################################################
67 |
68 |
69 | ###DETAILS OF MODEL TRAINING (REPEATED 10-FOLD CROSS VALIDATION)
70 | fitControl <- trainControl(## 10-fold CV
71 |                            method = "repeatedcv",
72 |                            number = 10,
73 |                            #classProbs = TRUE,
74 |                            ## repeated ten times
75 |                            repeats = 10)
76 |
77 | ###I DEFINE THE PREPROCESSING THAT I'LL PERFORM IN EACH MODEL FITTING
78 | preProc = c("center", "scale","pca") ##centre & scale data, pca on predictor variables
79 | tL = 5 #number of levels for each tuning parameter in training: you could do much wider and
80 |        #more rigorous tuning by choosing the model-dependent parameter values. Do this and
81 |        #your models will perform better!!
82 |
83 |
84 | # Start the clock!
85 | ptm <- proc.time()
86 | ###LOGISTIC REGRESSION (AS A PARTICULAR "GENERALIZED LINEAR MODEL")
87 | lrfit <- train( Winner ~. , data = data , method = "glm", family = binomial,
88 |                 trControl = fitControl, preProcess = preProc,
89 |                 tuneLength = tL)
90 |
91 | ###SUPPORT VECTOR MACHINE (RADIAL BASIS KERNEL)
92 | svmfit <- train( Winner ~. , data = data , method = 'svmRadial',
93 |                  trControl = fitControl, preProcess = preProc,
94 |                  tuneLength = tL)
95 |
96 | ###NEURAL NETWORK
97 |
98 | nnetfit <- train( Winner ~. , data = data , method = "nnet",
99 |                   trControl = fitControl, preProcess = preProc)
100 |
101 |
102 | ###RANDOM FOREST
103 |
104 | rffit <- train( Winner ~. , data = data , method = "rf",
105 |                 trControl = fitControl, preProcess = preProc)
106 |
107 |
108 | ###GENERALIZED BOOSTED MODEL
109 |
110 | gbmfit <- train( Winner ~. , data = data , method = "gbm",
111 |                  trControl = fitControl, preProcess = preProc)
112 |
113 | ###NAIVE BAYES
114 |
115 | nbfit <- train( Winner ~. , data = data , method = "nb",
116 |                 trControl = fitControl, preProcess = preProc)
117 | # Stop the clock
118 | proc.time() - ptm
119 | #################################################################################
120 | ###COMPARE ALL MODELS
121 | #################################################################################
122 | ####
123 |
124 | resamps <- resamples(list(nnet = nnetfit , gbm = gbmfit , lr = lrfit,
125 |                           svm = svmfit , rf = rffit , nb = nbfit))
126 | summary( resamps )
127 | ###GBM HAS THE HIGHEST MEAN ACCURACY
128 |
129 | #################################################################################
130 | ###PRODUCE OUTPUTS
131 | #################################################################################
132 |
133 |
134 | ###SAVE BEST MODEL TO THE FILESYSTEM
135 | save(gbmfit , file = "mymodelgbm.rda")
136 |
137 | ###LOG DATA ABOUT EXPECTED PERFORMANCE OF MODEL
138 | sink(file="performance.txt")
139 | gbmfit
140 | sink(NULL)
141 |
142 |
143 |
144 |
145 | #################################################################################
146 | ###HERE BELOW I INCLUDE SOME PREPROCESSING CODE THAT REMOVES HIGHLY
147 | ###CORRELATED FEATURES AND LOOKS FOR COLLINEARITY.
148 | ###THIS PREPROCESSING DID NOT IMPROVE MODEL PERFORMANCE
149 | ##SO I DID NOT INCLUDE IT IN THE ABOVE CODE.
150 | #################################################################################
151 |
152 | # ###remove correlated variables
153 | # dummies <- dummyVars( ~ ., data )
154 | # df <- predict(dummies, newdata = data)
155 | # da <- data.frame(df)
156 | # descrCor <- cor( da )
157 | # #summary(descrCor[upper.tri(descrCor)])
158 | # highlyCorDescr <- findCorrelation(descrCor, cutoff = .75)
159 | # filteredDescr <- da[,-highlyCorDescr]
160 | # #descrCor2 <- cor(filteredDescr)
161 | # #summary(descrCor2[upper.tri(descrCor2)])
162 | # filteredDescr$Winner.Barack.Obama <- NULL
163 | # filteredDescr$Winner <- data$Winner
164 | # data <- filteredDescr
165 | # ##find linear combos
166 | # comboInfo <- findLinearCombos(data) #none
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/notes.txt:
--------------------------------------------------------------------------------
1 | Notes
2 |
3 | I chose to use R to tackle this assignment. In particular, I made use of the package ‘caret’. See all dependencies at the bottom of these notes.
4 |
5 |
6 | Approach
7 |
8 | 1. Before attempting to build any models, I dove into the data: the most important aspects that I noticed immediately were (i) that all predictor variables were numerical AND (ii) that there was a class imbalance in the response variable (approx. 1/4 for “Barack Obama”). I also noticed that the response variable was binary.
9 |
10 | (i) above indicated that preprocessing all features via scaling and centering would be appropriate. (ii) above made me aware that I should use modelling techniques that are good at dealing with class imbalances, for example, ensemble methods such as random forests and boosting (one could also implement up-/down-sampling, as sketched below).
11 |
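For instance, a minimal sketch of fold-wise up-sampling with caret (illustrative only: it assumes a caret version that supports the `sampling` argument to trainControl):

    ## resample within each CV fold: "up", "down", or "smote" (the latter needs the DMwR package)
    ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
                         sampling = "up")
    fit <- train(Winner ~ ., data = data, method = "gbm", trControl = ctrl)
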
12 | 2. In order to compare models, I needed to first decide on a metric of comparison: I chose ‘Accuracy’, because in predicting voting behaviour, we want to be as accurate as possible. (I am aware that ‘Accuracy’ may be problematic due to the class imbalance problem, but a 25%/75% split isn’t too bad.) I could also have used other metrics, such as the ROC curve or specificity.
13 |
14 | 3. I wanted to select the most important features so that the others did not introduce unwanted noise into the modelling process. I did so using Lasso regression. Note: I also attempted to engineer new features (linear combinations, products, ratios of existing features) but this did not contribute to overall performance; I also attempted to remove correlated features but this did not contribute to overall performance.
15 |
16 | 4. I wanted to implement as many models as possible and use the best one on the test data (with ‘Accuracy’ as metric, as discussed above): for each model, I preprocessed the data (scaling, centering and principal component signal extraction) and used repeated 10-fold cross validation to retrieve the ‘Accuracy’ of each model (with error bars) for different hyperparameters.
17 |
18 | 5. The models I chose were logistic regression, support vector machines, neural networks, random forests, stochastic gradient boosting and naive Bayes.
19 |
20 | 6. Stochastic gradient boosting won with an accuracy of 84.8% (pretty good for out of the box!): as reported in performance.txt, “The final values used for the model were n.trees = 50, interaction.depth = 2 and shrinkage = 0.1.” Note that many of the models I tried had pretty similar accuracies.
21 |
22 | 7. If I had more time, I would have definitely played around with (i) up-, down- and mixed sampling of the training data, (ii) using bagging techniques other than random forests, (iii) SMOTE to address the class imbalance problem in another manner and (iv) really working on feature engineering, as this is the real key!
23 |
24 |
25 |
26 | Dependencies
27 |
28 | R libraries:
29 | library( glmnet )
30 | library( ggplot2 )
31 | library( caret )
32 | library( kernlab )
33 | library( klaR )
34 | library(doMC)
35 |
36 |
37 |
38 | HUGO BOWNE-ANDERSON
39 | 07-14-2015
40 |
41 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/predict.R:
--------------------------------------------------------------------------------
1 | #################################################################################
2 | ###THE SET UP
3 | #################################################################################
4 |
5 | library( caret )
6 | rm(list=ls(all=TRUE))
7 | setwd("~/Documents/ML/")#SET YOUR WORKING DIRECTORY HERE
8 | data <- read.csv("test_potus_by_county.csv", header = TRUE )
9 | load( "mymodelgbm.rda")
10 | #################################################################################
11 | ###RUN MODEL AND WRITE PREDICTIONS TO .CSV
12 | #################################################################################
13 | predictions <- predict(gbmfit , data )
14 |
15 | write.table(predictions , "predictions.csv" , row.names = FALSE , col.names = FALSE)
16 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/python_code/python_POTUS_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
PIPELINING WITH POTUS DATA AND MACHINE LEARNING
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "
IMPORT SOME LIBRARIES AND READ IN DATA
" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
Total populationMedian age% BachelorsDeg or higherUnemployment ratePer capita incomeTotal householdsAverage household size% Owner occupied housing% Renter occupied housing% Vacant housingMedian home valuePopulation growthHouse hold growthPer capita income growthWinnerWin_bin
0 9278 37.9 12.6 21.3 13992 3802 2.42 51.9 16.6 31.6 63959-0.69-0.49 0.71 Barack Obama True
1 18594 36.3 9.7 14.3 14622 6764 2.55 63.7 16.2 20.1 74330-0.13 0.03 0.85 Barack Obama True
2 662628 37.9 27.9 12.1 23909 267862 2.41 57.0 28.8 14.2 112687-0.09 0.00 0.55 Barack Obama True
3 21292 38.9 14.1 15.7 16829 8547 2.47 63.5 17.1 19.4 73643-0.59-0.43 0.57 Barack Obama True
4 13252 34.5 15.0 15.8 13012 5222 2.47 53.7 20.7 25.6 56642-1.16-1.03 0.69 Barack Obama True
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " Total population Median age % BachelorsDeg or higher Unemployment rate \\\n", 152 | "0 9278 37.9 12.6 21.3 \n", 153 | "1 18594 36.3 9.7 14.3 \n", 154 | "2 662628 37.9 27.9 12.1 \n", 155 | "3 21292 38.9 14.1 15.7 \n", 156 | "4 13252 34.5 15.0 15.8 \n", 157 | "\n", 158 | " Per capita income Total households Average household size \\\n", 159 | "0 13992 3802 2.42 \n", 160 | "1 14622 6764 2.55 \n", 161 | "2 23909 267862 2.41 \n", 162 | "3 16829 8547 2.47 \n", 163 | "4 13012 5222 2.47 \n", 164 | "\n", 165 | " % Owner occupied housing % Renter occupied housing % Vacant housing \\\n", 166 | "0 51.9 16.6 31.6 \n", 167 | "1 63.7 16.2 20.1 \n", 168 | "2 57.0 28.8 14.2 \n", 169 | "3 63.5 17.1 19.4 \n", 170 | "4 53.7 20.7 25.6 \n", 171 | "\n", 172 | " Median home value Population growth House hold growth \\\n", 173 | "0 63959 -0.69 -0.49 \n", 174 | "1 74330 -0.13 0.03 \n", 175 | "2 112687 -0.09 0.00 \n", 176 | "3 73643 -0.59 -0.43 \n", 177 | "4 56642 -1.16 -1.03 \n", 178 | "\n", 179 | " Per capita income growth Winner Win_bin \n", 180 | "0 0.71 Barack Obama True \n", 181 | "1 0.85 Barack Obama True \n", 182 | "2 0.55 Barack Obama True \n", 183 | "3 0.57 Barack Obama True \n", 184 | "4 0.69 Barack Obama True " 185 | ] 186 | }, 187 | "execution_count": 1, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "import numpy as np\n", 194 | "import pandas as pd\n", 195 | "import matplotlib.pyplot as plt\n", 196 | "%matplotlib inline\n", 197 | "pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier\n", 198 | "##check out tutorial here:\n", 199 | "##http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%201%20-%20Reading%20from%20a%20CSV.ipynb\n", 200 | "df = pd.read_csv('../train.csv')\n", 201 | "df1 = df.drop('Winner', 1)\n", 202 | "df['Win_bin'] = (df['Winner'] == 'Barack Obama') ##new column: logical wrt winner\n", 203 | "df.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "
SPLIT DATA INTO TRAINING AND TEST SETS
" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 2, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "X_train shape: (1091, 14)\n", 225 | "y_train shape: (1091,)\n", 226 | "X_test shape: (122, 14)\n", 227 | "y_test shape: (122,)\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "from sklearn.cross_validation import train_test_split\n", 233 | "X_train, X_test, y_train, y_test = train_test_split(df1, df['Winner'], test_size = 0.1, \n", 234 | " random_state=0)\n", 235 | "print(\"X_train shape: %s\" % repr(X_train.shape))\n", 236 | "print(\"y_train shape: %s\" % repr(y_train.shape))\n", 237 | "print(\"X_test shape: %s\" % repr(X_test.shape))\n", 238 | "print(\"y_test shape: %s\" % repr(y_test.shape))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "
OPENING THE PIPELINE
" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 3, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "#see here for intuition:\n", 257 | "#http://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html\n", 258 | "from sklearn import linear_model, decomposition, datasets, preprocessing\n", 259 | "from sklearn.pipeline import Pipeline\n", 260 | "from sklearn.grid_search import GridSearchCV\n", 261 | "from sklearn.svm import LinearSVC\n", 262 | "#build a scaler component to pipeline:\n", 263 | "scaler = preprocessing.StandardScaler().fit(X_train)\n", 264 | "#see here for 'scaler in pipeline' details: \n", 265 | "#http://scikit-learn.org/stable/modules/preprocessing.html\n", 266 | "#X_train_scaled = scaler.transform(X_train)\n", 267 | "#Instantiate a model:\n", 268 | "logistic = linear_model.LogisticRegression()\n", 269 | "#this is the pipe!:\n", 270 | "svm = LinearSVC() # Instantiate the model\n", 271 | "tuned_parameters = 10.**np.arange(-3,5)\n", 272 | "pipe = Pipeline(steps=[('scale', scaler), ('svm', svm)])" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 4, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 287 | " intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',\n", 288 | " random_state=None, tol=0.0001, verbose=0))])\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "estimator = GridSearchCV(pipe,\n", 294 | " dict(\n", 295 | " svm__C=tuned_parameters))\n", 296 | "estimator.fit(X_train , y_train);\n", 297 | "print(estimator.best_estimator_)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 13, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZwAAAEQCAYAAACEM8KaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt8E2W+P/BPkjZp06SF3lPK3YK0tUBBFxRWvEApUkSW\nFkHFlQV2V9yfe1bOYRftli5UFM9x9awedRe14LqsWw6uFKzItWcrdbkUaLHlspRbIemNXpKmSZrM\n/P4oRAppk5TMLfm+Xy9fMMlk5jNDzbfP88w8I2NZlgUhhBDCMbnQAQghhAQGKjiEEEJ4QQWHEEII\nL6jgEEII4QUVHEIIIbyggkMIIYQXQVxuvLKyElu3bgUA5OTkIDU1tdd1S0tLsWvXLigUCixYsMC5\nbnNzM9555x04HA6MHDkSzz77LJeRCSGEcISzgsMwDIqKipCbmwsAKCgoQEpKCmQymcv1i4uLsWHD\nBlgsFhQUFKCgoAAA8Mknn+DJJ5/E6NGjuYpKCCGEB5wVHIPBAJ1OB6VSCQCIi4tzvuZKYmIiqqur\n0draiqSkJADdRau+vp6KDSGE+AHOCo7JZIJarUZhYSEAQK1Ww2g09lpw0tLSsHPnTtjtdmRkZAAA\n2tvbYbPZsGHDBnR2diIzMxP33XcfV5EJIYRwiLOCo9FoYDabsXTpUrAsi40bNyI8PNzluvX19aio\nqMCqVasAAHl5eUhLS4NGo4FarcbKlSvBMAxyc3Mxbtw4Z6uJEEKIdHBWcOLj46HX653LBoMB8fHx\nLtdlGAYOhwMAwLIsbDZbd7igIERFRaG1tRWRkZEICuo77t69e32UnhBCAssjjzzC+T44KzhyuRzz\n58/H2rVrAQDZ2dnO98rLy6FSqZCeng4A0Ol0SEpKwvr168EwDDIyMpytmKeffhoffPABzGYzJk+e\n7LZ1c2ObhBBCPFNRUcHLfmT+NFv03r17JV1wysrKMGXKFKFj9IuUswOUX2iUX1gVFRW8tHDoxk9C\nCCG8oBYOIYQEOGrhEEII8SucTm1DvCPlfmApZwf4z28ymdDW1tbrzBveamtrQ0REhE+2JQTKzy2W\nZREREQGNRiNoDio4hPCsqakJMpkMCQkJPis4CQkJPtmOUCg/t1iWxbVr12CxWBAdHS1YDupSExEp\ntxCknB3gN7/NZkNUVJTPig0h7shkMkRFRTnvcRQKFRxCCCG8oIIjImVlZUJH6DcpZwekn58QKaCC\nQwjxidbWVmRmZiItLQ3vvPNOr+u9/vrrOHv2bJ/bGjx4sFf7fu+999DZ2XlH+yTco/twCOHZ1atX\nRT/IfCdef/11hIWF4YUXXuj3NoYMGYJLly55vP64ceOwb98+REZG9nufgaC3nz26D4cQwjuj0YjU\n1FTn4LLdbkdaWhra2tqc769YsQLz5s3Dfffdh3Xr1nm87Q8//BCzZs3CsGHDcPz48R7vnThxAtOm\nTUNWVhYKCgpw8+/Bfe3TYrEgMzMTDQ0NePLJJzFr1izU1dV5tM+LFy8iJycHmZmZyMjI6NGt+tpr\nr+E3v/kNli1bhocffhjLli3z+DiPHz+Oxx9/HLNnz8YzzzyD5uZm53uXLl3C/fffj1dffRWPPvoo\nHn/88R6fHTt2LDZv3ozp06dj6tSpPYru//zP/+DRRx/FjBkz8Ktf/QoWi8X53l/+8hesWLECzz33\nHGbMmIFXXnnF47x8ooIjIlIeR5BydkD6+X1Fq9XiwQcfxNdffw0A2LdvHyZNmuS8x0Sr1WLdunXY\ntm0bSktLsWXLFhgMBo+2/ZOf/ARffvkl0tLSbrtC7+c//znWrl2L4uJiPPLII7BarT0y9bbPkJAQ\nlJSUIDY2Fp999hm+/PJLJCYmerTPn/70p1i2bBlKSkqwceNGrFixAq2trc73q6ur8eabb2Lv3r04\ncuQILly44PYYbTYbfvGLX+CDDz7Ajh07MH/+fKxZs6bHOufPn0dycjL27NmDL774osd7MpkMZ86c\nwe7du/GPf/wDQ4YMAQDs378fO3bsQElJCb7++muoVCq8+eabPT574MAB/PrXv8bXX3/t1S8CfKL7\ncAgRoRkbj/lkO18vHe/1Z5566im8//77mD17Nj777DMsXry4x/sKhQK7du3CpUuXoFQq0dDQ0Ouj\nRzzR2toKo9GIqVOnAgAmTZqEkJAQTvdpNBpRV1eH6dOnA+geM/rBD36AQ4cOYcaMGZDJZJgxYwa0\nWq3z/RutvL6cPXsWV65cwfLlywF0P3pFpVL1WGfEiBGYO3dur9t46aWXbntt3759WLhwIYKDgwEA\nS5cuxfLly7F69WoA3YUqKytL9E9HpoIjIlK+l0XK2QHx5e9PofCVyZMnY+XKlbh06RK+++47PPjg\ng873vvvuO/zsZz/DkiVLcM899yAqKgp3Ogwsl/fd0cLFPgHctg2GYXq0gvqzD4VCgSFDhmD79u13\nnO9WDMP0+PutLTYpDMdTlxohpAeZTIb58+dj6dKl+NGPftTjvdLSUsyYMQPPPfccwsPDcenSpTv+\nogsPD0dsbCy+/fZbAMCuXbtgNpu92qdKpUJDQwMAz754tVothg4dipKSEgDAhQsXcOjQoTt+hH1S\nUhKsVit27NjhfM0XheDRRx/Fli1bnF2Nf/rTn5ytM1/tgw9UcEREyuMIUs4OSD+/ry1YsACVlZVY\nuHBhj9fnzZvnLADvvvsuJk+e7Pyiv9mHH36Ixx57DF1dXR7t7+2338bq1asxc+ZMHDlyBGq12qt9\nLlmyBE899RSeeOIJfPLJJx7t8/3338fHH3+MmTNnYvny5Xjvvfd6zIfWn5kgFAoFPv30U2zevBkZ\nGRnIzMzEn/70px7r9LXd3t578MEH8fjjj+Oxxx7D9OnTYbfb8ctf/rLH56QwcwVdFi0iUp4AU8rZ\nAX7z+/tl0US86LJo4iTlL2wpZwekn58QKaCCQwghhBdUcEREyuMIUs4OSD8/IVJABYcQQggvqOCI\niJTHEaScHeA//833VBDCBzH8zFHBIYRn0dHRuHLliii+AEhgYBgGV65cEfRpnwDNNCAqUr60WMrZ\nAX7zK5VKxMXFeTwHmSfa2tp63EMiNZSfe3FxcVAqlYJm4LTgVFZWYuvWrQCAnJwcpKam9rpuaWkp\ndu3aBYVCgQULFvRYt6urCy+++CLmzJmDmTNnchmZ+AGWZWGxM+iwOdBhc8B0/c8OmwMma/ey+frr\nN95rvhaCsv0XMDA0CAPVwd1/hn7/Z0RIEBRy391Yp1QqfXovTm1tLcaMGeOz7fGN8gcGzgoOwzAo\nKipCbm4uAKCgoAApKSm93g1bXFyMDRs2wGKxoKCgAAUFBc73du/ejREjRkjiTto7IeUWgi+zOxgW\n5q6bCoPVgY6u63+6KiLX1zHftE6QXIYwlQIaZRDClHKEKRXQKBXOP9VKBWI1SmhU3a/JZbFo7bTj\nWmcXWjvtqG3uREunHS2dXWjptMNktUOrCkKkOggDQoMR
GXrLn+rvC1R4SBDkPP+sSvlnB6D8gYKz\ngmMwGKDT6ZxNuBtdCDqdzuX6iYmJqK6uRmtrK5KSkpyvW61WVFZWYtKkST2e/0DEy+Zg0HFTkXDV\nori5xXFrEbHYGYQG3ygQcoQpg3r8PUwpR7Q6GEMHhiJMKYdG2bOwhCkVCFb4dnjSwbBotdjRYu7q\nUYiazF0429zZvWzufr3D5kBEiOuW0s0tqMjQYGhUCt6LEyFC4azgmEwmqNVqFBYWAgDUajWMRmOv\nBSctLQ07d+6E3W5HRkaG8/WSkhLMnDmzx3Mq/JUYxkFYlkVnF9N3i+KWItJhc6CxzQRGoUSHzQGG\nhfOL32XRUAUhMSL4erEIur5u9981KgVCg+W8fwm7O/cKuQxR6mBEqYPdbqvLwaDNYse1TjtaO7tw\n7XohqjfZcKqxw1mYWi12dHYx3cXpekFy1YIaGBqMgeru89dbK18MPzt3gvIHBs4KjkajgdlsxtKl\nS8GyLDZu3Ijw8HCX69bX16OiogKrVq0CAOTl5SEtLQ12ux2nTp3C3LlzceDAAY/2e/M//I2b+aSy\nXFVVdcfbY1hg3L2T0GFzoOzQUVgcwPDRyeiwOVB16iwsDhmi4gehw+bARX0DLIwMwaEamGwOtHZY\nYWUAVdD1bia7FSoFC13UQGhUCrQ3NyBEzuLukcOgC1eh7vxZDJID901Kw+mTxxGisCBEzmLa1Acg\nk8lc57UDUyZ9v+wAkCaS8+/L5egwJU4dOwQAmHnz+0pgysPfL9sZIDn9PrR0duGbo5UwtcsQqh2G\nq+1WlH53ESa7DIxSjWvmLljtDoQpWMQP0GBgaBBs7c0IC2KRNmoEFLZezjct07KHy3zgbPJOhmGQ\nl5eH3NxcsCyLdevWYe3atS7X1ev12Lx5M1atWgWWZbF69Wrk5+fj5MmT2LlzJ7RaLRobG+FwOPDC\nCy/0eKLfzaQ+eSfLsrA5WI8Hunu0Pq53YVntDMKUCqiDFdCovh+v0PRocdz0d9Xtrwf5cHCc+I7V\nzvQYZ7p2vVvvUksn9EYb/nvOKL8f5yTc4GvyTs5aOHK5HPPnz3cWmezsbOd75eXlUKlUzuKg0+mQ\nlJSE9evXg2EYZGRkQKlUIj093bnOgQMHYLVaey02UuFgWOiNVlxutaKuzYK6Nisut1lwtd0Ko8UB\nAM6B7FsLwY3l6LDgHgXl5veF6I4i/FAFyRGnVSJO2/PSVoZlseLvp/HNhTZMGT5AoHSEuEePJ+BI\nm8WOulYLLrd1F5bLbVZcbrWg3mRDtDoYiREhSBygwuCIECRGqDAoQoXvjh7CtB9Ksx9Y6n3YUs//\n8Vfl+MY0AB/Mu9unl2/zRernX+r5Jd/CCQRdDgb6dhsutVm6WyutVmeLhWWBxAgVEgeEYHCECo/e\nFYnBA1RI0KqgDHJ9BVUvLxPi1l1hDlTZFdj7r2uYMSpK6DiEuEQtHDdYlkVLp93ZSqlrvdENZkVj\nhw2xYUokRqgweEBIjz8HhARRfzrh1XcGE147cBEfZo+B0seXhRP/Ri0cntnsDK60d7dO6lq/7war\na7NCIQMSI0Iw+HoX2D06DRIjQqDTKn1+vwch/ZUSr8GwgSHYWdOEJ1JjhY5DyG0CquCwLIsmcxfq\nWq8XlhvjK61WtHR2IV6r6m6lRKgwNkGLx8ZEY3BECMJD+DlNUu4HlnJ2wH/y/3iiDqu/OoeMUVFQ\nKxVCx/KYv5x/0je/LDidXQ5cud7t5bwSrNWCK+1WhATJuwfsrxeW9EFaDI5QIV6rkuRgKyE3Gxml\nxrgELbZ914inx8cLHYeQHvxuDOc/TyvRbrEjIVzV3Q0WoepxNZhG5Zc1lhCnK21WvLj9ND7KTuat\ndU6kjcZw+um/ZichNkxJrRUSsAZFqPDD4QPx1xP1WP6DQULHIcTJ70a8dRLuGrsx1YQUSTk74H/5\nnxofj11nmtHUYRMokXf87fwT1/yu4BBCgKiwYGSOjsKfj/nuIW+E3Cm/G8MRy0wDhAit3WLHkqJq\nvD1nFAZFhAgdh4gYX2M41MIhxE+FhwRhXmosCo/qhY5CCAAqOKIi5X5gKWcH/Df/E6kxqNKb8K8m\nM8+JvOOv55/0RAWHED8WGqzAk+Pi8fERauUQ4dEYDiF+zuZg8JOiGvzHtKG4J14jdBwiQjSGQwjx\nCaVCjsUT4vHR4avwo98viQRRwRERKfcDSzk74P/5Hx4ZCZPNgUOX23lK5B1/P/+kGxUcQgKAQi7D\njyfo8PERPRhq5RCB0BgOIQGCZVm8uP0MnkiNwUMjI4WOQ0SExnAIIT4lk8nw3L0J2HRUDzvjN79n\nEgmhgiMiUu4HlnJ2IHDyj0/QIk6jwlenmzlO5J1AOf+BjgoOIQFmyb06fHrMAIudEToKCTA0hkNI\nAPrdnvO4O0aNnLFxQkchIkBjOIQQzvx4gg5FVQ0wWe1CRyEBhAqOiEi5H1jK2YHAyz9kYAgmDQlH\nUVUDR4m8E2jnP1Bx/sTPyspKbN26FQCQk5OD1NTUXtctLS3Frl27oFAosGDBAue6f/zjH6HX68Ew\nDJ5//nnExVE3ACF36pl0HX7++SnMTY7BQHWw0HFIAOB0DIdhGOTl5SE3NxcAUFBQgDVr1kAmc/1E\nzpUrV2LDhg2wWCwoKChAQUFBj/dPnjyJ8vJyLFu2zOXnaQyHEO+8V14HhmWx4v7BQkchAvKLMRyD\nwQCdTgelUgmlUom4uDgYDL0/gTAxMRHV1dWoqKhAUlLSbe+HhIQgKIjzRhkhAePJcXHYd64FeqNV\n6CgkAHBacEwmE9RqNQoLC1FYWAi1Wg2j0djr+mlpadi5cydKS0tddr3t378fM2bM4DKyoKTcDyzl\n7EDg5h8YGow5yTH4pELYR1EH6vkPNJwWHI1GA7PZjEWLFmHhwoXo6OhAeHi4y3Xr6+tRUVGBVatW\n4eWXX0ZxcTFsNpvz/SNHjiAhIQGDBg3qc583/8OXlZVJarmqqkpUeWg5MJbn3xOLI5fbsW3vN6LI\nQ8vCLPOBtzEclmWxbt06rF271uW6er0emzdvxqpVq8CyLFavXo38/HwolUrU1tairKwMixcv7nN/\nNIZDSP9srazHyfoOrJk+QugoRAB8jeFwOiAil8sxf/58Z5HJzs52vldeXg6VSuUsEDqdDklJSVi/\nfj0YhkFGRgaUSiUA4M0330RUVBTy8/MxePBgLFmyhMvYhAScrOQYbPuuETUNHRgTGyZ0HOKnaKYB\nESkrK8OUKVOEjtEvUs4OUH4A+PJUEw7UtmDDrNsv2OEanX9h+cVVaoQQ6cgYFYVGUxcqrojzIW1E\n+qiFQwhxOnCuBVurGvCHx0f1er8c8T/UwiGE8O6HIwbAwbIou9AmdBTih6jgiAjflyj6kpSzA5T/\nBrlMhiUTE1B45CocPD6kjc5/YKCCQwjpYWKiFgNCg7HnX9eEjkL8DI3hEEJu8129Cev3X8BH2clQ\nKuj3Un9HYzi
EEMGkxGkwfGAodtY0CR2F+BEqOCIi5X5gKWcHKL8rz01MwF9P1MNsc/h827ei8x8Y\nqOAQQlwaERWKcQlabDspjoe0EemjMRxCSK+utlvx/744jQ+zkxERQo8G8Vc0hkMIEVxCuAo/HDEQ\nn52oFzoK8QNUcEREyv3AUs4OUP6+PDU+HrvONKOxw+Z+5X6i8x8YqOAQQvoUpQ7GrNFR+LPAD2kj\n0kdjOIQQt4xWO5YU1eD3WUlIjAgROg7xMRrDIYSIhlYVhHmpMdh0VC90FCJhVHBERMr9wFLODlB+\nT8xNiUGV3oSzTWafb5vOf2CggkMI8UhosAILx8Xj4yNXhY5CJIrGcAghHutyMPjJ1hqs/OFQpOk0\nQschPiKaMZyXX34ZpaWl6Orq4jwMIUTcghVyLE7X4aPDV+FHv6sSnrgtOEuXLsW5c+fw0ksvobCw\nEHV1dXzkCkhS7geWcnaA8nvjoZED0dHlwD8v++5R1HT+A4PbuSqGDx+O4cOHw26348iRI1i/fj2i\no6ORlZWFiRMn8pGRECIiCrkMz03UofDIVdw3OBxyehQ18ZBHYzjXrl1DaWkpysrKMGTIEEydOhXH\njx8HACxZsoTzkJ6iMRxC+MGyLH5ZfAaPJ8fg4bsihY5D7hBfYzhuWzivvvoqGhsb8fDDD2PNmjXQ\narUAgPT0dOTm5nIekBAiPjKZDM9NTMBbZZcwdfgABNND2ogH3P6UzJkzB7///e+RlZXlLDY3zJw5\nk7NggUjK/cBSzg5Q/v4Yl6BFvFaFr0433/G26PwHBrctnNTU1F7fe+CBB/r8bGVlJbZu3QoAyMnJ\n6XNbpaWl2LVrFxQKBRYsWOBc15ttEEL4tWRiAvJ212L6qCiEBFErh/TN7U/IhQsXbnutpqbG7YYZ\nhkFRURFeeeUVvPLKKygqKurzMsri4mKsW7cOv/nNb7Bly5Z+bUPqpkyZInSEfpNydoDy99eoGDWS\n48LwxXeNd7QdOv+BwW3B2bhx422v3SgIfTEYDNDpdFAqlVAqlYiLi4PB0Ptss4mJiaiurkZFRQWS\nkpL6tQ1CCP+enaDD1qoGmKx2oaMQkXNbcOTy21fxpJVhMpmgVqtRWFiIwsJCqNVqGI3GXtdPS0vD\nzp07UVpa6uw283YbUiflfmApZwco/50YMiAEk4aEo6iy/4+ipvMfGNwWHIVCgaamJueyXq93WYRu\npdFoYDabsWjRIixcuBAdHR0IDw93uW59fT0qKiqwatUqvPzyyyguLobNZvNqGzfc/A9fVlYmqeWq\nqipR5aFlWvZ0+Zl0Hf5eZcBXB74RRR5a9n6ZD27vw6mursb777+PSZMmgWEYHDx4ECtWrEBKSkqf\nG2YYBnl5ecjNzQXLsli3bh3Wrl3rcl29Xo/Nmzdj1apVYFkWq1evRn5+PoKCgjzeBkD34RAipPe+\nrQPDsFhx/2ChoxAv8XUfjkc3fjY0NODYsWOQyWQYN24cYmNjPdr4iRMnnFeYZWdnIy0tDQBQXl4O\nlUrVozhs27YNp0+fBsMweOCBBzBt2rQ+t+EKFRxChNPa2YWfbK3BO3NHQ6dVCR2HeEFUBUcqpF5w\nysrKJHu1i5SzA5TfVzYf1cNgtOI/pg3z6nNiyd9fUs8vmpkGAKClpQWtra3OiwVaW1sl/cVOCOHG\nj+6JxXN/q8b5a50YHhkqdBwiMm5bOFu2bMGBAwcQHByM8PBwNDQ0YMyYMXjppZf4yugxqbdwCPEH\nW6saUGUwIX/6CKGjEDccDIvXDlxA5oAWcbRwvv32W/zhD3/Avn37MGzYMKjVauzYsYPzYIQQaZoz\nJhrbTjagpqEDY2LDhI5D+vDtpTY0mrqAAfzsz+31zTExMVAqlYiJicGlS5cwZMgQXLlyhY9sAYfv\nSxR9ScrZAcrvS8ogOZ4ZH+/VQ9rElL8/pJq/uKYJWcnRvO3PbcGJjIyEyWTCmDFjsHv3bvzlL3/x\n6+llCCF3bsaoKDSbu1BxxX9v1Ja6ujYLaps7MXU4T80beDCG09nZidDQ7sG/ixcvoqqqCg888AAG\nDhzIS0Bv0BgOIeJRWtuCv1XW453HR0NGD2kTnfe/rUOwQo6f3JvA21Vqbls4N4oNAAwdOhSzZ88W\nZbEhhIjL1OEDwLLAPy60Ch2F3MJiZ7Dn7DU8dncUr/ul+cRFRKr9wIC0swOUnwtymQxL7k1A4RE9\nHEzf3fBizO8NqeU/cK4FY2LDEM/zDbpuC05+fj4fOQghfmjCIC0iQ4Ox++w1oaOQ61iWxfbqRl4v\nFrjBbcGxWq185CCQ9jM1pJwdoPxckV1v5fz5mB42O9PremLN7ykp5T/daIbJ5sDExL4nQuaC24Iz\nduxYlJeX85GFEOKHkuPCMCIyFDtONblfmXBue00TZo+JhlyACzncFpxDhw7hnXfewUsvveT8b+XK\nlXxkCzhS6we+mZSzA5Sfa89NTMBnJ+phtjlcvi/2/O5IJX+bxY7yi23IGMXvxQI3uJ1pYNWqVXzk\nIIT4seGRoRifoMW2kw14Ol0ndJyAtetMM+4fGoGIEI+m0fQ5mi2aEMKLq+1W/OKL0/goO1mwL7xA\nxrAsfvy3aqx+aBjuvmXKIdHch0MIIb6QEK7CtBED8dfjBqGjBKQjde3QqhQYHaMWLIPbXzNee+21\n216TyWTU1cYBKT9TQ8rZAcrPl0Xj47H8f2vwRGosYjVK5+tSyd8bKeQvrm5C1pgYQWd9cFtwsrKy\neiyfPn0aFouFs0CEEP8VpQ7GrLuj8ekxA/5t6hCh4wQMg9GKmoYOvPzIcEFz9GsM56OPPsKSJUu4\nyHNHaAyHEPEzWu1YUlSD32clITEiROg4AeHDw1fR5WDws0mJLt8X7RiOxWJBXV0dF1kIIQFAqwrC\nvNQYbDqiFzpKQLA5GOw63YzZY/ifWeBWbgvOM888g8WLFzv/e/755zF27Fg+sgUcqVzL74qUswOU\nn29zU2JQVW/C2SYzAOnlv5WY8//jfCtGRIWKojXpdgznk08+4SMHISSAhAYrsGhcPD4+chWvzrxL\n6Dh+rbi6CdlpsULHAECXRYuK2K9y6YuUswOUXwiZo6NQ12ZFpd4oyfw3E2v+c81mNHTYMGlIhNBR\nAHhQcM6fP3/bazU1NZyEIYQEjmCFHIvTdfjosJ6eIsyR4pomPHZ3NBRycTwAz23B+fDDD297bcuW\nLZyECXRi7gd2R8rZAcovlIdGDoS5y4GPvvpW6Ch3RIznv8PmwP/VtiJztDDzprnidgxHLr+9Jnn6\n20hlZSW2bt0KAMjJyUFqaqrL9cxmM9544w3ncm1tLTZt2gQAKC0txa5du6BQKLBgwYJet0EIkR6F\nXIZnJ+jwwT868BzLCjKDsb/affYaJiRqEakOFjqKk9uCo1Ao0NTUhOjo7kvq9Hq9yyJ0K4ZhUFRU\nhNzcXABAQUEBUlJSXN7lqlarkZeXBwC4ePEiSkpKnO8VFxdjw4YNsFgs
KCgoQEFBgWdHJkFi7Qf2\nhJSzA5RfSPcPjcCnxzT45kIbpg4fIHScfhHb+WdZFsXVjXhxirhurnVbcLKzs/G73/0OkyZNAsMw\nOHjwIFasWOF2wwaDATqdDkpl9/QVcXFxztf6UlJSgszMTOdyYmIiqqur0draiqSkJLf7JYRIi0zW\n3crZePgq7h8aIZrxBik7oTdBLpfhnvgw9yvzyG1TJTk5Ga+88gqioqIQGxuLNWvWICUlxe2GTSYT\n1Go1CgsLUVhYCLVaDaPR2OdnjEYjmpubMXToUOdraWlp2LlzJ0pLS/2+O02M/cCeknJ2gPILzXax\nCqFBcvzf+Raho/SL2M7/9uomZI2JFnTeNFc8uiw6NjYWGRkZmDFjBmJjPbueW6PRwGw2Y9GiRVi4\ncCE6OjoQHt73I0337NnTY3qF+vp6VFRUYNWqVXj55ZdRXFwMm83W5zZu/ocvKyuT1HJVVZWo8tAy\nLfO1LJMBE0Ka8cdvzsPBsILnkfJyU4cNJ/RGhDWe9urzfHA7l5rdbkdQUM+et87OToSGhva5YYZh\nkJeXh9zcXLAsi3Xr1mHt2rW9ru9wOLBmzRrk5+c7x4j0ej02b96MVatWgWVZrF69Gvn5+c5uulvR\nXGqESBcZsJN9AAAWPUlEQVTLsnhp51nMHBWFGQI9kdIfbD6qR5vFjl88MNjjz4hmLrVbiwTLsi4f\nWXDbhuVyzJ8/H2vXrsW6deuQnZ3tfK+8vBwVFRU91j98+DAmTJjQ44IEnU6HpKQkrF+/Hq+++ioy\nMjJ6LTaEEGmTyWT48QQdPj1mgJ2h+3L6w86wKBHJvGmuuL1o4NYGkEwm8/iy6LFjx7qcd23y5Mm3\nvTZp0iSX25g3b55H+/IHZWXif6ZGb6ScHaD8QruRP02nRbxWid1nmpF5tzi/NF0Ry/k/eLEVCeEq\nDI/suwdKKG5bOA6Ho8e4SWdnJ7q6ujgNRQgJXM9OSMCnxw2wORiho0hO8fWLBcTKbQvnwQcfxOuv\nv445c+bA4XDgiy++wLRp03iIFnjE8BtSf0k5O0D5hXZz/uS4MAwdEIqvTjdjTnKMgKk8J4bzf6nF\ngsutFjwwTBzzprnituDMmDEDWq0We/fuhUwmQ0ZGBu6//34+shFCAtSzE3RYs7sWGaOioAqiOYY9\nUVzThJmjoxCsEO/58ijZ5MmT8atf/QrLly+H1WrF+vXruc4VkPi+RNGXpJwdoPxCuzX/qBg1kmLU\n2HmqSaBE3hH6/Hd2ObDv3DXMEvm4l9sWjtlsxpEjR3Dw4EE0NjZi7NixyMrK4iMbISSAPZuuw+qv\n/oXM0VEIDVYIHUfU9p1rwT3xGsRqxH0Vb6/34ZSVleHgwYPQ6/W49957cezYsR4TbIoR3YdDiH9Z\nt/c8RkWrkTM2TugoosWyLH7++Sksu28QJiT2fXN9bwS/D+cPf/gDlEolfvvb32LRokV0/wshhHfP\npMejqKoBZptD6CiiVd3QAaudxfhBWqGjuNVrwXn77bcxZMgQvP7661i7di3a29thMpn4zBZwhO4H\nvhNSzg5QfqH1ln/owFCkD9Li8+8aeU7kHSHPf3F1E2aPiZbEox16HcOJj4/HvHnzMG/ePFy9ehUH\nDx5Efn4+QkNDkZ6ejrlz5/KZkxASoJ5Jj8cvt5/B48nR0KjcDjsHlJbOLhy63I4V9ycKHcUjbudS\nu1VdXR0OHjyInJwcrjL1G43hEOKf/rP0ImI0Sjw7oe/HmwSav54w4EqbFS/9cKj7lfsg+BhObxIT\nE0VZbAgh/uup9Hhsr25Eu8UudBTRcDAsdtY0I0siN8cC/Sg4hDtS7oeXcnaA8gvNXX6dVoWpwweg\nqLKep0TeEeL8H7rcjoGhQRgVreZ93/1FBYcQIgmLxsXjy9PNaOmkuRwBoLimEVnJ4r7R81Zej+GI\nGY3hEOLf3j14GUFyGX46SRqD5Fy52m7Fi9vP4NMnU6D0wdQ/oh3DIYQQoTw5Lh5fn72G5o7AbuXs\nqGnCjKRInxQbPkkrrZ+Tcj+8lLMDlF9onuaPUgdjRlIk/nrCwHEi7/B5/q12BrvPXsNjIn4MQW+o\n4BBCJCVnbBz2nWtBg8nmfmU/VFrbglHRaiSEq4SO4jUawyGESM6Hh6/CaLXjl1OGCB2Fd7/44jSe\nGh+PSUN899wbGsMhhJBeZN8Ti7LzrdC3W4WOwqszjWa0dtpxbz8n6RQaFRwRkXI/vJSzA5RfaN7m\nDw8JwpzkGHx6TBxjOXyd/+KaRjw2JgoKufjnTXOFCg4hRJLmpcbgn5fbUddmEToKL4xWO7650IaM\nUVFCR+k3GsMhhEjWp8cMuNRqwW8eGiZ0FM79b1UDzjaZ8euHhvl82zSGQwghbjyREoOKK0ZcaOkU\nOgqnGJbFjpomyc0scCsqOCIi5X54KWcHKL/Q+ptfrVQg+55YfFIh7FgO1+f/2BUjVEFyJMeGcbof\nrnH6cInKykps3boVAJCTk4PU1FSX65nN5h6Pr66trcWmTZsAAM3NzXjnnXfgcDgwcuRIPPvss1xG\nJoRITFZyNLb9rRrnms0YGSWdiSy9UXy9dSOTwEPW+sLZGA7DMMjLy0Nubi4AoKCgAGvWrHF7wi5e\nvIiSkhL87Gc/AwC89dZbyMzMxOjRo93uk8ZwCAlM20424MRVE/JnjBA6is81mGz4+een8OcnUxAa\nrOBkH5IfwzEYDNDpdFAqlVAqlYiLi4PB4L7ZW1JSgszMTADdRau+vt6jYkMICVyz747G2SYzzjSa\nhY7ic1+easLDIyM5KzZ84qzgmEwmqNVqFBYWorCwEGq1Gkajsc/PGI1GNDc3Y+jQ7qfXtbe3w2az\nYcOGDcjPz8ehQ4e4iisKUu6Hl3J2gPIL7U7zK4PkeHJcHDYd1fsokXe4Ov9dDgZfnW6W/MUCN3BW\ncDQaDcxmMxYtWoSFCxeio6MD4eF93x27Z8+eHs06jUYDtVqNlStX4uWXX8bnn38Om63v+ZNu/ocv\nKyuT1HJVVZWo8tAyLUtpObzpNM4YWlFd3yGKPL5Y/nDXPzFkYAiGDAjhfH984GUMh2VZrFu3DmvX\nru11fYfDgTVr1iA/Px9y+fd18K233sLixYsRGRmJ3Nxc5ObmQqlUutwGjeEQEthKTjXhQG0LXp+V\nJHQUn/jVjjN4IiUWU4cP4HQ/fI3hcHaVmlwux/z5851FJjs72/leeXk5VCpVj+Jw+PBhTJgwoUex\nAYCnn34aH3zwAcxmMyZPntxrsSGEkOmjovBZZT0q9Uak6bRCx7kj5691Qt9uw+ShvpukU2g004CI\nlJWVYcqUKULH6BcpZwcov9B8mX/32WaUnG7Gfz2WxNtlxFyc///+5jIGhgbhmXSdT7friuSvUiOE\nECE8PDISrZ12VFzp+yIlMeuwOVBa24JZo/3jYoEbqIVDCPE7+8+14POTDXh7zihJ3iy5vboRJ/Qm\n5D4ynJf9UQuHEEL66cERA9B
pZ3DocrvQUbzGsmz3zAISfIS0O1RwRITvSxR9ScrZAcovNF/nl8tk\nWJyuw6ajevDRiePL/FWGDjAMi7E6jc+2KRZUcAghfumBYRFgAXxzsU3oKF4prmlEVnKMJLsC3aGC\nIyJSvspIytkByi80LvLfaOVsPqoHw3Erx1f5r5m7cLTOiOlJkT7ZnthQwSGE+K1JQ8KhCpLj/2pb\nhY7ikZLTzfjhiAEIU0p/3jRXqOCIiJT74aWcHaD8QuMqv0wmw7MTdPikQg8Hw10rxxf5HQyLnaf8\n82KBG6jgEEL82oRBWoSHBGH/uRaho/Tp20ttiA1T+u0zfQC6D4cQEgCOXzXirbLL+HD+GCjk4hyM\n/3XJvzA9KRKP3MX/+A3dh0MIIT4yLkGLmLBg7D57TegoLtW1WVDb3Mn5JJ1Co4IjIlLuh5dydoDy\nC42P/D+eoMOnxwzocjA+3/ad5t9R04SZo6OgVPj3V7J/Hx0hhFyXEq/B4AEq7DojrlaOxc5gz9lr\neOxu/71Y4AYqOCIi5XsppJwdoPxC4yv/4nQd/nLcAJvdt62cO8m//1wLkuPCEKf1/0evUMEhhASM\nu2PDcFdUKHaeahI6CoDr86ZVNyJrTIzQUXhBBUdEpNwPL+XsAOUXGp/5F6fr8FllPSw+bOX0N/+p\nRjM6bA5MSJT2w+I8RQWHEBJQ7opWIzk2DMXVjUJHQXFNE2aPiYbcD+dNc4UKjohIuR9eytkByi80\nvvM/k65DUWUDzDaHT7bXn/xtFju+vdiGjFFRPskgBVRwCCEBZ3hkKMYlaPCFgK2cXWeaMXloBMJD\nggTLwDcqOCIi5X54KWcHKL/QhMj/TLoO2042osMHrRxv8zMsix1++pC1vlDBIYQEpMEDQnDv4HD8\nb1UD7/s+UtcOrUqB0TH+O2+aK1RwRETK/fBSzg5QfqEJlf/p8fHYXt2Idov9jrbjbf7i6iZkjfHP\nh6z1hQoOISRgJYSr8MCwAby2cgxGK2oaOjBt5EDe9ikWVHBERMr98FLODlB+oQmZf9G4eOw41YTW\nzq5+b8Ob/DtPNePRpEiEBAXe1y+nl0dUVlZi69atAICcnBykpqa6XM9sNuONN95wLtfW1mLTpk3O\n5a6uLrz44ouYM2cOZs6cyWVkQkiAidMqMW3EQPytsgHLfzCI033ZHAx2nW7Gm1lJnO5HrDgrOAzD\noKioCLm5uQCAgoICpKSkuOyzVKvVyMvLAwBcvHgRJSUlPd7fvXs3RowY4ff9nVLuh5dydoDyC03o\n/AvHxeGn205h/j2xiFQHe/15T/P/43wrRkSFIjEixOt9+APO2nQGgwE6nQ5KpRJKpRJxcXEwGAxu\nP1dSUoLMzEznstVqRWVlJSZOnAg/elYcIUREosOUeDQpEn89Uc/pfoqrmzAnObAuhb4ZZwXHZDJB\nrVajsLAQhYWFUKvVMBqNfX7GaDSiubkZQ4cOdb5WUlISMN1oUu6Hl3J2gPILTQz5n0yLw95/XUNj\nh83rz3qS/1yzGY0dNvxgcER/4vkFzgqORqOB2WzGokWLsHDhQnR0dCA8PLzPz+zZs6fHY07NZjNO\nnTqFcePGebzfm//hy8rKJLVcVVUlqjy0TMuBtPxdxT+RGtaJLcfqOdn+n/afRGpoh/MR10If763L\nfJCxHPVTMQyDvLw85ObmgmVZrFu3DmvXru11fYfDgTVr1iA/Px9yeXcdrKiowM6dO6HVatHY2AiH\nw4EXXngBiYmJLrexd+9epKenc3E4hJAA0GaxY0lRNd6dOxrxWpXPtmuy2rH4s2psnD+mX2NEXKuo\nqOjxyz5XOLtoQC6XY/78+c4ik52d7XyvvLwcKpWqR3E4fPgwJkyY4Cw2AJCenu5c58CBA7Barb0W\nG0IIuVMRIUGYPSYanx4z4KUfDnX/AQ/tPnsNExO1oiw2fOL0suixY8di7Nixt70+efLk216bNGlS\nn9uaNm2ar2KJVllZmeBX6/SXlLMDlF9oYso//55YPPe3alxps2JQhGetnL7ysyyL4pom/NvUIb6M\nKUmBd+cRIYT0QasKwtyUGPz5mN4n2zuuNyFILkNqXJhPtidlVHBERCy/4fWHlLMDlF9oYsv/RGos\njtQZcanF4tH6feXvnjct2u/vI/QEFRxCCLlFmFKBH90Tg08q7qyV09Rhwwm9EY/cFemjZNJGBUdE\n+L5E0ZeknB2g/EITY/7Hk2NQaTDh/LVOt+v2lv/LU82YNmIg1EqFr+NJEhUcQghxITRYgey0OGw+\n2r9Wjp1hUXK6GbMD7CFrfaGCIyJi68f2hpSzA5RfaGLNnzUmGqcazTjbZO5zPVf5D15sRUK4CsMj\nQ7mKJzlUcAghpBeqIDmeHNu/Vs6NiwXI96jgiIgY+7E9JeXsAOUXmpjzZ94dhdprnahp6Oh1nVvz\nX2qx4HKrBQ8MC9x501yhgkMIIX1QKuRYND4em7xo5RTXNGHm6CgEK+gr9mZ0NkRErP3YnpBydoDy\nC03s+TNGReFquxVVBpPL92/O39nlwL5z1zDrbupOuxUVHEIIcSNILsNT4+Ox6Yje7XO59p1rQVq8\nBrEaJU/ppIMKjoiIuR/bHSlnByi/0KSQ/9G7ItFs7sLxq7e3cm7kZ1kWxdWNyArgh6z1hQoOIYR4\nQCGX4en07rGc3lo51Q0dsDlYjEvQ8pxOGqjgiIjY+7H7IuXsAOUXmlTyTxsxEB02Bw7Xtfd4/Ub+\n7dVNmD0mGnKaN80lKjiEEOIhhVyGZ9Ljsfmo4bZWTktnFw5fbsf0JJo3rTdUcERECv3YvZFydoDy\nC01K+acMHwA7w6D8UpvztbKyMnx1uhlThg2AVsXpY8YkjQoOIYR4QS6TYfEEHTYf1YO53sphWGDn\nqSa6WMANKjgiIpV+bFeknB2g/EKTWv7JQyIQJJej7EIrACB4yD2IDA1GUrRa4GTiRgWHEEK8JJPJ\nsHhCPD45aoCDYVFcQ5dCe4IKjohIqR/7VlLODlB+oUkx/72J4QhTKrDlRD2q9e14cPhAoSOJHhUc\nQgjpB5lMhmevj+WMjeiCMoi+Tt2Rse7maZCQvXv3Ij09XegYhJAAwbIsNh66isdTYiQ9lU1FRQUe\neeQRzvdD1+8RQkg/yWQyLPvBIKFjSAbnBaeyshJbt24FAOTk5CA1NdXlemazGW+88YZzuba2Fps2\nbQIA/PGPf4RerwfDMHj++ecRFxfHdWxBlJWVSe5qnRuknB2g/EKj/IGB04LDMAyKioqQm5sLACgo\nKEBKSgpkLqZ9UKvVyMvLAwBcvHgRJSUlzveWL18OADh58iS2b9+OZcuWcRmbEEIIBzgd5TIYDNDp\ndFAqlVAqlYiLi4PBYHD7uZKSEmRmZt72ekhICIKC/LcXUMq/IUk5O0D5hUb5AwOn394mkwlqtRqF\nhYUAulsxRqMROp2u188YjUY0Nzdj6NCht723f/9+zJo1i6u4hBBCOMRpC0ej0cBsNmPR
okVYuHAh\nOjo6EB4e3udn9uzZ4/JqiSNHjiAhIQGDBvnvAJ0U70W4QcrZAcovNMofGDht4cTHx0Ov//454AaD\nAfHx8b2u73A4UFFRgfz8/B6v19bWorq6GosXL3a7z4qKiv4HFpharZZsfilnByi/0Ch/YOD8PpwT\nJ044r1LLzs5GWloaAKC8vBwqlarHfTPffvstDAYD5s6d22MbL7zwAqKioiCXyzF48GAsWbKEy8iE\nEEI44Fc3fhJCCBEvmouBEEIIL6jgEEII4QUVHEIIIbxQrFmzZo3QIVyprKzEu+++i/379yMmJgax\nsbFer9vb6zU1NXjzzTeh1+sxduxY0R8Hn3n74s2xiCVzf/J4c5xccpXZ22x8H4s3mcVyLL7IzPex\ncJmZ02NhRcjhcLCvvPIKa7VaWavVyv72t79lGYbxeN2+XmdZlj1x4gT7z3/+k928ebPoj4PPvH3x\n5lhYVhyZb+ZpHm+Pk0u3ZvY2mxDH4klmsR3LnWQW6li4yMzHsYiyS82bKXFcravX63t9HQDS0tKg\n0WgkcRx85u2Lt9MUiSHzzTzN09/pmLhwa2ZvswlxLJ5k7u3/T6GO5U4yC3UsXGTm41hEOTGZN1Pi\n9Lbujb97M62Or/niOPjM2xex5/MVMR+nt9nEcCy++v+Tz2PxNrMYjsVXmbk+FlG2cLyZEqe3dfsz\nrY4Yj0MsxJ7PV8R8nN5mE8Ox+Or/Tz6PxdvMYjgWX2Xm+lhE2cLxZkqc3tZlGKbPbbA83O/qi+O4\ngY+8ffF2miJA+My38iRPf46TSzdn9jabUMfiSWZ3/3/eiutjuZPMQh0LF5m5PhbRzjTgzZQ4va3b\n2+t///vfcfz4cbS2tiI5Odn5vB2xHgefefvizbGIJbO7PN78O4ghc1/ZxHAs3mQWy7H4IjPfx8Jl\nZi6PRbQFhxBCiH8R5RgOIYQQ/0MFhxBCCC+o4BBCCOEFFRxCCCG8oIJDCCGEF1RwCCGE8EKUN34S\n4i/a29uxceNG1NfXIyQkBGFhYfj3f/93yGQyoaMRwjsqOIRw6KOPPsL48ePx0EMPAQDMZjMVGxKw\nqEuNEI50dHTg7NmzzmIDdE9uSEigooJDCEcaGhoEe3gbIWJEBYcQQggvqOAQwpHY2FjU19eLbtZs\nQoRCBYcQjoSFhWH06NH4+uuvna/V19cLmIgQYdFs0YRwyGQyYePGjdDr9VAqldBqtXjhhRfo4gES\nkKjgEEII4QV1qRFCCOEFFRxCCCG8oIJDCCGEF1RwCCGE8IIKDiGEEF5QwSGEEMILKjiEEEJ4QQWH\nEEIIL/4/AqL60pT/poMAAAAASUVORK5CYII=\n", 310 | "text/plain": [ 311 | "" 312 | ] 313 | }, 314 | "metadata": {}, 315 | "output_type": "display_data" 316 | } 317 | ], 318 | "source": [ 319 | "plt.plot([c.mean_validation_score for c in estimator.grid_scores_], label=\"validation error\")\n", 320 | "plt.xticks(np.arange(len(tuned_parameters)), tuned_parameters); plt.xlabel(\"C\"); plt.ylabel(\"Accuracy\");plt.legend(loc='best');" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 10, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "mean: 0.83410, std: 0.00447, params: {'svm__C': 0.001}\n", 335 | "mean: 0.83868, std: 0.00546, params: {'svm__C': 0.01}\n", 336 | "mean: 0.84051, std: 0.00431, params: {'svm__C': 0.10000000000000001}\n", 337 | "mean: 0.84235, std: 0.00448, params: {'svm__C': 1.0}\n", 338 | "mean: 0.83960, std: 0.00714, params: {'svm__C': 10.0}\n", 339 | "mean: 0.78277, std: 0.04901, params: {'svm__C': 100.0}\n", 340 | "mean: 0.72411, std: 0.04093, params: {'svm__C': 1000.0}\n", 341 | "mean: 0.79652, std: 0.02038, params: {'svm__C': 10000.0}\n" 342 | ] 343 | } 344 | ], 345 | "source": [] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "
MAKE PREDICTIONS ON TEST SET
" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 45, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "accuracy of best SVM = 0.901639344262\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "predictions = estimator.best_estimator_.predict(X_test)\n", 371 | "#linear SVM that performed the best above\n", 372 | "#print(predictions)\n", 373 | "#how accurate was this?\n", 374 | "#do this tomorrow\n", 375 | "acc = float(sum(np.equal(predictions , y_test)))/len(predictions)\n", 376 | "print('accuracy of best SVM = %s' % acc)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "metadata": { 383 | "collapsed": true 384 | }, 385 | "outputs": [], 386 | "source": [] 387 | } 388 | ], 389 | "metadata": { 390 | "kernelspec": { 391 | "display_name": "Python 2", 392 | "language": "python", 393 | "name": "python2" 394 | }, 395 | "language_info": { 396 | "codemirror_mode": { 397 | "name": "ipython", 398 | "version": 2 399 | }, 400 | "file_extension": ".py", 401 | "mimetype": "text/x-python", 402 | "name": "python", 403 | "nbconvert_exporter": "python", 404 | "pygments_lexer": "ipython2", 405 | "version": "2.7.10" 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 0 410 | } 411 | -------------------------------------------------------------------------------- /digit_recoginition/digit_recog_classifier_test_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pylab as pl 3 | 4 | from sklearn import svm, metrics, preprocessing 5 | 6 | import csv 7 | 8 | import time 9 | start_time = time.time() 10 | 11 | from numpy import genfromtxt 12 | my_data = genfromtxt('train.csv', delimiter=',') 13 | 14 | print time.time() - start_time, "seconds" #took ~41 seconds 15 | 16 | 17 | 18 | start_time = time.time() 19 | 20 | images_train = my_data[1:,1:] 21 | images_train = preprocessing.scale(imagestot) 22 | targets_train = my_data[1:,0] 23 | 24 | classifier = svm.SVC(kernel = 'poly', C = 100, gamma = 0.001, degree = 3) 25 | 26 | # We learn the digits 27 | classifier.fit(images_train, targets_train) 28 | 29 | print time.time() - start_time, "seconds" 30 | 31 | 32 | 33 | my_test_data = genfromtxt('test.csv', delimiter=',') 34 | test = my_test_data[1:,] 35 | test = preprocessing.scale(test) 36 | predicted = classifier.predict(test) 37 | 38 | length = len(predicted) 39 | 40 | 41 | with open('pred_test.csv', 'wb') as csvfile: 42 | csv_writer = csv.writer(csvfile) 43 | csv_writer.writerow(['ImageId','Label']) 44 | for y in range(length): 45 | csv_writer.writerow([y+1,int(predicted[y])]) 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /digit_recoginition/digit_recog_grid_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ================================================================================ 4 | Digit recognition: Support vector machine parameter estimation using grid search 5 | ================================================================================ 6 | 7 | Here I implemented a cross-validation algorithm. I used scikit learn's 8 | `sklearn.grid_search.GridSearchCV` to train each classifier on half the 9 | labeled data and used the other half as the cross-validation set to test 10 | the performance of the classifier. 
11 |
12 | The classifiers I tested were all support vector machines (SVMs): Gaussian,
13 | linear, and polynomial (degrees 2, 3 and 4) over a range of parameters.
14 |
15 | I tested these classifiers for precision, that is, the positive predictive
16 | value: the proportion of positive predictions that are actually correct.
17 | """
18 |
19 | from __future__ import print_function
20 |
21 | from sklearn import datasets
22 | from sklearn.cross_validation import train_test_split
23 | from sklearn.grid_search import GridSearchCV
24 | from sklearn.metrics import classification_report
25 | from sklearn.svm import SVC
26 | from sklearn import svm, metrics, preprocessing
27 | import csv
28 | import time
29 |
30 | print(__doc__)
31 |
32 | # Loading the Digits dataset
33 |
34 |
35 | ###
36 | from numpy import genfromtxt
37 |
38 |
39 | my_data = genfromtxt('train.csv', delimiter=',')
40 |
41 |
42 | x_train = my_data[1:,1:]
43 | x_train = preprocessing.scale(x_train)
44 | t_train = my_data[1:,0]
45 |
46 |
47 |
48 | start_time = time.time()
49 | # Split the dataset in two equal parts
50 | x_train, x_cv, t_train, t_cv = train_test_split(
51 |     x_train, t_train, test_size=0.5, random_state=0)
52 |
53 | # Set the parameters by cross-validation
54 | tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
55 |                      'C': [1, 10]},
56 |                     {'kernel': ['linear'], 'C': [1, 10]}, {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [2]},
57 |                     {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [3]},
58 |                     {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [4]}]
59 |
60 |
61 | scores = ['precision'] # you can alter this by adding, for example, 'recall'
62 |
63 | for score in scores:
64 |     print("# Tuning hyper-parameters for %s" % score)
65 |     print()
66 |
67 |     clf = GridSearchCV(SVC(C=1), tuned_parameters, scoring=score)  # tune for the metric named above
68 |     clf.fit(x_train, t_train)
69 |
70 |     print("Best parameters set found on development set:")
71 |     print()
72 |     print(clf.best_estimator_)
73 |     print()
74 |     print("Grid scores on development set:")
75 |     print()
76 |     for params, mean_score, cv_scores in clf.grid_scores_:
77 |         print("%0.3f (+/-%0.03f) for %s"
78 |               % (mean_score, cv_scores.std() / 2, params))
79 |     print()
    # Evaluate the tuned classifier on the held-out half, as described in the
    # docstring (classification_report is imported for exactly this purpose).
    print("Classification report on the held-out cross-validation set:")
    print(classification_report(t_cv, clf.predict(x_cv)))
80 |
81 |
82 | print(time.time()- start_time)
83 |
--------------------------------------------------------------------------------
/homesite/Boris_gradient_boost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Homesite"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Using Theano backend.\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import pandas as pd\n",
27 | "import numpy as np\n",
28 | "import copy\n",
29 | "import csv\n",
30 | "from sklearn import linear_model\n",
31 | "import xgboost as xgb\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from keras.models import Sequential\n",
34 | "from keras.layers.core import Dense, Dropout, Activation\n",
35 | "from keras.optimizers import SGD\n",
36 | "from sklearn import svm\n",
37 | "from sklearn.decomposition import PCA\n",
38 | "from sklearn.preprocessing import PolynomialFeatures\n"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
"code", 43 | "execution_count": 2, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "def ReplaceWithDummies(column, *DataFrames):\n", 50 | " #The purpose of this function is to replace a column of type 'object' with n distict values,\n", 51 | " #common to all DataFrames passed in\n", 52 | " #For example train and test data sets, with n-1 boolean columns as delete the original culmnn\n", 53 | " for df in DataFrames: #Make sure the column is actually in all data frames\n", 54 | " if column not in df.columns:\n", 55 | " print('column not found')\n", 56 | " return None\n", 57 | " size=[]\n", 58 | " for df in DataFrames:\n", 59 | " size.append(df.shape[0])\n", 60 | " \n", 61 | " long_column=[]\n", 62 | " for i in range(len(DataFrames)):\n", 63 | " long_column.append(DataFrames[i][column])\n", 64 | " long_column = pd.concat(long_column)\n", 65 | " dummies = pd.get_dummies(long_column)\n", 66 | " dummies.drop(list(dummies.columns)[0], axis=1, inplace=True) # dropping one column from dummies\n", 67 | " \n", 68 | " Dummies =[] # As list of dummies to append to the list of DataFrames in order \n", 69 | " for s in size:\n", 70 | " Dummies.append(dummies[:s])\n", 71 | " dummies=dummies[s:]\n", 72 | " \n", 73 | " #drop the column that needs replacing\n", 74 | " for df in DataFrames:\n", 75 | " df.drop(column, axis=1, inplace=True)\n", 76 | "\n", 77 | " \n", 78 | " #Now append the dummy variables:\n", 79 | "\n", 80 | " for i,df in enumerate(DataFrames):\n", 81 | " for column_type in Dummies[i]: \n", 82 | " new_name=str(column) +'_'+ str(column_type)\n", 83 | " df[new_name]=Dummies[i][column_type]\n", 84 | " return DataFrames\n", 85 | " \n", 86 | " " 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 23, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "tr=pd.read_csv('train.csv')\n", 98 | "te=pd.read_csv('test.csv')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 24, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "#Run for local testing\n", 110 | "n= len(tr)\n", 111 | "n = int(n*(float(2)/float(3)))\n", 112 | "train = copy.deepcopy(tr[:n])\n", 113 | "m=int((len(tr)-n)/2)\n", 114 | "validation = copy.deepcopy(tr[n:n+m])\n", 115 | "test = copy.deepcopy(tr[n+m:])\n", 116 | "\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 36, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "#Use this for the real thing\n", 128 | "train = tr[:]\n", 129 | "validation = te[:]\n", 130 | "test=te[:]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 37, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stderr", 142 | "output_type": "stream", 143 | "text": [ 144 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n", 145 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 146 | "\n", 147 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 148 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n", 149 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 150 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 151 | "\n", 152 | "See the caveats in the 
152 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
153 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: \n",
154 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
155 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
156 | "\n",
157 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
158 | "/Users/blerner/anaconda/lib/python3.4/site-packages/pandas/core/generic.py:2862: SettingWithCopyWarning: \n",
159 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
160 | "\n",
161 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
162 | " self._update_inplace(new_data)\n",
163 | "/Users/blerner/anaconda/lib/python3.4/site-packages/pandas/core/generic.py:3117: SettingWithCopyWarning: \n",
164 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
165 | "\n",
166 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
167 | " self._update_inplace(new_data)\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "#Converts the date to an int. Seems to work better than previous attempts at using categorical variables.\n",
173 | "\n",
174 | "ALL = [train, validation, test]\n",
175 | "for frame in ALL:\n",
176 | "    frame.drop('QuoteNumber', axis=1, inplace=True)\n",
177 | "    frame['Original_Quote_Date']= pd.to_datetime(frame['Original_Quote_Date'])\n",
178 | "    frame['Original_Quote_Date'] = frame['Original_Quote_Date'].astype(int)\n",
179 | "    for c in frame:\n",
180 | "        frame[c].fillna(0, inplace=True)\n",
181 | "        frame[c].replace(-1, 0, inplace=True)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 43,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "173836"
195 | ]
196 | },
197 | "execution_count": 43,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "len(validation)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 40,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stderr",
215 | "output_type": "stream",
216 | "text": [
217 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n",
218 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
219 | "\n",
220 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "#dropping useless columns\n",
226 | "for c in train.drop('QuoteConversion_Flag', axis=1):\n",
227 | "    x=train[c].unique()\n",
228 | "    if len(x) < 2:\n",
229 | "        for frame in ALL:\n",
230 | "            frame.drop(c, axis=1, inplace=True)\n",
231 | "#rescaling\n",
232 | "for c in train.drop('QuoteConversion_Flag', axis=1):\n",
233 | "    if train[c].dtype != 'object':\n",
234 | "        mean=train[c].mean()\n",
235 | "        std = train[c].std()\n",
236 | "        if std > 0.0001:\n",
237 | "            for frame in ALL:\n",
238 | "                frame[c] = (frame[c]-mean)/std  #assign to the column itself, not to the loop variable"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 42,
244 | "metadata": {
245 | "collapsed": false
246 | },
247 | "outputs": [
248 | {
249 | {
"name": "stderr", 250 | "output_type": "stream", 251 | "text": [ 252 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:27: SettingWithCopyWarning: \n", 253 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 254 | "\n", 255 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 256 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:35: SettingWithCopyWarning: \n", 257 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 258 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 259 | "\n", 260 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "#Replacing all categorical variables with dummy variables\n", 266 | "for column in train:\n", 267 | " if train[column].dtype == 'object':\n", 268 | " [train, validation, test] = ReplaceWithDummies(column, train, validation, test)\n", 269 | " " 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "#Run this for partial\n", 281 | "\n", 282 | "X_train=train.drop('QuoteConversion_Flag', axis=1)\n", 283 | "X_test=test.drop('QuoteConversion_Flag', axis=1)\n", 284 | "X_validation=test.drop('QuoteConversion_Flag', axis=1)\n", 285 | "y_train=train['QuoteConversion_Flag']\n", 286 | "y_test=test['QuoteConversion_Flag']\n", 287 | "y_validation=validation['QuoteConversion_Flag']" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 44, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "#Run this for full\n", 299 | "X_train=train.drop('QuoteConversion_Flag', axis=1)\n", 300 | "y_train=train['QuoteConversion_Flag']\n", 301 | "X_validation= validation" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 49, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "173836" 315 | ] 316 | }, 317 | "execution_count": 49, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "len(X_validation)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "#Support Vector machine\n", 335 | "model_svc = svm.SVC()\n", 336 | "model_svc.fit(X_train, y_train)\n", 337 | "print(sum(y_train))\n", 338 | "print(1-sum(abs(np.array(y_train)-np.array(model_svc.predict(X_train))))/float(len(y_train)))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 48, 344 | "metadata": { 345 | "collapsed": false 346 | }, 347 | "outputs": [ 348 | { 349 | "data": { 350 | "text/plain": [ 351 | "(260753, 601)" 352 | ] 353 | }, 354 | "execution_count": 48, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "X_train.shape\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 19, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "#Poly\n", 372 | "pca = PCA(n_components=550) #Instantiate the model & set parameters\n", 373 | "pca.fit(X_train); #Fit the model\n", 374 | "X_train_red = pca.transform(X_train)\n", 
375 | "X_validation_red = pca.transform(X_validation)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 11, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "#introduce interaction terms\n", 387 | "poly = PolynomialFeatures()\n", 388 | "poly.fit(X_train_red)\n", 389 | "X_train_poly=poly.transform(X_train_red)\n", 390 | "X_validation_poly = poly.transform(X_validation_red)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 12, 396 | "metadata": { 397 | "collapsed": false 398 | }, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "(260753, 1326)" 404 | ] 405 | }, 406 | "execution_count": 12, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "X_train_poly.shape" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": { 419 | "collapsed": false 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "X_train_red.shape" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "#Logistioc regression\n", 435 | "model = linear_model.LogisticRegression(C=0.1)\n", 436 | "model.fit(X_train, y_train)\n", 437 | "predictions=model.predict_proba(X_validation)[:,1]\n", 438 | "#print(sum(y_train))\n", 439 | "#print(1-sum(abs(np.array(y_train)-np.array(model.predict(X_train))))/float(len(y_train)))\n", 440 | "#p1=model.predict(validation)\n", 441 | "#print(sum(y_validation))\n", 442 | "#print(1-sum(abs(np.array(y_validation)-np.array(model.predict(X_validation))))/float(len(y_validation)))\n" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 50, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "#Gradient boosting \n", 454 | "model_xgb = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n", 455 | "bst = xgb.train({'objective':'reg:logistic'},dtrain=model_xgb)\n", 456 | "predictions = bst.predict(xgb.DMatrix(X_validation))\n", 457 | "#predictions = bst.predict(xgb.DMatrix(X_train))\n", 458 | "#predictions_binary = []\n", 459 | "#for x in list(predictions):\n", 460 | "# if x>=0.5:\n", 461 | "# predictions_binary.append(1)\n", 462 | "# else:\n", 463 | "# predictions_binary.append(0)\n", 464 | "\n", 465 | "#print(sum(y_train))\n", 466 | "#print(1-sum(abs(np.array(y_train)-np.array(predictions_binary)))/float(len(y_train)))" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 51, 472 | "metadata": { 473 | "collapsed": false 474 | }, 475 | "outputs": [ 476 | { 477 | "data": { 478 | "text/plain": [ 479 | "173836" 480 | ] 481 | }, 482 | "execution_count": 51, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "len(predictions)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": false 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "predictions = bst.predict(xgb.DMatrix(X_validation))\n", 500 | "#predictions_binary = []\n", 501 | "#for x in list(predictions):\n", 502 | "# if x>=0.5:\n", 503 | "# predictions_binary.append(1)\n", 504 | "# else:\n", 505 | "# predictions_binary.append(0)\n", 506 | "#print(sum(y_train))\n", 507 | "#print(1-sum(abs(np.array(y_validation)-np.array(predictions_binary)))/float(len(y_validation)))\n", 508 | 
"#p2=predictions_binary" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "len(p1)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "collapsed": false 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "#Let's try random forest\n", 531 | "\n", 532 | "rfc_model = RandomForestClassifier(n_estimators = 10, n_jobs=-1)\n", 533 | "rfc_model.fit(X_train,y_train)\n", 534 | "print(sum(y_train))\n", 535 | "print(1-sum(abs(np.array(y_train)-np.array(rfc_model.predict(X_train))))/float(len(y_train)))\n", 536 | "#predict2=rfc_model.predict(validation)\n", 537 | "#print(sum(y_validation))\n", 538 | "#print(1-sum(abs(np.array(y_validation)-np.array(rfc_model.predict(X_validation))))/float(len(y_validation)))\n", 539 | "p3=rfc_model.predict(X_validation)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": { 546 | "collapsed": false 547 | }, 548 | "outputs": [], 549 | "source": [ 550 | "#Not run for full\n", 551 | "ensemble_train =list(np.logical_or(np.array(p1),np.array(p2), np.array(p3)))\n", 552 | "ensemble_train = int(ensemble_train==1)\n", 553 | "print(1-sum(abs(np.array(y_train)-np.array(ensemble_train)))/float(len(y_train)))\n", 554 | "\n", 555 | "ensemble_validation =list(np.logical_or(np.array(model.predict(X_validation)),np.array(rfc_model.predict(X_validation))))\n", 556 | "ensemble_validation = int(ensemble_validation==1)\n", 557 | "print(1-sum(abs(np.array(y_validation)-np.array(ensemble_validation)))/float(len(y_validation)))" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": false 565 | }, 566 | "outputs": [], 567 | "source": [ 568 | "ensemble_predict =np.logical_or(np.array(p1),np.array(p2), np.array(p3)).astype(int)\n", 569 | "len(ensemble_predict)\n" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 53, 575 | "metadata": { 576 | "collapsed": false 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "#creates the ouput to be submitted\n", 581 | "output =pd.DataFrame()\n", 582 | "output[\"QuoteNumber\"] = te[\"QuoteNumber\"]\n", 583 | "output[\"QuoteConversion_Flag\"] = predictions\n", 584 | "output.to_csv(\"output_boost_newdate.csv\", index=False)\n", 585 | "\n", 586 | "\n" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "#attempt at neural network: not working right now\n", 598 | "\n", 599 | "model = Sequential()\n", 600 | "\n", 601 | "# Dense(64) is a fully-connected layer with 64 hidden units.\n", 602 | "# in the first layer, you must specify the expected input data shape:\n", 603 | "# here, 20-dimensional vectors.\n", 604 | "model.add(Dense(64, input_dim=584, init='uniform'))\n", 605 | "#model.add(Activation('tanh'))\n", 606 | "#model.add(Dropout(0.5))\n", 607 | "#model.add(Dense(64, init='uniform'))\n", 608 | "#model.add(Activation('tanh'))\n", 609 | "#model.add(Dropout(0.5))\n", 610 | "#model.add(Dense(2, init='uniform'))\n", 611 | "#model.add(Activation('softmax'))\n", 612 | "\n", 613 | "#sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)\n", 614 | "model.compile(loss='mean_squared_error', optimizer='sgd')\n", 615 | "\n", 616 | "model.fit(X_train1, y_train1)" 617 | ] 618 | }, 619 | { 620 | "cell_type": 
"code", 621 | "execution_count": null, 622 | "metadata": { 623 | "collapsed": false 624 | }, 625 | "outputs": [], 626 | "source": [] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": { 632 | "collapsed": true 633 | }, 634 | "outputs": [], 635 | "source": [] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": null, 640 | "metadata": { 641 | "collapsed": true 642 | }, 643 | "outputs": [], 644 | "source": [] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": { 650 | "collapsed": true 651 | }, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "outputs": [], 662 | "source": [] 663 | }, 664 | { 665 | "cell_type": "markdown", 666 | "metadata": {}, 667 | "source": [ 668 | "https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark/discussion" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": { 675 | "collapsed": true 676 | }, 677 | "outputs": [], 678 | "source": [] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": { 684 | "collapsed": true 685 | }, 686 | "outputs": [], 687 | "source": [] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": { 693 | "collapsed": true 694 | }, 695 | "outputs": [], 696 | "source": [] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": { 702 | "collapsed": true 703 | }, 704 | "outputs": [], 705 | "source": [] 706 | } 707 | ], 708 | "metadata": { 709 | "kernelspec": { 710 | "display_name": "Python 3", 711 | "language": "python", 712 | "name": "python3" 713 | }, 714 | "language_info": { 715 | "codemirror_mode": { 716 | "name": "ipython", 717 | "version": 3 718 | }, 719 | "file_extension": ".py", 720 | "mimetype": "text/x-python", 721 | "name": "python", 722 | "nbconvert_exporter": "python", 723 | "pygments_lexer": "ipython3", 724 | "version": "3.4.3" 725 | } 726 | }, 727 | "nbformat": 4, 728 | "nbformat_minor": 0 729 | } 730 | -------------------------------------------------------------------------------- /homesite/initial_foray_insurance_grad_boosting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# INITIAL FORAY INTO INSURANCE DATA" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "(260753, 299)\n" 22 | ] 23 | }, 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
QuoteNumberOriginal_Quote_DateQuoteConversion_FlagField6Field7Field8Field9Field10Field11Field12...GeographicField59AGeographicField59BGeographicField60AGeographicField60BGeographicField61AGeographicField61BGeographicField62AGeographicField62BGeographicField63GeographicField64
012013-08-160B230.94030.00069651.0200N...99-18-118-110NCA
122014-04-220F71.00060.00405481.2433N...1010-111-117-120NNJ
242014-08-250F71.00060.00405481.2433N...1518-121-111-18NNJ
362013-04-150J100.97690.00041,1651.2665N...65-110-19-121NTX
482014-01-250E230.94720.00061,4871.3045N...1822-110-111-112NIL
\n", 178 | "

5 rows × 299 columns

\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " QuoteNumber Original_Quote_Date QuoteConversion_Flag Field6 Field7 \\\n", 183 | "0 1 2013-08-16 0 B 23 \n", 184 | "1 2 2014-04-22 0 F 7 \n", 185 | "2 4 2014-08-25 0 F 7 \n", 186 | "3 6 2013-04-15 0 J 10 \n", 187 | "4 8 2014-01-25 0 E 23 \n", 188 | "\n", 189 | " Field8 Field9 Field10 Field11 Field12 ... \\\n", 190 | "0 0.9403 0.0006 965 1.0200 N ... \n", 191 | "1 1.0006 0.0040 548 1.2433 N ... \n", 192 | "2 1.0006 0.0040 548 1.2433 N ... \n", 193 | "3 0.9769 0.0004 1,165 1.2665 N ... \n", 194 | "4 0.9472 0.0006 1,487 1.3045 N ... \n", 195 | "\n", 196 | " GeographicField59A GeographicField59B GeographicField60A \\\n", 197 | "0 9 9 -1 \n", 198 | "1 10 10 -1 \n", 199 | "2 15 18 -1 \n", 200 | "3 6 5 -1 \n", 201 | "4 18 22 -1 \n", 202 | "\n", 203 | " GeographicField60B GeographicField61A GeographicField61B \\\n", 204 | "0 8 -1 18 \n", 205 | "1 11 -1 17 \n", 206 | "2 21 -1 11 \n", 207 | "3 10 -1 9 \n", 208 | "4 10 -1 11 \n", 209 | "\n", 210 | " GeographicField62A GeographicField62B GeographicField63 \\\n", 211 | "0 -1 10 N \n", 212 | "1 -1 20 N \n", 213 | "2 -1 8 N \n", 214 | "3 -1 21 N \n", 215 | "4 -1 12 N \n", 216 | "\n", 217 | " GeographicField64 \n", 218 | "0 CA \n", 219 | "1 NJ \n", 220 | "2 NJ \n", 221 | "3 TX \n", 222 | "4 IL \n", 223 | "\n", 224 | "[5 rows x 299 columns]" 225 | ] 226 | }, 227 | "execution_count": 1, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "#data from this kaggle comp.: https://www.kaggle.com/c/homesite-quote-conversion\n", 234 | "#I NEED TO ADD MORE COMMENTS, I KNOW!\n", 235 | "import numpy as np\n", 236 | "import pandas as pd\n", 237 | "import matplotlib.pyplot as plt\n", 238 | "%matplotlib inline\n", 239 | "pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier\n", 240 | "##check out tutorial here:\n", 241 | "##http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%201%20-%20Reading%20from%20a%20CSV.ipynb\n", 242 | "df_train = pd.read_csv('train.csv')\n", 243 | "print np.shape(df_train)\n", 244 | "df_train.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "##CHOOSE A SUBSET TO WORK WITH INITIALLY\n", 256 | "# df_train = df_train[0:10000]\n", 257 | "df_train.head()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 2, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/html": [ 270 | "
\n", 271 | "\n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | "
QuoteNumberOriginal_Quote_DateField6Field7Field8Field9Field10Field11Field12CoverageField1A...GeographicField59AGeographicField59BGeographicField60AGeographicField60BGeographicField61AGeographicField61BGeographicField62AGeographicField62BGeographicField63GeographicField64
032014-08-12E160.93640.00061,4871.3045N4...11-11-120-125YIL
152013-09-07F110.99190.00385641.1886N8...1010-15-15-121NNJ
272013-03-29F150.89450.00385641.0670N11...1011-120-122-111NNJ
392015-03-21K210.88700.00041,1131.2665Y14...88-113-18-121NTX
4102014-12-10B250.91530.00079351.0200N4...77-13-122-121NCA
\n", 421 | "

5 rows × 298 columns

\n", 422 | "
" 423 | ], 424 | "text/plain": [ 425 | " QuoteNumber Original_Quote_Date Field6 Field7 Field8 Field9 Field10 \\\n", 426 | "0 3 2014-08-12 E 16 0.9364 0.0006 1,487 \n", 427 | "1 5 2013-09-07 F 11 0.9919 0.0038 564 \n", 428 | "2 7 2013-03-29 F 15 0.8945 0.0038 564 \n", 429 | "3 9 2015-03-21 K 21 0.8870 0.0004 1,113 \n", 430 | "4 10 2014-12-10 B 25 0.9153 0.0007 935 \n", 431 | "\n", 432 | " Field11 Field12 CoverageField1A ... GeographicField59A \\\n", 433 | "0 1.3045 N 4 ... 1 \n", 434 | "1 1.1886 N 8 ... 10 \n", 435 | "2 1.0670 N 11 ... 10 \n", 436 | "3 1.2665 Y 14 ... 8 \n", 437 | "4 1.0200 N 4 ... 7 \n", 438 | "\n", 439 | " GeographicField59B GeographicField60A GeographicField60B \\\n", 440 | "0 1 -1 1 \n", 441 | "1 10 -1 5 \n", 442 | "2 11 -1 20 \n", 443 | "3 8 -1 13 \n", 444 | "4 7 -1 3 \n", 445 | "\n", 446 | " GeographicField61A GeographicField61B GeographicField62A \\\n", 447 | "0 -1 20 -1 \n", 448 | "1 -1 5 -1 \n", 449 | "2 -1 22 -1 \n", 450 | "3 -1 8 -1 \n", 451 | "4 -1 22 -1 \n", 452 | "\n", 453 | " GeographicField62B GeographicField63 GeographicField64 \n", 454 | "0 25 Y IL \n", 455 | "1 21 N NJ \n", 456 | "2 11 N NJ \n", 457 | "3 21 N TX \n", 458 | "4 21 N CA \n", 459 | "\n", 460 | "[5 rows x 298 columns]" 461 | ] 462 | }, 463 | "execution_count": 2, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "X_test = pd.read_csv('test.csv')\n", 470 | "# X_test = X_test[0:5000]\n", 471 | "X_test.head()" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "

WE FIRST DEAL WITH MISSING VALUES

" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 3, 484 | "metadata": { 485 | "collapsed": false 486 | }, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "(434589, 1489)" 492 | ] 493 | }, 494 | "execution_count": 3, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "from sklearn.decomposition import PCA #import principal component analysis\n", 501 | "from sklearn.preprocessing import Imputer\n", 502 | "from sklearn.preprocessing import scale\n", 503 | "df_train_nt = df_train.drop('QuoteConversion_Flag', 1)\n", 504 | "from sklearn.decomposition import PCA #import principal component analysis\n", 505 | "from sklearn.preprocessing import Imputer\n", 506 | "from sklearn.preprocessing import scale\n", 507 | "# df_train_nt = df_train.drop('QuoteConversion_Flag', 1)\n", 508 | "frames = [df_train_nt , X_test]\n", 509 | "X = pd.concat( frames )\n", 510 | "X_hot = pd.get_dummies( X )\n", 511 | "imp = Imputer(missing_values='NaN', strategy='mean', axis=0)\n", 512 | "imp.fit( X_hot )\n", 513 | "X_hot_imp = imp.transform( X_hot )\n", 514 | "df_all = scale(X_hot_imp) #scaled data\n", 515 | "np.shape(df_all)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 31, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "(260753,)\n" 530 | ] 531 | } 532 | ], 533 | "source": [ 534 | "# # X_train = df_all[0:10000]\n", 535 | "# X_train = df_all[0:260753]\n", 536 | "# # X_test = df_all[10000:]\n", 537 | "X_test = df_all[260753:]\n", 538 | "# print np.shape(X_train)\n", 539 | "# print np.shape(X_test)\n", 540 | "# # y_train = df_train['QuoteConversion_Flag'][0:10000]\n", 541 | "# y_train = df_train['QuoteConversion_Flag'][0:260753]\n", 542 | "X_train = df_all[0:260753]\n", 543 | "y_train = df_train['QuoteConversion_Flag'][0:260753]\n", 544 | "print np.shape(y_train)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": { 550 | "collapsed": true 551 | }, 552 | "source": [ 553 | "# LET'S TRY SOME GRADIENT BOOSTING!" 
554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": { 560 | "collapsed": false 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "# import xgboost as xgb\n", 565 | "\n", 566 | "# #also see here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md\n", 567 | "# # hacking this: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py\n", 568 | "# dtrain = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n", 569 | "# param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'reg:logistic'}\n", 570 | "# num_round = 2\n", 571 | "# print ('running cross validation')\n", 572 | "# bst_cv = xgb.cv(param, dtrain, num_round, nfold=5,\n", 573 | "# metrics={'error'}, seed = 0)\n", 574 | "\n", 575 | "# print 'done'\n", 576 | "\n", 577 | "#you actually want to use GridSearchCV, I think:\n", 578 | "#https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": { 585 | "collapsed": false 586 | }, 587 | "outputs": [], 588 | "source": [ 589 | "# type(bst_cv)\n", 590 | "# print bst_cv\n", 591 | "# # predictions = bst_cv.predict(xgb.DMatrix(X_test))" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 38, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "#Gradient boosting a la BL\n", 603 | "import xgboost as xgb\n", 604 | "model_xgb = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n", 605 | "bst = xgb.train({'max_depth':10 , 'n_estimators': 50 , 'objective':'reg:logistic'},dtrain=model_xgb) #NB: 'n_estimators' is the sklearn-wrapper name; xgb.train() takes num_boost_round instead, so it is most likely ignored here\n", 606 | "predictions = bst.predict(xgb.DMatrix(X_test))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 33, 612 | "metadata": { 613 | "collapsed": false 614 | }, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "(173836,)\n", 621 | "[ 0.04766377 0.1459558 0.12043708 ..., 0.45562789 0.04766377\n", 622 | " 0.2594822 ]\n" 623 | ] 624 | } 625 | ], 626 | "source": [ 627 | "print np.shape(predictions)\n", 628 | "print predictions" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 39, 634 | "metadata": { 635 | "collapsed": false 636 | }, 637 | "outputs": [], 638 | "source": [ 639 | "X_test_q = pd.read_csv('test.csv')\n", 640 | "output =pd.DataFrame()\n", 641 | "output[\"QuoteNumber\"] = X_test_q[\"QuoteNumber\"]\n", 642 | "output[\"QuoteConversion_Flag\"] = predictions\n", 643 | "output.to_csv(\"pred_test_6.csv\", index=False)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 20, 649 | "metadata": { 650 | "collapsed": false 651 | }, 652 | "outputs": [ 653 | { 654 | "name": "stderr", 655 | "output_type": "stream", 656 | "text": [ 657 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", 658 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n", 659 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/grid_search.py:43: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. 
This module will be removed in 0.20.\n", 660 | " DeprecationWarning)\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "from sklearn.cross_validation import KFold, train_test_split\n", 666 | "from sklearn.metrics import confusion_matrix, mean_squared_error\n", 667 | "from sklearn.grid_search import GridSearchCV" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 27, 673 | "metadata": { 674 | "collapsed": false 675 | }, 676 | "outputs": [ 677 | { 678 | "name": "stdout", 679 | "output_type": "stream", 680 | "text": [ 681 | "0.100004890254\n", 682 | "0.0794442980374\n", 683 | "Parameter optimization\n" 684 | ] 685 | } 686 | ], 687 | "source": [ 688 | "#taking lead from here: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py\n", 689 | "rng = np.random.RandomState(31337)\n", 690 | "kf = KFold(y_train.shape[0], n_folds=2, shuffle=True, random_state=rng)\n", 691 | "for train_index, test_index in kf:\n", 692 | " model_xgb = xgb.DMatrix(np.array(X_train[train_index]), label=np.array(y_train[train_index]))\n", 693 | " bst = xgb.train({'max_depth':6 , 'n_estimators': 150 , 'objective':'reg:logistic'},dtrain=model_xgb)\n", 694 | " predictions = bst.predict(xgb.DMatrix(X_train[test_index]))\n", 695 | " actuals = y_train[test_index]\n", 696 | " print(mean_squared_error(actuals, predictions))\n", 697 | "\n", 698 | "print(\"Parameter optimization\")" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 37, 704 | "metadata": { 705 | "collapsed": false 706 | }, 707 | "outputs": [ 708 | { 709 | "name": "stdout", 710 | "output_type": "stream", 711 | "text": [ 712 | "Fitting 3 folds for each of 15 candidates, totalling 45 fits\n", 713 | "0.644338929872\n", 714 | "{'n_estimators': 50, 'max_depth': 10}\n" 715 | ] 716 | }, 717 | { 718 | "name": "stderr", 719 | "output_type": "stream", 720 | "text": [ 721 | "[Parallel(n_jobs=1)]: Done 45 out of 45 | elapsed: 897.3min finished\n" 722 | ] 723 | } 724 | ], 725 | "source": [ 726 | "xgb_model = xgb.XGBRegressor()\n", 727 | "clf = GridSearchCV(xgb_model,\n", 728 | " {'max_depth': [2,4,6,8,10],\n", 729 | " 'n_estimators': [50,100,200]}, verbose=1)\n", 730 | "clf.fit(X_train,y_train)\n", 731 | "print(clf.best_score_)\n", 732 | "print(clf.best_params_)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": { 739 | "collapsed": true 740 | }, 741 | "outputs": [], 742 | "source": [] 743 | } 744 | ], 745 | "metadata": { 746 | "kernelspec": { 747 | "display_name": "Python 2", 748 | "language": "python", 749 | "name": "python2" 750 | }, 751 | "language_info": { 752 | "codemirror_mode": { 753 | "name": "ipython", 754 | "version": 2 755 | }, 756 | "file_extension": ".py", 757 | "mimetype": "text/x-python", 758 | "name": "python", 759 | "nbconvert_exporter": "python", 760 | "pygments_lexer": "ipython2", 761 | "version": "2.7.10" 762 | } 763 | }, 764 | "nbformat": 4, 765 | "nbformat_minor": 0 766 | } 767 | -------------------------------------------------------------------------------- /paribas/README.md: -------------------------------------------------------------------------------- 1 | for this kaggle comp.: https://www.kaggle.com/c/bnp-paribas-cardif-claims-management -------------------------------------------------------------------------------- /paribas/boosting_in_barbados.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | 
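# (A note on the GridSearchCV run above, hedged: with no 'scoring' argument, sklearn falls
# back to the estimator's default scorer, which for xgb.XGBRegressor is R^2 -- so the 0.644
# printed there is an R^2, not an accuracy or AUC. A sketch that pins the metric down:
#
#   clf = GridSearchCV(xgb_model,
#                      {'max_depth': [2, 4, 6, 8, 10], 'n_estimators': [50, 100, 200]},
#                      scoring='log_loss', verbose=1)
#
# 'log_loss' being the same scoring string used with RandomizedSearchCV further below.)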
"metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.\n", 12 | "#Import libraries:\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import xgboost as xgb\n", 16 | "import time\n", 17 | "#load data:\n", 18 | "train = pd.read_csv(\"train.csv\")\n", 19 | "target = train['target']\n", 20 | "#drop targets & (unique row) IDs from training data\n", 21 | "train = train.drop(['ID','target'],axis=1)\n", 22 | "test = pd.read_csv(\"test.csv\")\n", 23 | "IDs = test['ID'].values\n", 24 | "test = test.drop(['ID'],axis=1)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# PREPROCESSING" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "#impute both numerical & categorical features a la\n", 43 | "#http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn\n", 44 | "\n", 45 | "from sklearn.base import TransformerMixin\n", 46 | "\n", 47 | "class DataFrameImputer(TransformerMixin):\n", 48 | "\n", 49 | " def __init__(self):\n", 50 | " \"\"\"Impute missing values.\n", 51 | "\n", 52 | " Columns of dtype object are imputed with the most frequent value \n", 53 | " in column.\n", 54 | "\n", 55 | " Columns of other types are imputed with mean of column.\n", 56 | "\n", 57 | " \"\"\"\n", 58 | " def fit(self, X, y=None):\n", 59 | "\n", 60 | " self.fill = pd.Series([X[c].value_counts().index[0]\n", 61 | " if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],\n", 62 | " index=X.columns)\n", 63 | "\n", 64 | " return self\n", 65 | "\n", 66 | " def transform(self, X, y=None):\n", 67 | " return X.fillna(self.fill)\n", 68 | " \n", 69 | "xtrain = DataFrameImputer().fit_transform( train )\n", 70 | "xtest = DataFrameImputer().fit_transform( test )" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "#factorize categorical columns:\n", 82 | "for column in xtrain:\n", 83 | " if xtrain[column].dtype == 'O':\n", 84 | "# print pd.factorize(xtrain[column])\n", 85 | " xtrain[column] = pd.factorize(xtrain[column])[0]\n", 86 | " \n", 87 | "for column in xtest:\n", 88 | " if xtest[column].dtype == 'O':\n", 89 | "# print pd.factorize(xtrain[column])\n", 90 | " xtest[column] = pd.factorize(xtest[column])[0]" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "Next up: scaling/transforms/get_dummies/dimensionality reduction" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# GRADIENT BOOSTING & CROSS VALIDATION" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 8, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", 119 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "#check this out: http://xgboost.readthedocs.org/en/latest/model.html\n", 125 | "from sklearn.cross_validation import KFold, train_test_split\n", 126 | "X = xtrain.values\n", 127 | "y = target.values\n", 128 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1 , random_state=0)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 9, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stderr", 140 | "output_type": "stream", 141 | "text": [ 142 | "Will train until validation_0 error hasn't decreased in 50 rounds.\n", 143 | "[0]\tvalidation_0-logloss:0.660512\n", 144 | "[1]\tvalidation_0-logloss:0.633833\n", 145 | "[2]\tvalidation_0-logloss:0.611452\n", 146 | "[3]\tvalidation_0-logloss:0.592876\n", 147 | "[4]\tvalidation_0-logloss:0.577408\n", 148 | "[5]\tvalidation_0-logloss:0.564382\n", 149 | "[6]\tvalidation_0-logloss:0.553405\n", 150 | "[7]\tvalidation_0-logloss:0.543860\n", 151 | "[8]\tvalidation_0-logloss:0.536023\n", 152 | "[9]\tvalidation_0-logloss:0.528992\n", 153 | "[10]\tvalidation_0-logloss:0.523194\n", 154 | "[11]\tvalidation_0-logloss:0.518142\n", 155 | "[12]\tvalidation_0-logloss:0.514029\n", 156 | "[13]\tvalidation_0-logloss:0.510511\n", 157 | "[14]\tvalidation_0-logloss:0.507166\n", 158 | "[15]\tvalidation_0-logloss:0.504451\n", 159 | "[16]\tvalidation_0-logloss:0.502286\n", 160 | "[17]\tvalidation_0-logloss:0.500022\n", 161 | "[18]\tvalidation_0-logloss:0.498165\n", 162 | "[19]\tvalidation_0-logloss:0.496556\n", 163 | "[20]\tvalidation_0-logloss:0.495312\n", 164 | "[21]\tvalidation_0-logloss:0.493927\n", 165 | "[22]\tvalidation_0-logloss:0.492970\n", 166 | "[23]\tvalidation_0-logloss:0.491965\n", 167 | "[24]\tvalidation_0-logloss:0.491137\n", 168 | "[25]\tvalidation_0-logloss:0.490338\n", 169 | "[26]\tvalidation_0-logloss:0.489665\n", 170 | "[27]\tvalidation_0-logloss:0.489089\n", 171 | "[28]\tvalidation_0-logloss:0.488625\n", 172 | "[29]\tvalidation_0-logloss:0.488127\n", 173 | "[30]\tvalidation_0-logloss:0.487602\n", 174 | "[31]\tvalidation_0-logloss:0.487334\n", 175 | "[32]\tvalidation_0-logloss:0.486997\n", 176 | "[33]\tvalidation_0-logloss:0.486487\n", 177 | "[34]\tvalidation_0-logloss:0.486237\n", 178 | "[35]\tvalidation_0-logloss:0.485890\n", 179 | "[36]\tvalidation_0-logloss:0.485579\n", 180 | "[37]\tvalidation_0-logloss:0.485430\n", 181 | "[38]\tvalidation_0-logloss:0.485202\n", 182 | "[39]\tvalidation_0-logloss:0.484802\n", 183 | "[40]\tvalidation_0-logloss:0.484583\n", 184 | "[41]\tvalidation_0-logloss:0.484348\n", 185 | "[42]\tvalidation_0-logloss:0.484242\n", 186 | "[43]\tvalidation_0-logloss:0.483940\n", 187 | "[44]\tvalidation_0-logloss:0.483843\n", 188 | "[45]\tvalidation_0-logloss:0.483686\n", 189 | "[46]\tvalidation_0-logloss:0.483566\n", 190 | "[47]\tvalidation_0-logloss:0.483306\n", 191 | "[48]\tvalidation_0-logloss:0.482803\n", 192 | "[49]\tvalidation_0-logloss:0.482678\n", 193 | "[50]\tvalidation_0-logloss:0.482645\n", 194 | "[51]\tvalidation_0-logloss:0.482599\n", 195 | "[52]\tvalidation_0-logloss:0.482483\n", 196 | "[53]\tvalidation_0-logloss:0.482222\n", 197 | "[54]\tvalidation_0-logloss:0.482145\n", 198 | "[55]\tvalidation_0-logloss:0.482033\n", 199 | "[56]\tvalidation_0-logloss:0.481891\n", 200 | "[57]\tvalidation_0-logloss:0.481790\n", 201 | "[58]\tvalidation_0-logloss:0.481366\n", 202 | 
"[59]\tvalidation_0-logloss:0.481314\n", 203 | "[60]\tvalidation_0-logloss:0.481271\n", 204 | "[61]\tvalidation_0-logloss:0.481124\n", 205 | "[62]\tvalidation_0-logloss:0.481000\n", 206 | "[63]\tvalidation_0-logloss:0.480625\n", 207 | "[64]\tvalidation_0-logloss:0.480605\n", 208 | "[65]\tvalidation_0-logloss:0.480526\n", 209 | "[66]\tvalidation_0-logloss:0.480480\n", 210 | "[67]\tvalidation_0-logloss:0.480338\n", 211 | "[68]\tvalidation_0-logloss:0.480286\n", 212 | "[69]\tvalidation_0-logloss:0.480209\n", 213 | "[70]\tvalidation_0-logloss:0.480185\n", 214 | "[71]\tvalidation_0-logloss:0.480126\n", 215 | "[72]\tvalidation_0-logloss:0.480102\n", 216 | "[73]\tvalidation_0-logloss:0.479896\n", 217 | "[74]\tvalidation_0-logloss:0.479858\n", 218 | "[75]\tvalidation_0-logloss:0.479808\n", 219 | "[76]\tvalidation_0-logloss:0.479728\n", 220 | "[77]\tvalidation_0-logloss:0.479648\n", 221 | "[78]\tvalidation_0-logloss:0.479611\n", 222 | "[79]\tvalidation_0-logloss:0.479599\n", 223 | "[80]\tvalidation_0-logloss:0.479577\n", 224 | "[81]\tvalidation_0-logloss:0.479537\n", 225 | "[82]\tvalidation_0-logloss:0.479479\n", 226 | "[83]\tvalidation_0-logloss:0.479466\n", 227 | "[84]\tvalidation_0-logloss:0.479452\n", 228 | "[85]\tvalidation_0-logloss:0.479426\n", 229 | "[86]\tvalidation_0-logloss:0.479434\n", 230 | "[87]\tvalidation_0-logloss:0.479411\n", 231 | "[88]\tvalidation_0-logloss:0.479377\n", 232 | "[89]\tvalidation_0-logloss:0.479222\n", 233 | "[90]\tvalidation_0-logloss:0.479141\n", 234 | "[91]\tvalidation_0-logloss:0.479086\n", 235 | "[92]\tvalidation_0-logloss:0.479091\n", 236 | "[93]\tvalidation_0-logloss:0.479068\n", 237 | "[94]\tvalidation_0-logloss:0.479094\n", 238 | "[95]\tvalidation_0-logloss:0.479089\n", 239 | "[96]\tvalidation_0-logloss:0.479038\n", 240 | "[97]\tvalidation_0-logloss:0.479036\n", 241 | "[98]\tvalidation_0-logloss:0.478962\n", 242 | "[99]\tvalidation_0-logloss:0.478884\n" 243 | ] 244 | }, 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n", 249 | " gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n", 250 | " min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n", 251 | " objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n", 252 | " scale_pos_weight=1, seed=0, silent=True, subsample=1)" 253 | ] 254 | }, 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "# Early-stopping\n", 262 | "#http://xgboost.readthedocs.org/en/latest/python/python_intro.html#early-stopping\n", 263 | "#Also see https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py (Jamie Hall et al.)\n", 264 | "clf = xgb.XGBClassifier()\n", 265 | "clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric=\"logloss\",\n", 266 | " eval_set=[(X_test, y_test)])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 14, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "preds = clf.predict_proba(xtest.values, ntree_limit=clf.best_iteration)[:,1]" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 16, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "import csv\n", 289 | "predictions_file = open(\"xgboost_predictions.csv\", \"w\")\n", 290 | "open_file_object = csv.writer(predictions_file)\n", 291 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n", 292 | 
"open_file_object.writerows(zip(IDs, preds))\n", 293 | "predictions_file.close()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "collapsed": true 300 | }, 301 | "source": [ 302 | "This above performed okay: logloss = -0.5252. But I think we need to increase num_rounds and at least try to change preprocessing:" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "# TESTING ANOTHER APPROACH" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "Loading & preprocessing:" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 53, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "Load data...\n", 331 | "Clearing...\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n", 337 | "print('Load data...')\n", 338 | "train = pd.read_csv(\"train.csv\")\n", 339 | "target = train['target']\n", 340 | "train = train.drop(['ID','target'],axis=1)\n", 341 | "test = pd.read_csv(\"test.csv\")\n", 342 | "ids = test['ID'].values\n", 343 | "test = test.drop(['ID'],axis=1)\n", 344 | "#\n", 345 | "print('Clearing...')\n", 346 | "for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):\n", 347 | " if train_series.dtype == 'O':\n", 348 | " #for objects: factorize\n", 349 | " train[train_name], tmp_indexer = pd.factorize(train[train_name])\n", 350 | " test[test_name] = tmp_indexer.get_indexer(test[test_name])\n", 351 | " #but now we have -1 values (NaN)\n", 352 | " else:\n", 353 | " #for int or float: fill NaN\n", 354 | " tmp_len = len(train[train_series.isnull()])\n", 355 | " if tmp_len>0:\n", 356 | " train.loc[train_series.isnull(), train_name] = train_series.mean()\n", 357 | " #and Test\n", 358 | " tmp_len = len(test[test_series.isnull()])\n", 359 | " if tmp_len>0:\n", 360 | " test.loc[test_series.isnull(), test_name] = train_series.mean() #TODO" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "A little function to report best scores (from cross validation):" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 60, 373 | "metadata": { 374 | "collapsed": true 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "from operator import itemgetter\n", 379 | "# Utility function to report best scores\n", 380 | "def report(grid_scores, n_top=3):\n", 381 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n", 382 | " for i, score in enumerate(top_scores):\n", 383 | " print(\"Model with rank: {0}\".format(i + 1))\n", 384 | " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n", 385 | " score.mean_validation_score,\n", 386 | " np.std(score.cv_validation_scores)))\n", 387 | " print(\"Parameters: {0}\".format(score.parameters))\n", 388 | " print(\"\")" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "Now we perform a randomizedsearchCV over a number of parameters (using xgb.XGBClassifier()) --\n", 396 | "I do this because I don't know how to do it with xgb.train() --\n", 397 | "Important question: what is the relation between these two xgb.train() & xgb.XGBClassifier()?\n", 398 | "This is important because I can only do hyperparameter tuning on the latter AND 
I can only alter num_rounds on the former (which is necessary for a good model, it seems). Any thoughts?" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 65, 404 | "metadata": { 405 | "collapsed": false 406 | }, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "5031.24653196\n", 413 | "Model with rank: 1\n", 414 | "Mean validation score: -0.515 (std: 0.000)\n", 415 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.80000000000000004, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 11}\n", 416 | "\n", 417 | "Model with rank: 2\n", 418 | "Mean validation score: -0.516 (std: 0.000)\n", 419 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.40000000000000002, 'learning_rate': 0.01, 'colsample_bytree': 1.0, 'max_depth': 12}\n", 420 | "\n", 421 | "Model with rank: 3\n", 422 | "Mean validation score: -0.516 (std: 0.000)\n", 423 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.5, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 13}\n", 424 | "\n", 425 | "None\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "X = train.values #used by xgb_model_best.fit(X, y) below\n", 431 | "y = target.values\n", 432 | "#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/16627/help-with-xgboost-sklearn-randomized-grid-search\n", 433 | "# -*- coding: utf-8 -*-\n", 434 | "\"\"\"\n", 435 | "\"\"\"\n", 436 | "t0 = time.time()\n", 437 | "#http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html\n", 438 | "#http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html\n", 439 | "param_grid = {'max_depth': range(4,15),\n", 440 | "# 'min_child_weight': [1,40],\n", 441 | " 'objective':['binary:logistic'],\n", 442 | "# 'n_estimators':[5],\n", 443 | " 'learning_rate':[0.01], #this is same as eta\n", 444 | " 'subsample': np.arange(0.1,1.1,0.1),\n", 445 | " 'colsample_bytree': np.arange(0.1,1.1,0.1),\n", 446 | " #'scale_pos_weight': [0.5, 1]\n", 447 | " #'model__eta':[0.01,0.02],\n", 448 | " #'model__scale_pos_weight':[0.8,1.0]\n", 449 | " #'model__silent':[1],\n", 450 | " }\n", 451 | "\n", 452 | "\n", 453 | "from sklearn.grid_search import GridSearchCV, RandomizedSearchCV\n", 454 | "from sklearn import metrics\n", 455 | "\n", 456 | "xgb_model = xgb.XGBClassifier()\n", 457 | "n_iter_search=20\n", 458 | "random_search = RandomizedSearchCV(xgb_model, param_distributions=param_grid,\n", 459 | " n_iter=n_iter_search, scoring =\"log_loss\")\n", 460 | "\n", 461 | "# start = time()\n", 462 | "# training and y_training are \n", 463 | "# small dataset and target variable that I generated from the training dataset\n", 464 | "random_search.fit(train, target) \n", 465 | "t1 = time.time()\n", 466 | "total_time = t1 - t0\n", 467 | "print total_time\n", 468 | "\n", 469 | "print report(random_search.grid_scores_)\n", 470 | "xgb_model_best = xgb.XGBClassifier()\n", 471 | "xgb_model_best.set_params(**random_search.best_params_)\n", 472 | "#http://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python\n", 473 | "xgb_model_best.fit(X , y)\n", 474 | "preds = xgb_model_best.predict_proba(xtest.values)[:,1]\n", 475 | "#also see this! 
https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings/forums/t/18494/gridsearchcv-on-xgboost/105272" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 66, 481 | "metadata": { 482 | "collapsed": true 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "import csv\n", 487 | "predictions_file = open(\"xgb_rgs_larger_predictions.csv\", \"w\")\n", 488 | "open_file_object = csv.writer(predictions_file)\n", 489 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n", 490 | "open_file_object.writerows(zip(IDs, preds))\n", 491 | "predictions_file.close()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "This above performed ok (logloss = -.53791) but not as well as other people's xgbtrain() w/ a large num_rounds. For example, see:\n", 499 | "https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 67, 505 | "metadata": { 506 | "collapsed": true 507 | }, 508 | "outputs": [], 509 | "source": [ 510 | "#https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark\n", 511 | "#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/17089/beating-the-benchmark/96855\n", 512 | "#https://github.com/lenguyenthedat/kaggle-for-fun/blob/master/springleaf-marketing-response/springleaf-xgb-native.py" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "So now I'll try using the best parameters for xgb.XGBClassifier() in xgb.train() AND make num_boost_round = 200." 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "collapsed": true 527 | }, 528 | "outputs": [], 529 | "source": [ 530 | "#cf https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n", 531 | "t0 = time.time()\n", 532 | "xgtrain = xgb.DMatrix(train.values, target.values)\n", 533 | "xgtest = xgb.DMatrix(test.values)\n", 534 | "\n", 535 | "#Now let's fit the model\n", 536 | "print('Fit the model...')\n", 537 | "boost_round = 2000 #1800 CHANGE THIS BEFORE START\n", 538 | "clf = xgb.train(random_search.best_params_,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)\n", 539 | "\n", 540 | "#Make predict\n", 541 | "print('Predict...')\n", 542 | "preds = clf.predict(xgtest, ntree_limit=clf.best_iteration )\n", 543 | "##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py\n", 544 | "t1 = time.time()\n", 545 | "total_time = t1 - t0\n", 546 | "print total_time" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": { 553 | "collapsed": true 554 | }, 555 | "outputs": [], 556 | "source": [ 557 | "import csv\n", 558 | "predictions_file = open(\"xgb_rgs_more_rounds_predictions.csv\", \"w\")\n", 559 | "open_file_object = csv.writer(predictions_file)\n", 560 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n", 561 | "open_file_object.writerows(zip(IDs, preds))\n", 562 | "predictions_file.close()" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | "source": [ 569 | "This performed well: logloss = -0.45991 . 
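(On the question above about xgb.train() vs xgb.XGBClassifier(): as far as I can tell, the sklearn wrapper drives the same underlying Booster, with n_estimators playing the role of num_boost_round -- so a hedged, wrapper-side sketch of this final model would be xgb_model_best = xgb.XGBClassifier(n_estimators=2000, **random_search.best_params_), which should also let RandomizedSearchCV tune n_estimators directly.)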
" 570 | ] 571 | } 572 | ], 573 | "metadata": { 574 | "kernelspec": { 575 | "display_name": "Python 2", 576 | "language": "python", 577 | "name": "python2" 578 | }, 579 | "language_info": { 580 | "codemirror_mode": { 581 | "name": "ipython", 582 | "version": 2 583 | }, 584 | "file_extension": ".py", 585 | "mimetype": "text/x-python", 586 | "name": "python", 587 | "nbconvert_exporter": "python", 588 | "pygments_lexer": "ipython2", 589 | "version": "2.7.11" 590 | } 591 | }, 592 | "nbformat": 4, 593 | "nbformat_minor": 0 594 | } 595 | -------------------------------------------------------------------------------- /paribas/stratified_CV_with_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.\n", 12 | "#Import libraries:\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import xgboost as xgb\n", 16 | "import time" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Stratified CV w/ XGBoost" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Loading & preprocessing:" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "Load data...\n", 45 | "Clearing...\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n", 51 | "print('Load data...')\n", 52 | "train = pd.read_csv(\"train.csv\")\n", 53 | "target = train['target']\n", 54 | "train = train.drop(['ID','target'],axis=1)\n", 55 | "test = pd.read_csv(\"test.csv\")\n", 56 | "ids = test['ID'].values\n", 57 | "test = test.drop(['ID'],axis=1)\n", 58 | "#\n", 59 | "print('Clearing...')\n", 60 | "for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):\n", 61 | " if train_series.dtype == 'O':\n", 62 | " #for objects: factorize\n", 63 | " train[train_name], tmp_indexer = pd.factorize(train[train_name])\n", 64 | " test[test_name] = tmp_indexer.get_indexer(test[test_name])\n", 65 | " #but now we have -1 values (NaN)\n", 66 | " else:\n", 67 | " #for int or float: fill NaN\n", 68 | " tmp_len = len(train[train_series.isnull()])\n", 69 | " if tmp_len>0:\n", 70 | " train.loc[train_series.isnull(), train_name] = train_series.mean()\n", 71 | " #and Test\n", 72 | " tmp_len = len(test[test_series.isnull()])\n", 73 | " if tmp_len>0:\n", 74 | " test.loc[test_series.isnull(), test_name] = train_series.mean() #TODO" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "#https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19083/best-practices-for-parameter-tuning-on-models/\n", 86 | "#https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 48, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "Fit the 
model...\n", 101 | "408.922273874\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "t0 = time.time()\n", 107 | "xgtrain = xgb.DMatrix(train.values, target.values)\n", 108 | "xgtest = xgb.DMatrix(test.values)\n", 109 | "\n", 110 | "params = {'objective': 'binary:logistic', \n", 111 | " 'subsample': 1, \n", 112 | " 'eta': 0.1, \n", 113 | " 'colsample_bytree': 0.9, \n", 114 | " 'max_depth': 10,\n", 115 | " 'min_child_weight' : 5,\n", 116 | " 'silent':1}\n", 117 | "\n", 118 | "#Now let's fit the model\n", 119 | "print('Fit the model...')\n", 120 | "num_round = 50 #1800 CHANGE THIS BEFORE START\n", 121 | "clf = xgb.cv(params,xgtrain,num_boost_round=num_round,metrics={'logloss'}, nfold = 5 ,\n", 122 | " seed = 0 ,maximize=False)\n", 123 | "\n", 124 | "#i have attempted this with argument stratified = 1 and get the following error:\n", 125 | "#TypeError: cv() got an unexpected keyword argument 'stratified'\n", 126 | "\n", 127 | "\n", 128 | "#Make predict\n", 129 | "# print('Predict...')\n", 130 | "##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py\n", 131 | "t1 = time.time()\n", 132 | "total_time = t1 - t0\n", 133 | "print total_time" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 49, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/html": [ 146 | "
\n", 147 | "\n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 
420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | "
test-logloss-meantest-logloss-stdtrain-logloss-meantrain-logloss-std
00.6579330.0005910.6554290.000550
10.6283500.0007390.6230860.000943
20.6038840.0011040.5958020.001063
30.5833700.0010120.5725410.000836
40.5667550.0009240.5531940.000422
50.5526140.0008550.5362830.000151
60.5402110.0008550.5213920.000222
70.5297210.0008800.5084030.000270
80.5207900.0008670.4971020.000514
90.5132120.0009050.4871750.000640
100.5069940.0009590.4785060.000730
110.5014470.0010940.4708610.000729
120.4967890.0012370.4641080.000729
130.4926890.0011840.4579470.000677
140.4891900.0012120.4522370.000786
150.4862480.0012570.4471010.000871
160.4836810.0012390.4424270.000849
170.4814310.0012760.4382610.000909
180.4796270.0013800.4343580.000855
190.4778960.0014530.4305950.000764
200.4765380.0013810.4273510.000817
210.4752840.0014240.4242630.000826
220.4742390.0013950.4214720.000912
230.4732940.0014220.4188480.000894
240.4725170.0014780.4164370.000846
250.4718180.0015030.4141730.000894
260.4712440.0015400.4120180.000970
270.4707180.0015830.4102440.001135
280.4702900.0015980.4084010.001012
290.4699170.0016310.4067920.001054
300.4695910.0016320.4051170.001180
310.4692360.0016560.4033490.000978
320.4689960.0016890.4018590.000776
330.4687920.0016640.4006430.000672
340.4685620.0016430.3992900.000633
350.4683000.0017000.3979690.000679
360.4680790.0016990.3966540.000702
370.4679500.0017060.3955480.000823
380.4677910.0016840.3944530.000800
390.4676160.0016570.3934370.000796
400.4674660.0016500.3921710.000925
410.4673810.0017020.3911020.001146
420.4672530.0017740.3899480.000796
430.4671220.0017330.3890370.000645
440.4670430.0017470.3878920.000712
450.4669650.0017570.3869380.000627
460.4668650.0017870.3858810.000904
470.4668200.0018410.3849700.000891
480.4667240.0018950.3842100.000894
490.4666670.0019110.3835090.000787
\n", 510 | "
" 511 | ], 512 | "text/plain": [ 513 | " test-logloss-mean test-logloss-std train-logloss-mean train-logloss-std\n", 514 | "0 0.657933 0.000591 0.655429 0.000550\n", 515 | "1 0.628350 0.000739 0.623086 0.000943\n", 516 | "2 0.603884 0.001104 0.595802 0.001063\n", 517 | "3 0.583370 0.001012 0.572541 0.000836\n", 518 | "4 0.566755 0.000924 0.553194 0.000422\n", 519 | "5 0.552614 0.000855 0.536283 0.000151\n", 520 | "6 0.540211 0.000855 0.521392 0.000222\n", 521 | "7 0.529721 0.000880 0.508403 0.000270\n", 522 | "8 0.520790 0.000867 0.497102 0.000514\n", 523 | "9 0.513212 0.000905 0.487175 0.000640\n", 524 | "10 0.506994 0.000959 0.478506 0.000730\n", 525 | "11 0.501447 0.001094 0.470861 0.000729\n", 526 | "12 0.496789 0.001237 0.464108 0.000729\n", 527 | "13 0.492689 0.001184 0.457947 0.000677\n", 528 | "14 0.489190 0.001212 0.452237 0.000786\n", 529 | "15 0.486248 0.001257 0.447101 0.000871\n", 530 | "16 0.483681 0.001239 0.442427 0.000849\n", 531 | "17 0.481431 0.001276 0.438261 0.000909\n", 532 | "18 0.479627 0.001380 0.434358 0.000855\n", 533 | "19 0.477896 0.001453 0.430595 0.000764\n", 534 | "20 0.476538 0.001381 0.427351 0.000817\n", 535 | "21 0.475284 0.001424 0.424263 0.000826\n", 536 | "22 0.474239 0.001395 0.421472 0.000912\n", 537 | "23 0.473294 0.001422 0.418848 0.000894\n", 538 | "24 0.472517 0.001478 0.416437 0.000846\n", 539 | "25 0.471818 0.001503 0.414173 0.000894\n", 540 | "26 0.471244 0.001540 0.412018 0.000970\n", 541 | "27 0.470718 0.001583 0.410244 0.001135\n", 542 | "28 0.470290 0.001598 0.408401 0.001012\n", 543 | "29 0.469917 0.001631 0.406792 0.001054\n", 544 | "30 0.469591 0.001632 0.405117 0.001180\n", 545 | "31 0.469236 0.001656 0.403349 0.000978\n", 546 | "32 0.468996 0.001689 0.401859 0.000776\n", 547 | "33 0.468792 0.001664 0.400643 0.000672\n", 548 | "34 0.468562 0.001643 0.399290 0.000633\n", 549 | "35 0.468300 0.001700 0.397969 0.000679\n", 550 | "36 0.468079 0.001699 0.396654 0.000702\n", 551 | "37 0.467950 0.001706 0.395548 0.000823\n", 552 | "38 0.467791 0.001684 0.394453 0.000800\n", 553 | "39 0.467616 0.001657 0.393437 0.000796\n", 554 | "40 0.467466 0.001650 0.392171 0.000925\n", 555 | "41 0.467381 0.001702 0.391102 0.001146\n", 556 | "42 0.467253 0.001774 0.389948 0.000796\n", 557 | "43 0.467122 0.001733 0.389037 0.000645\n", 558 | "44 0.467043 0.001747 0.387892 0.000712\n", 559 | "45 0.466965 0.001757 0.386938 0.000627\n", 560 | "46 0.466865 0.001787 0.385881 0.000904\n", 561 | "47 0.466820 0.001841 0.384970 0.000891\n", 562 | "48 0.466724 0.001895 0.384210 0.000894\n", 563 | "49 0.466667 0.001911 0.383509 0.000787" 564 | ] 565 | }, 566 | "execution_count": 49, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "clf" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "# Some notes on xgb.train() " 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 89, 585 | "metadata": { 586 | "collapsed": false 587 | }, 588 | "outputs": [], 589 | "source": [ 590 | "from sklearn.cross_validation import KFold, train_test_split\n", 591 | "X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.05 ,random_state=0)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 90, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "xgtrains = xgb.DMatrix(X_train.values, y_train.values)\n", 603 | "xgtest = xgb.DMatrix(X_test.values, y_test.values)\n", 604 | "# 
xgtest = xgb.DMatrix(test.values)\n",
605 | "params = {'objective': 'binary:logistic', \n",
606 | "          'subsample': 1, \n",
607 | "          'eta': 0.1, \n",
608 | "          'colsample_bytree': 0.9, \n",
609 | "          'max_depth': 10,\n",
610 | "          'min_child_weight': 5,\n",
611 | "          'silent': 1}"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 91,
617 | "metadata": {
618 | "collapsed": true
619 | },
620 | "outputs": [
621 | {
622 | "name": "stderr",
623 | "output_type": "stream",
624 | "text": [
625 | "Will train until logloss error hasn't decreased in 10 rounds.\n",
626 | "[0]\tlogloss-error:0.235963\n",
627 | "[1]\tlogloss-error:0.232639\n",
628 | "[2]\tlogloss-error:0.232290\n",
629 | "[3]\tlogloss-error:0.231065\n",
630 | "[4]\tlogloss-error:0.229491\n",
631 | "[5]\tlogloss-error:0.230191\n",
632 | "[6]\tlogloss-error:0.227567\n",
633 | "[7]\tlogloss-error:0.226517\n",
634 | "[8]\tlogloss-error:0.225993\n",
635 | "[9]\tlogloss-error:0.226867\n",
636 | "[10]\tlogloss-error:0.226517\n",
637 | "[11]\tlogloss-error:0.227917\n",
638 | "[12]\tlogloss-error:0.226168\n",
639 | "[13]\tlogloss-error:0.227567\n",
640 | "[14]\tlogloss-error:0.226692\n",
641 | "[15]\tlogloss-error:0.227042\n",
642 | "[16]\tlogloss-error:0.227742\n",
643 | "[17]\tlogloss-error:0.227567\n",
644 | "[18]\tlogloss-error:0.226692\n",
645 | "Stopping. Best iteration:\n",
646 | "[8]\tlogloss-error:0.225993\n",
647 | "\n"
648 | ]
649 | }
650 | ],
651 | "source": [
652 | "#note: 'logloss' below is only a display *label* for the eval set, not a metric.\n",
653 | "#Since params sets no 'eval_metric', xgboost falls back to its default metric for\n",
654 | "#binary:logistic, which is classification error -- hence the 'logloss-error' lines\n",
655 | "#in the output (~0.23, an error rate, not the ~0.47 logloss seen in xgb.cv above).\n",
656 | "#To actually monitor logloss, set params['eval_metric'] = 'logloss'.\n",
657 | "clft = xgb.train(params, xgtrains, num_boost_round=num_round,\n",
658 | "                 evals=[(xgtest, 'logloss')], early_stopping_rounds=10,\n",
659 | "                 verbose_eval=True)"
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": 92,
665 | "metadata": {
666 | "collapsed": true
667 | },
668 | "outputs": [],
669 | "source": [
670 | "#see here:\n",
671 | "#https://www.kaggle.com/ashhafez/springleaf-marketing-response/xgb-learning-rate-eta-decay/run/78945/code\n",
672 | "#http://discuss.analyticsvidhya.com/t/how-to-predict-class-labels-using-xgboost-in-python-when-objective-function-is-binary-logistic/7809"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": null,
678 | "metadata": {
679 | "collapsed": true
680 | },
681 | "outputs": [],
682 | "source": []
683 | }
684 | ],
685 | "metadata": {
686 | "kernelspec": {
687 | "display_name": "Python 2",
688 | "language": "python",
689 | "name": "python2"
690 | },
691 | "language_info": {
692 | "codemirror_mode": {
693 | "name": "ipython",
694 | "version": 2
695 | },
696 | "file_extension": ".py",
697 | "mimetype": "text/x-python",
698 | "name": "python",
699 | "nbconvert_exporter": "python",
700 | "pygments_lexer": "ipython2",
701 | "version": "2.7.11"
702 | }
703 | },
704 | "nbformat": 4,
705 | "nbformat_minor": 0
706 | }
707 |
-------------------------------------------------------------------------------- /wine_quality/README.txt: --------------------------------------------------------------------------------
1 | #dataset is from here: http://archive.ics.uci.edu/ml/datasets/Wine+Quality
2 | #there are many more great datasets there!
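A possible work-around for the `stratified` TypeError recorded in stratified_CV_with_xgboost.ipynb above, for xgboost builds that predate the keyword: run the stratified folds by hand with sklearn. A minimal sketch only, assuming the notebook's `train`, `target` and `params` objects, the sklearn-0.17-era import path, and the evals_result structure used in the dmlc demo linked in the notebook; `stratified_cv_logloss` is a hypothetical helper, not part of this repo.

import numpy as np
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold

def stratified_cv_logloss(params, X, y, num_round=50, n_folds=5, seed=0):
    """Hand-rolled stratified K-fold CV; returns the final test logloss per fold."""
    params = dict(params, eval_metric='logloss')  # make xgboost report logloss explicitly
    fold_scores = []
    for train_idx, test_idx in StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=seed):
        dtrain = xgb.DMatrix(X[train_idx], y[train_idx])
        dtest = xgb.DMatrix(X[test_idx], y[test_idx])
        evals_result = {}  # filled in-place by xgb.train (structure as in the dmlc evals_result demo)
        xgb.train(params, dtrain, num_boost_round=num_round,
                  evals=[(dtest, 'test')], evals_result=evals_result,
                  verbose_eval=False)
        fold_scores.append(evals_result['test']['logloss'][-1])
    return np.array(fold_scores)

# usage: scores = stratified_cv_logloss(params, train.values, target.values); print scores.mean()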
-------------------------------------------------------------------------------- /wine_quality/ipython_notebooks/box_cox.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sept 29 2015
4 |
5 | @author: hugobowne-anderson
6 | @email: hugobowne at gmail dot com
7 | """
8 |
9 | from scipy import stats
10 | import pandas as pd
11 |
12 | def box_cox(df, lmbda=None, alpha=None):
13 |     """
14 |     Performs a Box-Cox transformation on all columns (features) of a pandas
15 |     dataframe. The Box-Cox transform is only defined for strictly positive
16 |     values, and there is some ambiguity as to how best to handle non-positive
17 |     values & I need to check this out: for the moment, I simply shift each
18 |     feature so that min(value) > 0.
19 |     """
20 |     df_tr = pd.DataFrame(columns=df.columns) #initialize empty data frame with same features as df
21 |     for val in list(df.columns):
22 |         res = stats.boxcox(df[val] - min(df[val]) + 0.1, lmbda, alpha)
23 |         #boxcox returns a bare array when lmbda is given, otherwise a
24 |         #(transformed_data, estimated_lmbda, ...) tuple
25 |         df_tr[val] = res if lmbda is not None else res[0] #populate dataframe with transformed data
26 |     return df_tr
27 |
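A quick usage sketch for box_cox() above. The relative CSV path is an assumption based on this repo's layout, and the skewness printout is illustrative, not recorded output.

import pandas as pd
from box_cox import box_cox

wine = pd.read_csv('../winequality-red.csv', sep=';')  # the UCI file is semicolon-separated
features = wine.drop('quality', axis=1)
transformed = box_cox(features)  # lmbda=None, so scipy estimates lambda per feature

# the transform should pull skewed features closer to symmetric
print(features.skew())
print(transformed.skew())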
25 | """ 26 | #The Yeo-Johnson Transform is defined differently for differing values of lambda 27 | if lmbda == 0: 28 | #as transform is defined piecewise, I compute it using the sum of relational 29 | #operators: for this reason, I 1st define the 2 functions 30 | A1 = np.log(abs(x+1)) 31 | A1[A1 == -np.inf] = 0 #subtlety: if value = -inf , then term will not be used 32 | # BUT I do need to set it to 0 so that it IS unused below 33 | A2 = (np.power(1-x , 2) - 1)/2 34 | A2[np.isnan(A2)] = 0#subtlety: if value = NaN , then term will not be used 35 | # BUT I do need to set it to 0 so that it IS unused below 36 | x_yj = (x>=0)*A1 - (x<0)*A2 37 | elif lmbda == 2: 38 | #as transform is defined piecewise, I compute it using the sum of relational 39 | #operators: for this reason, I 1st define the 2 functions 40 | B1 = (np.power(x+1 , 2) - 1)/2 41 | B1[np.isnan(B1)] = 0#subtlety: if value = NaN , then term will not be used 42 | # BUT I do need to set it to 0 so that it IS unused below 43 | B2 = np.log(abs(1-x)) 44 | B2[B2==-np.inf] = 0#subtlety: if value = -inf , then term will not be used 45 | # BUT I do need to set it to 0 so that it IS unused below 46 | x_yj = (x>=0)*B1 - (x<0)*B2 47 | else: 48 | #as transform is defined piecewise, I compute it using the sum of relational 49 | #operators: for this reason, I 1st define the 2 functions 50 | C1 = (np.power(x+1 , lmbda) - 1)/lmbda 51 | C1[np.isnan(C1)] = 0#subtlety: if value = NaN , then term will not be used 52 | # BUT I do need to set it to 0 so that it IS unused below 53 | C2 = (np.power(1-x , 2-lmbda) - 1)/(2 - lmbda) 54 | C2[np.isnan(C2)] = 0#subtlety: if value = NaN , then term will not be used 55 | # BUT I do need to set it to 0 so that it IS unused below 56 | x_yj = (x>=0)*C1 + (x<0)*C2 57 | 58 | return x_yj 59 | 60 | def dfyeo_johnson(df, lmbda=0 ): 61 | """ 62 | Performs a Yeo-Johnson Transformation on all columns (features)of a dataframe 63 | """ 64 | df_yj = pd.DataFrame(columns=df.columns) #initialize empty data frame with same features as df 65 | for val in list(df.columns): 66 | df_yj[val] = yeo_johnson(df[val]) #populate dataframe with transformed data 67 | return df_yj 68 | -------------------------------------------------------------------------------- /wine_quality/ipython_notebooks/yjscratch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Sep 29 13:53:59 2015 4 | 5 | @author: hugobowne-anderson 6 | """ 7 | 8 | import numpy as np 9 | 10 | s = np.arange(10)-5 11 | import math as math 12 | import yeo_johnson as yj 13 | 14 | 15 | math.log(math.exp(1)) 16 | 17 | yeo_johnson(s , lmbda = 2) 18 | -------------------------------------------------------------------------------- /wine_quality/python/wine_classifier.py: -------------------------------------------------------------------------------- 1 | __author__ = "Fernando Carrillo" 2 | __email__ = "fernando at carrillo.at" 3 | 4 | from sklearn.grid_search import GridSearchCV 5 | from sklearn.cross_validation import cross_val_score 6 | from sklearn.metrics import classification_report 7 | 8 | class WineClassifier(object): 9 | """ 10 | Use classification (not regression) for wine quality. 11 | """ 12 | def __init__(self, X_train, y_train, X_valid, y_valid, pipeline, param_grid): 13 | """ 14 | Set the data sets. 
15 | """ 16 | self.X_train = X_train 17 | self.y_train = y_train 18 | self.X_valid = X_valid 19 | self.y_valid = y_valid 20 | self.pipeline = pipeline 21 | self.param_grid = param_grid 22 | 23 | def train(self, verbose=1, n_jobs=-1, scoring='accuracy', cv=10): 24 | """ 25 | Train the classifier by grid search 26 | """ 27 | if len(self.param_grid) != 0: 28 | grid_search = GridSearchCV(self.pipeline, param_grid=self.param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs, scoring=scoring) 29 | grid_search.fit(self.X_train, self.y_train) 30 | if verbose > 1: 31 | print( ('Best score %s with parameters %s') % (grid_search.best_score_, grid_search.best_params_)) 32 | self.pipeline = grid_search.best_estimator_ 33 | else: 34 | if verbose > 1: 35 | scores = cross_val_score(self.pipeline, self.X_train, self.y_train, cv=cv) 36 | print(('Best score %s') % (scores.mean())) 37 | self.pipeline.fit(self.X_train, self.y_train) 38 | 39 | def classification_report(self, print_stdout=True): 40 | """ 41 | Valid classifier on validation set. 42 | """ 43 | report = classification_report(self.y_valid, self.pipeline.predict(self.X_valid)) 44 | if print_stdout: print(report) 45 | return(report) 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /wine_quality/python/wine_data.py: -------------------------------------------------------------------------------- 1 | __author__ = "Fernando Carrillo" 2 | __email__ = "fernando at carrillo.at" 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | class WineData(object): 8 | """docstring for WineData""" 9 | def __init__(self, path_to_red, path_to_white): 10 | self.path_to_red = path_to_red 11 | self.path_to_white = path_to_white 12 | 13 | def _load(self, path_to_data): 14 | """ 15 | Loads the data from data 16 | """ 17 | data = np.array(pd.read_csv(path_to_data, header=0, sep=';')) 18 | X = data[:,:-1] 19 | y = data[:,-1] 20 | return X, y 21 | 22 | def load_red(self): 23 | """ 24 | Loads the red wine data 25 | """ 26 | return self._load(self.path_to_red) 27 | 28 | def load_white(self): 29 | """ 30 | Loads the white wine data 31 | """ 32 | return self._load(self.path_to_white) -------------------------------------------------------------------------------- /wine_quality/python/wine_explore.py: -------------------------------------------------------------------------------- 1 | __author__ = "Fernando Carrillo" 2 | __email__ = "fernando at carrillo.at" 3 | 4 | from matplotlib import pyplot as plt 5 | from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler 6 | from sklearn.decomposition import PCA 7 | from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding, SpectralEmbedding, MDS 8 | 9 | import pandas as pd 10 | 11 | def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''): 12 | """ 13 | Plot data transformed into two dimensions by PCA. 14 | PCA transforms into a new embedding dimension such that 15 | the first dimension contains the maximal variance and following 16 | dimensions maximal remaining variance. 17 | This shoudl spread the observed n-dimensional data maximal. This 18 | is unsupervised and will not consider target values. 
19 | """ 20 | if (scale): 21 | scaler = StandardScaler() 22 | X = scaler.fit_transform(X) 23 | 24 | if (normalize): 25 | normalizer = Normalizer(norm='l2') 26 | X = normalizer.fit_transform(X) 27 | 28 | if (embedding is 'pca'): 29 | pca = PCA(n_components=2) 30 | X_transformed = pca.fit_transform(X) 31 | elif (embedding is 'isomap'): 32 | isomap = Isomap(n_components=2, n_neighbors=20) 33 | X_transformed = isomap.fit_transform(X) 34 | elif (embedding is 'lle' ): 35 | lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5) 36 | X_transformed = lle.fit_transform(X) 37 | elif (embedding is 'tsne'): 38 | t_sne = TSNE(n_components=2) 39 | X_transformed = t_sne.fit_transform(X) 40 | elif (embedding is 'spectral'): 41 | se = SpectralEmbedding(n_components=2) 42 | X_transformed = se.fit_transform(X) 43 | elif (embedding is 'mds'): 44 | mds = MDS(n_components=2) 45 | X_transformed = mds.fit_transform(X) 46 | elif (embedding is 'gallery'): 47 | plt.figure(1) 48 | 49 | plt.subplot(231) 50 | plt.title('pca') 51 | X_t = PCA(n_components=2).fit_transform(X) 52 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y) 53 | 54 | plt.subplot(232) 55 | plt.title('isomap') 56 | X_t = Isomap(n_neighbors=20).fit_transform(X) 57 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y) 58 | 59 | plt.subplot(233) 60 | plt.title('lle') 61 | X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X) 62 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y) 63 | 64 | plt.subplot(234) 65 | plt.title('tsne') 66 | X_t = TSNE().fit_transform(X) 67 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y) 68 | 69 | plt.subplot(235) 70 | plt.title('spectral') 71 | X_t = SpectralEmbedding().fit_transform(X) 72 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y) 73 | 74 | plt.subplot(236) 75 | plt.title('mds') 76 | X_t = MDS().fit_transform(X) 77 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y) 78 | 79 | plt.suptitle('Gallery transforms ' + title) 80 | 81 | return plt 82 | else: 83 | raise ValueError("Choose between pca, isomap and tsne") 84 | 85 | plt.title(title + ' ' + embedding + ' plot') 86 | sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y) 87 | plt.colorbar(sc) 88 | return plt 89 | 90 | def pairs(X, y, title): 91 | """ 92 | Quick and dirty version of pairs. 
93 | """ 94 | df = pd.DataFrame(X) 95 | df[df.shape[1]] = y 96 | plt.title(title + ' Pairwise plot') 97 | axes = pd.tools.plotting.scatter_matrix(df, alpha=0.2) 98 | return plt 99 | -------------------------------------------------------------------------------- /wine_quality/python/wine_main.py: -------------------------------------------------------------------------------- 1 | __author__ = "Fernando Carrillo" 2 | __email__ = "fernando at carrillo.at" 3 | 4 | from wine_data import WineData 5 | from wine_preprocesser import WinePreprocesser 6 | from wine_explore import plot2d, pairs 7 | from wine_classifier import WineClassifier 8 | 9 | from time import time 10 | import numpy as np 11 | 12 | from sklearn.pipeline import Pipeline 13 | from sklearn.cross_validation import train_test_split 14 | from sklearn.preprocessing import StandardScaler 15 | from sklearn.naive_bayes import GaussianNB 16 | from sklearn.neighbors import KNeighborsClassifier 17 | from sklearn.svm import LinearSVC 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.decomposition import PCA 20 | 21 | # Load data and preprocess (everything you don't put in the pipeline) 22 | data = WineData('../winequality-red.csv', '../winequality-white.csv') 23 | 24 | print('Preprocesing.') 25 | t0 = time() 26 | wp = WinePreprocesser(data) 27 | #wp.add_divided_features(replace_inf_with_absmax=True) 28 | wp.polynomial_expansion(rank=2) 29 | wp.remove_low_variance_features(variance_threshold=0) 30 | X_red, y_red = wp.get_red() 31 | X_white, y_white = wp.get_white() 32 | print('Preprocesing. Done in %fs' % (time()-t0) ) 33 | ############################### 34 | # Explore data 35 | # 1. Plot in 2d, color code classes: 36 | # -> no simple low dimension linear separation 37 | # 2. Plot paris 38 | # -> correlation: transform data or use regularized methods 39 | # -> non-normal distributed featues: Box-Cox transform 40 | ############################### 41 | do_plot = False 42 | if (do_plot): 43 | plot2d(X_red, y_red, embedding='gallery', title='Red wine').show()#.savefig('../data/red_whine_2d_gallery.png') 44 | plot2d(X_white, y_white, embedding='gallery', title='White wine').show()#.savefig('../data/white_whine_2d_gallery.png') 45 | pairs(X_red, y_red, 'Red wine') 46 | pairs(X_white, y_white, 'White wine') 47 | 48 | ############################### 49 | # Classification 50 | # Prepare data 51 | ############################### 52 | #X = X_white 53 | #y = y_white 54 | X = X_red 55 | y = y_red 56 | X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state=23, test_size=0.2) 57 | 58 | ############################### 59 | # Classify on transformed dataset. 
-------------------------------------------------------------------------------- /wine_quality/python/wine_preprocesser.py: --------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | import numpy as np
5 | from sklearn.preprocessing import PolynomialFeatures
6 |
7 | class WinePreprocesser(object):
8 |     """Feature engineering for the red and white wine data sets."""
9 |     def __init__(self, wine_data):
10 |         self.X_red, self.y_red = wine_data.load_red()
11 |         self.X_white, self.y_white = wine_data.load_white()
12 |
13 |     def _divide_features(self, X, replace_inf_with_absmax):
14 |         """
15 |         Compute the reciprocal 1/x of every feature value.
16 | """ 17 | # Do the division 18 | nf = np.divide(1, X) 19 | # Replace inf by nan or by the maximal absolute value 20 | for i in np.arange(nf.shape[1]): 21 | if np.inf in nf[:,i]: 22 | a = nf[:,i] 23 | if replace_inf_with_absmax: 24 | a[np.isinf(a)] = a[np.argmax(abs(a[np.isfinite(a)]))] 25 | else: 26 | a[np.isinf(a)] = np.nan 27 | nf[:,i] = a 28 | return(nf) 29 | 30 | 31 | def add_divided_features(self, replace_inf_with_absmax=True): 32 | """ 33 | For each feature y_i add 1/y_i 34 | """ 35 | X_red_divided = self._divide_features(X=self.X_red, replace_inf_with_absmax=replace_inf_with_absmax) 36 | self.X_red = np.concatenate((self.X_red, X_red_divided), axis=1) 37 | X_white_divided = self._divide_features(X=self.X_white, replace_inf_with_absmax=replace_inf_with_absmax) 38 | self.X_white = np.concatenate((self.X_white, X_white_divided), axis=1) 39 | 40 | def polynomial_expansion(self, rank=2): 41 | """ 42 | Expand the features with polynonial of rank rnank 43 | """ 44 | pf = PolynomialFeatures(degree=2) 45 | self.X_red = pf.fit_transform(self.X_red) 46 | self.X_white = pf.fit_transform(self.X_white) 47 | 48 | def _remove_low_var(self, X, variance_threshold): 49 | """ 50 | Remove features with variance below threshold. 51 | """ 52 | remove_index = [] 53 | for col in range(X.shape[1]): 54 | if np.var(X[:,col]) < variance_threshold: 55 | remove_index.append(col) 56 | return(np.delete(X, remove_index, 1)) 57 | 58 | def remove_low_variance_features(self, variance_threshold=0): 59 | """ 60 | Remove features with variance below threshold. 61 | """ 62 | self.X_red = self._remove_low_var(self.X_red, variance_threshold) 63 | self.X_white = self._remove_low_var(self.X_white, variance_threshold) 64 | 65 | def yeo_johnson_transform(self): 66 | """ 67 | Implement yeo johnson transform 68 | """ 69 | raise NotImplementedError 70 | 71 | def get_red(self): 72 | """ 73 | Returns X, y of red wine data 74 | """ 75 | return self.X_red, self.y_red 76 | 77 | def get_white(self): 78 | """ 79 | Returns X, y of white wine data 80 | """ 81 | return self.X_white, self.y_white 82 | --------------------------------------------------------------------------------