├── .gitignore
├── POTUS_aka_El_Presidente
│   ├── README.txt
│   ├── best_model.txt
│   ├── model_selection.R
│   ├── notes.txt
│   ├── predict.R
│   ├── python_code
│   │   ├── python_POTUS.ipynb
│   │   ├── python_POTUS_notes.ipynb
│   │   └── python_POTUS_pipeline.ipynb
│   ├── test.csv
│   └── train.csv
├── Preprocessing_note.html
├── Preprocessing_note.ipynb
├── digit_recoginition
│   ├── digit_recog_classifier_test_data.py
│   └── digit_recog_grid_search.py
├── expedia
│   └── EDA_1st_model.ipynb
├── homesite
│   ├── Boris_gradient_boost.ipynb
│   ├── initial_foray_insurance.ipynb
│   └── initial_foray_insurance_grad_boosting.ipynb
├── notes_on_ML
│   ├── K-NN_and_preprocessing.html
│   ├── K-NN_and_preprocessing.ipynb
│   ├── Logistic_regression_and_preprocessing.ipynb
│   └── Scaling_synthesized_data.ipynb
├── paribas
│   ├── README.md
│   ├── boosting_in_barbados.ipynb
│   ├── exploratory_analysis.ipynb
│   ├── extra_trees_classifier.ipynb
│   ├── paribas_I.ipynb
│   └── stratified_CV_with_xgboost.ipynb
└── wine_quality
    ├── README.txt
    ├── ipython_notebooks
    │   ├── Predicting_Wine_Quality.ipynb
    │   ├── Testing Box_Cox.ipynb
    │   ├── box_cox.py
    │   ├── explore_wine_data.ipynb
    │   ├── yeo_johnson.py
    │   └── yjscratch.py
    ├── python
    │   ├── .ipynb_checkpoints
    │   │   └── wine_notebook-checkpoint.ipynb
    │   ├── wine_classifier.py
    │   ├── wine_data.py
    │   ├── wine_explore.py
    │   ├── wine_main.py
    │   ├── wine_notebook.ipynb
    │   └── wine_preprocesser.py
    ├── winequality-red.csv
    └── winequality-white.csv
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .RData
3 | .Rhistory
4 | .project
5 | .pydevproject
6 | *.pickle
7 | *.pyc
8 | data/
9 | __init__.py
10 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/README.txt:
--------------------------------------------------------------------------------
1 | Predicting Presidential Votes Across Counties
2 |
3 | This is a typical example of a job interview challenge (usually given as a homework assignment to complete in a couple of hours): they will provide you with a training .csv, ask you to build some predictive models and choose the best one, then ask you to apply it to their test data .csv and send them the results, along with your code.
4 | Herein find:
5 | -an example training set train.csv, which contains metadata about counties and who won in that county (Obama/Romney), and a test data set test.csv;
6 | -an R script model_selection.R that builds a variety of predictive models for the problem and chooses the one that performs best on train.csv (using repeated 10-fold cross validation);
7 | -an R script predict.R that predicts the election outcome for the counties (rows) in test.csv;
8 | -notes.txt, that explains the model building process;
9 | -best_model.txt, that describes the performance of the best model.
10 |
11 | Enjoy!
12 |
13 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/best_model.txt:
--------------------------------------------------------------------------------
1 | Stochastic Gradient Boosting
2 |
3 | 1213 samples
4 | 9 predictor
5 | 2 classes: 'Barack Obama', 'Mitt Romney'
6 |
7 | Pre-processing: centered, scaled, principal component signal extraction
8 | Resampling: Cross-Validated (10 fold, repeated 10 times)
9 |
10 | Summary of sample sizes: 1092, 1092, 1091, 1092, 1091, 1092, ...
11 |
12 | Resampling results across tuning parameters:
13 |
14 | interaction.depth n.trees Accuracy Kappa Accuracy SD Kappa SD
15 | 1 50 0.843 0.426 0.0243 0.1026
16 | 1 100 0.847 0.467 0.0258 0.0988
17 | 1 150 0.847 0.471 0.0274 0.1019
18 | 2 50 0.848 0.469 0.0248 0.0976
19 | 2 100 0.844 0.469 0.0263 0.0966
20 | 2 150 0.843 0.468 0.0262 0.0960
21 | 3 50 0.847 0.474 0.0256 0.0979
22 | 3 100 0.844 0.473 0.0280 0.0997
23 | 3 150 0.841 0.466 0.0280 0.0966
24 |
25 | Tuning parameter 'shrinkage' was held constant at a value of 0.1
26 | Accuracy was used to select the optimal model using the largest value.
27 | The final values used for the model were n.trees = 50, interaction.depth = 2 and shrinkage = 0.1.
28 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/model_selection.R:
--------------------------------------------------------------------------------
1 | #################################################################################
2 | ###THE SET UP
3 | #################################################################################
4 | library( glmnet )
5 | library( ggplot2 )
6 | library( caret )
7 | library( kernlab )
8 | library( klaR )
9 | library(doMC)
10 |
11 |
12 | ###HERE I TAKE ADVANTAGE OF MULTITHREADING
13 | ###Using multithreading with my Dual Core 2.8 GHz Intel Core i7 processor,
14 | ###the code below takes ~6 minutes to run
15 | nc <- detectCores()
16 | registerDoMC(cores = nc)
17 |
18 |
19 | rm(list=ls(all=TRUE))
20 | setwd("~/Documents/ML/")#SET YOUR WORKING DIRECTORY HERE
21 | data <- read.csv("train_potus_by_county.csv", header = TRUE )
22 |
23 |
24 | #################################################################################
25 | ###INITIAL DATA DIVE
26 | #################################################################################
27 | #HISTOGRAM OF RESPONSE VARIABLE TO CHECK FOR CLASS IMBALANCE
28 | q <- ggplot( data , aes(x=Winner))
29 | q + geom_histogram() ##note a class imbalance!
30 | ##CHECK FOR FEATURES WITH NEAR-ZERO VARIANCE (MAY THROW OFF MODELS)
31 | nzv <- nearZeroVar( data , saveMetrics=TRUE )
32 | #View(nzv) ##no variables with near-zero variance
33 | #VISUALIZE ALL VARIABLES AND THEIR RELATIONSHIPS
34 | #ggpairs( data ) #THIS FUNCTION IS COMPUTATIONALLY INTENSIVE AND NOT ESSENTIAL FOR WHAT FOLLOWS
35 |
36 | #################################################################################
37 | ###FEATURE SELECTION: I USE LASSO REGRESSION TO SELECT THE MOST IMPORTANT
38 | ###FEATURES IN DETERMINING THE WINNER
39 | ###(YOU COULD ALSO USE A NONLINEAR ALGORITHM, SUCH AS A RANDOM FOREST
40 | ###TO SELECT FEATURES: AN ADVANTAGE OF LASSO REGRESSION IS THAT IT
41 | ###SELECTS FEATURES AND TELLS YOU WHETHER THEY ARE +VELY OR -VELY
42 | ###CORRELATED WITH THE TARGET VARIABLE)
43 | #################################################################################
44 | ###SETUP INPUTS TO MODEL
45 | n <- length( data )
46 | x <- as.matrix(data[,-n])
47 | y <- as.matrix(data$Winner)
48 | ###RUN THE MODEL
49 | cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class",
50 | nfolds = 20 , nlambda = 1000 , alpha = 1)
51 | ##VARIABLES WITH NONZERO COEFFICIENTS ARE THE IMPORTANT VARIABLES
52 | coef(cvfit$glmnet.fit,s=cvfit$lambda.1se)
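## (Added sketch, not in the original analysis: instead of hard-coding the
## surviving feature names in `keep` below, you could pull the nonzero
## coefficients straight from the fitted lasso. Untested against this exact data.)
# co <- as.matrix(coef(cvfit$glmnet.fit, s = cvfit$lambda.1se))
# nz <- rownames(co)[co[, 1] != 0]
# keep_auto <- c(setdiff(nz, "(Intercept)"), "Winner")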
53 |
54 | ###KEEP IMPORTANT FEATURES AND RESPONSE VARIABLE
55 | keep <- c("Median.age","X..BachelorsDeg.or.higher","Unemployment.rate",
56 | "Total.households","X..Owner.occupied.housing","X..Renter.occupied.housing",
57 | "Median.home.value","Population.growth", "Per.capita.income.growth",
58 | "Winner")
59 | data <- data[,keep] ##KEEP ONLY THE MOST IMPORTANT FEATURES & RESPONSE VARIABLE
60 |
61 | #################################################################################
62 | ###IN WHICH I BUILD A NUMBER OF MODELS TO PREDICT THE RESPONSE VARIABLE
63 | ###I TRY LOGISTIC REGRESSION, SVMs, NEURAL NETWORKS, RANDOM FORESTS,
64 | ###GENERALIZED BOOSTED MODELS AND NAIVE BAYES.
65 | ###NOTE: PREPROCESSING OCCURS WITHIN EACH TRAINING METHOD.
66 | #################################################################################
67 |
68 |
69 | ###DETAILS OF MODEL TRAINING (REPEATED 10-FOLD CROSS VALIDATION)
70 | fitControl <- trainControl(## 10-fold CV
71 | method = "repeatedcv",
72 | number = 10,
73 | #classProbs = TRUE,
74 | ## repeated ten times
75 | repeats = 10)
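## (Added sketch: notes.txt floats up-/down-sampling to address the class
## imbalance. Newer caret releases expose this directly through trainControl's
## `sampling` argument -- an assumption about your caret version, and not part
## of the original run.)
# fitControlUp <- trainControl(method = "repeatedcv", number = 10,
#                              repeats = 10, sampling = "up")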
76 |
77 | ###I DEFINE THE PREPROCESSING THAT I'LL PERFORM IN EACH MODEL FITTING
78 | preProc = c("center", "scale","pca") ##centre & scale data, pca on predictor variables
79 | tL = 5 #number of levels for each tuning parameter in training: you could do wider and
80 | #more rigorous tuning by specifying model-dependent parameter grids. Do this and
81 | #your models will perform better!!
82 |
83 |
84 | # Start the clock!
85 | ptm <- proc.time()
86 | ###LOGISTIC REGRESSION (AS A PARTICULAR "GENERALIZED LINEAR MODEL")
87 | lrfit <- train( Winner ~. , data = data , method = "glm", family = binomial,
88 |                 trControl = fitControl, preProcess = preProc,
89 |                 tuneLength = tL)
90 |
91 | ###SUPPORT VECTOR MACHINE (RADIAL BASIS KERNEL)
92 | svmfit <- train( Winner ~. , data = data , method = 'svmRadial',
93 |                 trControl = fitControl, preProcess = preProc,
94 | tuneLength = tL)
95 |
96 | ###NEURAL NETWORK
97 |
98 | nnetfit <- train( Winner ~. , data = data , method = "nnet",
99 |                 trControl = fitControl, preProcess = preProc)
100 |
101 |
102 | ###RANDOM FOREST
103 |
104 | rffit <- train( Winner ~. , data = data , method = "rf",
105 |                 trControl = fitControl, preProcess = preProc)
106 |
107 |
108 | ###GENERALIZED BOOSTED MODEL
109 |
110 | gbmfit <- train( Winner ~. , data = data , method = "gbm",
111 |                 trControl = fitControl, preProcess = preProc)
112 |
113 | ###NAIVE BAYES
114 |
115 | nbfit <- train( Winner ~. , data = data , method = "nb",
116 |                 trControl = fitControl, preProcess = preProc)
117 | # Stop the clock
118 | proc.time() - ptm
119 | #################################################################################
120 | ###COMPARE ALL MODELS
121 | #################################################################################
122 | ####
123 |
124 | resamps <- resamples(list(nnet = nnetfit , gbm = gbmfit , lr = lrfit,
125 | svm = svmfit , rf = rffit , nb = nbfit))
126 | summary( resamps )
127 | ###GBM HAS THE HIGHEST MEAN ACCURACY
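## (Added sketch: caret's lattice helpers give a quick visual of the comparison.)
# dotplot(resamps, metric = "Accuracy")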
128 |
129 | #################################################################################
130 | ###PRODUCE OUTPUTS
131 | #################################################################################
132 |
133 |
134 | ###SAVE BEST MODEL TO THE FILESYSTEM
135 | save(gbmfit , file = "mymodelgbm.rda")
136 |
137 | ###LOG DATA ABOUT EXPECTED PERFORMANCE OF MODEL
138 | sink(file="performance.txt")
139 | gbmfit
140 | sink(NULL)
141 |
142 |
143 |
144 |
145 | #################################################################################
146 | ###BELOW I INCLUDE SOME PREPROCESSING CODE THAT
147 | ###REMOVES HIGHLY CORRELATED FEATURES AND LOOKS FOR COLLINEARITY.
148 | ###THIS PREPROCESSING DID NOT IMPROVE MODEL PERFORMANCE,
149 | ###SO I DID NOT INCLUDE IT IN THE CODE ABOVE.
150 | #################################################################################
151 |
152 | # ###remove correlated variables
153 | # dummies <- dummyVars( ~ ., data )
154 | # df <- predict(dummies, newdata = data)
155 | # da <- data.frame(df)
156 | # descrCor <- cor( da )
157 | # #summary(descrCor[upper.tri(descrCor)])
158 | # highlyCorDescr <- findCorrelation(descrCor, cutoff = .75)
159 | # filteredDescr <- da[,-highlyCorDescr]
160 | # #descrCor2 <- cor(filteredDescr)
161 | # #summary(descrCor2[upper.tri(descrCor2)])
162 | # filteredDescr$Winner.Barack.Obama <- NULL
163 | # filteredDescr$Winner <- data$Winner
164 | # data <- filteredDescr
165 | # ##find linear combos
166 | # comboInfo <- findLinearCombos(data) #none
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/notes.txt:
--------------------------------------------------------------------------------
1 | Notes
2 |
3 | I chose to use R to tackle this assignment. In particular, I made use of the package ‘caret’. See all dependencies at the bottom of these notes.
4 |
5 |
6 | Approach
7 |
8 | 1. Before attempting to build any models, I dove into the data: the most important aspects that I noticed immediately were (i) that all predictor variables were numerical AND (ii) that there was a class imbalance in the response variable (approx. 1/4 for “Barack Obama”). I also noticed that the response variable was binary.
9 |
10 | (i) above indicated that preprocessing all features via scaling and centering would be appropriate. (ii) above made me aware that I should use modelling techniques that are good at dealing with class imbalances, for example ensemble methods such as random forests and boosting (one could also implement up-/down-sampling).
11 |
12 | In order to compare models, I needed first to decide on a metric of comparison: I chose ‘Accuracy’, because in predicting voting behaviour, we want to be as accurate as possible. (I am aware that ‘Accuracy’ may be problematic due to the class imbalance problem, but a 25%/75% split isn’t too bad.) I could also have used other metrics, such as area under the ROC curve or specificity.
13 |
14 | 3. I wanted to select the most important features so that the others did not introduce unwanted noise into the modelling process. I did so using Lasso regression. Note: I also attempted to engineer new features (linear combinations, products, ratios of existing features) but this did not contribute to overall performance; I also attempted to remove correlated features but this did not contribute to overall performance.
15 |
16 | 4. I wanted to implement as many models as possible and use the best one on the test data (with ‘Accuracy’ as metric, as discussed above): for each model, I preprocessed the data (scaling, centering and principal component signal extraction) and used repeated 10-fold cross validation to retrieve the ‘Accuracy’ of each model (with error bars) for different meta-parameters.
17 |
18 | 5. The models I chose were logistic regression, support vector machines, neural networks, random forests, stochastic gradient boosting and naive Bayes.
19 |
20 | 6. Stochastic gradient boosting won with an accuracy of 84.8% (pretty good out of the box!): as reported in performance.txt, “The final values used for the model were n.trees = 50, interaction.depth = 2 and shrinkage = 0.1.” Note that many of the models I tried had pretty similar accuracies.
21 |
22 | 7. If I had more time, I would have definitely played around with (i) up-, down- and mixed sampling of the training data, (ii) using bagging techniques other than random forests, (iii) SMOTE to address the class imbalance problem in another manner and (iv) really working on feature engineering as this is the real key!
23 |
24 |
25 |
26 | Dependencies
27 |
28 | R libraries:
29 | library( glmnet )
30 | library( ggplot2 )
31 | library( caret )
32 | library( kernlab )
33 | library( klaR )
34 | library(doMC)
35 |
36 |
37 |
38 | HUGO BOWNE-ANDERSON
39 | 07-14-2015
40 |
41 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/predict.R:
--------------------------------------------------------------------------------
1 | #################################################################################
2 | ###THE SET UP
3 | #################################################################################
4 |
5 | library( caret )
6 | rm(list=ls(all=TRUE))
7 | setwd("~/Documents/ML/")#SET YOUR WORKING DIRECTORY HERE
8 | data <- read.csv("test_potus_by_county.csv", header = TRUE )
9 | load( "mymodelgbm.rda")
10 | #################################################################################
11 | ###RUN MODEL AND WRITE PREDICTIONS TO .CSV
12 | #################################################################################
13 | predictions <- predict(gbmfit , data )
14 |
15 | write.table(predictions , "predictions.csv" , row.names = FALSE , col.names = FALSE)
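## (Added sketch: a quick sanity check on the class balance of the predicted
## winners before sending off the .csv.)
# table(predictions)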
16 |
--------------------------------------------------------------------------------
/POTUS_aka_El_Presidente/python_code/python_POTUS_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
PIPELINING WITH POTUS DATA AND MACHINE LEARNING
"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "IMPORT SOME LIBRARIES AND READ IN DATA
"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/html": [
27 | "\n",
28 | "
\n",
29 | " \n",
30 | " \n",
31 | " | \n",
32 | " Total population | \n",
33 | " Median age | \n",
34 | " % BachelorsDeg or higher | \n",
35 | " Unemployment rate | \n",
36 | " Per capita income | \n",
37 | " Total households | \n",
38 | " Average household size | \n",
39 | " % Owner occupied housing | \n",
40 | " % Renter occupied housing | \n",
41 | " % Vacant housing | \n",
42 | " Median home value | \n",
43 | " Population growth | \n",
44 | " House hold growth | \n",
45 | " Per capita income growth | \n",
46 | " Winner | \n",
47 | " Win_bin | \n",
48 | "
\n",
49 | " \n",
50 | " \n",
51 | " \n",
52 | " 0 | \n",
53 | " 9278 | \n",
54 | " 37.9 | \n",
55 | " 12.6 | \n",
56 | " 21.3 | \n",
57 | " 13992 | \n",
58 | " 3802 | \n",
59 | " 2.42 | \n",
60 | " 51.9 | \n",
61 | " 16.6 | \n",
62 | " 31.6 | \n",
63 | " 63959 | \n",
64 | " -0.69 | \n",
65 | " -0.49 | \n",
66 | " 0.71 | \n",
67 | " Barack Obama | \n",
68 | " True | \n",
69 | "
\n",
70 | " \n",
71 | " 1 | \n",
72 | " 18594 | \n",
73 | " 36.3 | \n",
74 | " 9.7 | \n",
75 | " 14.3 | \n",
76 | " 14622 | \n",
77 | " 6764 | \n",
78 | " 2.55 | \n",
79 | " 63.7 | \n",
80 | " 16.2 | \n",
81 | " 20.1 | \n",
82 | " 74330 | \n",
83 | " -0.13 | \n",
84 | " 0.03 | \n",
85 | " 0.85 | \n",
86 | " Barack Obama | \n",
87 | " True | \n",
88 | "
\n",
89 | " \n",
90 | " 2 | \n",
91 | " 662628 | \n",
92 | " 37.9 | \n",
93 | " 27.9 | \n",
94 | " 12.1 | \n",
95 | " 23909 | \n",
96 | " 267862 | \n",
97 | " 2.41 | \n",
98 | " 57.0 | \n",
99 | " 28.8 | \n",
100 | " 14.2 | \n",
101 | " 112687 | \n",
102 | " -0.09 | \n",
103 | " 0.00 | \n",
104 | " 0.55 | \n",
105 | " Barack Obama | \n",
106 | " True | \n",
107 | "
\n",
108 | " \n",
109 | " 3 | \n",
110 | " 21292 | \n",
111 | " 38.9 | \n",
112 | " 14.1 | \n",
113 | " 15.7 | \n",
114 | " 16829 | \n",
115 | " 8547 | \n",
116 | " 2.47 | \n",
117 | " 63.5 | \n",
118 | " 17.1 | \n",
119 | " 19.4 | \n",
120 | " 73643 | \n",
121 | " -0.59 | \n",
122 | " -0.43 | \n",
123 | " 0.57 | \n",
124 | " Barack Obama | \n",
125 | " True | \n",
126 | "
\n",
127 | " \n",
128 | " 4 | \n",
129 | " 13252 | \n",
130 | " 34.5 | \n",
131 | " 15.0 | \n",
132 | " 15.8 | \n",
133 | " 13012 | \n",
134 | " 5222 | \n",
135 | " 2.47 | \n",
136 | " 53.7 | \n",
137 | " 20.7 | \n",
138 | " 25.6 | \n",
139 | " 56642 | \n",
140 | " -1.16 | \n",
141 | " -1.03 | \n",
142 | " 0.69 | \n",
143 | " Barack Obama | \n",
144 | " True | \n",
145 | "
\n",
146 | " \n",
147 | "
\n",
148 | "
"
149 | ],
150 | "text/plain": [
151 | " Total population Median age % BachelorsDeg or higher Unemployment rate \\\n",
152 | "0 9278 37.9 12.6 21.3 \n",
153 | "1 18594 36.3 9.7 14.3 \n",
154 | "2 662628 37.9 27.9 12.1 \n",
155 | "3 21292 38.9 14.1 15.7 \n",
156 | "4 13252 34.5 15.0 15.8 \n",
157 | "\n",
158 | " Per capita income Total households Average household size \\\n",
159 | "0 13992 3802 2.42 \n",
160 | "1 14622 6764 2.55 \n",
161 | "2 23909 267862 2.41 \n",
162 | "3 16829 8547 2.47 \n",
163 | "4 13012 5222 2.47 \n",
164 | "\n",
165 | " % Owner occupied housing % Renter occupied housing % Vacant housing \\\n",
166 | "0 51.9 16.6 31.6 \n",
167 | "1 63.7 16.2 20.1 \n",
168 | "2 57.0 28.8 14.2 \n",
169 | "3 63.5 17.1 19.4 \n",
170 | "4 53.7 20.7 25.6 \n",
171 | "\n",
172 | " Median home value Population growth House hold growth \\\n",
173 | "0 63959 -0.69 -0.49 \n",
174 | "1 74330 -0.13 0.03 \n",
175 | "2 112687 -0.09 0.00 \n",
176 | "3 73643 -0.59 -0.43 \n",
177 | "4 56642 -1.16 -1.03 \n",
178 | "\n",
179 | " Per capita income growth Winner Win_bin \n",
180 | "0 0.71 Barack Obama True \n",
181 | "1 0.85 Barack Obama True \n",
182 | "2 0.55 Barack Obama True \n",
183 | "3 0.57 Barack Obama True \n",
184 | "4 0.69 Barack Obama True "
185 | ]
186 | },
187 | "execution_count": 1,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "import numpy as np\n",
194 | "import pandas as pd\n",
195 | "import matplotlib.pyplot as plt\n",
196 | "%matplotlib inline\n",
197 | "pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier\n",
198 | "##check out tutorial here:\n",
199 | "##http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%201%20-%20Reading%20from%20a%20CSV.ipynb\n",
200 | "df = pd.read_csv('../train.csv')\n",
201 | "df1 = df.drop('Winner', 1)\n",
202 | "df['Win_bin'] = (df['Winner'] == 'Barack Obama') ##new column: logical wrt winner\n",
203 | "df.head()"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "SPLIT DATA INTO TRAINING AND TEST SETS
"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 2,
216 | "metadata": {
217 | "collapsed": false
218 | },
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "X_train shape: (1091, 14)\n",
225 | "y_train shape: (1091,)\n",
226 | "X_test shape: (122, 14)\n",
227 | "y_test shape: (122,)\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "from sklearn.cross_validation import train_test_split\n",
233 | "X_train, X_test, y_train, y_test = train_test_split(df1, df['Winner'], test_size = 0.1, \n",
234 | " random_state=0)\n",
235 | "print(\"X_train shape: %s\" % repr(X_train.shape))\n",
236 | "print(\"y_train shape: %s\" % repr(y_train.shape))\n",
237 | "print(\"X_test shape: %s\" % repr(X_test.shape))\n",
238 | "print(\"y_test shape: %s\" % repr(y_test.shape))"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "OPENING THE PIPELINE
"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 3,
251 | "metadata": {
252 | "collapsed": false
253 | },
254 | "outputs": [],
255 | "source": [
256 | "#see here for intuition:\n",
257 | "#http://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html\n",
258 | "from sklearn import linear_model, decomposition, datasets, preprocessing\n",
259 | "from sklearn.pipeline import Pipeline\n",
260 | "from sklearn.grid_search import GridSearchCV\n",
261 | "from sklearn.svm import LinearSVC\n",
262 | "#build a scaler component to pipeline:\n",
263 | "scaler = preprocessing.StandardScaler().fit(X_train)\n",
264 | "#see here for 'scaler in pipeline' details: \n",
265 | "#http://scikit-learn.org/stable/modules/preprocessing.html\n",
266 | "#X_train_scaled = scaler.transform(X_train)\n",
267 | "#Instantiate a model:\n",
268 | "logistic = linear_model.LogisticRegression()\n",
269 | "#this is the pipe!:\n",
270 | "svm = LinearSVC() # Instantiate the model\n",
271 | "tuned_parameters = 10.**np.arange(-3,5)\n",
272 | "pipe = Pipeline(steps=[('scale', scaler), ('svm', svm)])"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 4,
278 | "metadata": {
279 | "collapsed": false
280 | },
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
287 | " intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',\n",
288 | " random_state=None, tol=0.0001, verbose=0))])\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "estimator = GridSearchCV(pipe,\n",
294 | " dict(\n",
295 | " svm__C=tuned_parameters))\n",
296 | "estimator.fit(X_train , y_train);\n",
297 | "print(estimator.best_estimator_)"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 13,
303 | "metadata": {
304 | "collapsed": false
305 | },
306 | "outputs": [
307 | {
308 | "data": {
309 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZwAAAEQCAYAAACEM8KaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt8E2W+P/BPkjZp06SF3lPK3YK0tUBBFxRWvEApUkSW\nFkHFlQV2V9yfe1bOYRftli5UFM9x9awedRe14LqsWw6uFKzItWcrdbkUaLHlspRbIemNXpKmSZrM\n/P4oRAppk5TMLfm+Xy9fMMlk5jNDzbfP88w8I2NZlgUhhBDCMbnQAQghhAQGKjiEEEJ4QQWHEEII\nL6jgEEII4QUVHEIIIbyggkMIIYQXQVxuvLKyElu3bgUA5OTkIDU1tdd1S0tLsWvXLigUCixYsMC5\nbnNzM9555x04HA6MHDkSzz77LJeRCSGEcISzgsMwDIqKipCbmwsAKCgoQEpKCmQymcv1i4uLsWHD\nBlgsFhQUFKCgoAAA8Mknn+DJJ5/E6NGjuYpKCCGEB5wVHIPBAJ1OB6VSCQCIi4tzvuZKYmIiqqur\n0draiqSkJADdRau+vp6KDSGE+AHOCo7JZIJarUZhYSEAQK1Ww2g09lpw0tLSsHPnTtjtdmRkZAAA\n2tvbYbPZsGHDBnR2diIzMxP33XcfV5EJIYRwiLOCo9FoYDabsXTpUrAsi40bNyI8PNzluvX19aio\nqMCqVasAAHl5eUhLS4NGo4FarcbKlSvBMAxyc3Mxbtw4Z6uJEEKIdHBWcOLj46HX653LBoMB8fHx\nLtdlGAYOhwMAwLIsbDZbd7igIERFRaG1tRWRkZEICuo77t69e32UnhBCAssjjzzC+T44KzhyuRzz\n58/H2rVrAQDZ2dnO98rLy6FSqZCeng4A0Ol0SEpKwvr168EwDDIyMpytmKeffhoffPABzGYzJk+e\n7LZ1c2ObhBBCPFNRUcHLfmT+NFv03r17JV1wysrKMGXKFKFj9IuUswOUX2iUX1gVFRW8tHDoxk9C\nCCG8oBYOIYQEOGrhEEII8SucTm1DvCPlfmApZwf4z28ymdDW1tbrzBveamtrQ0REhE+2JQTKzy2W\nZREREQGNRiNoDio4hPCsqakJMpkMCQkJPis4CQkJPtmOUCg/t1iWxbVr12CxWBAdHS1YDupSExEp\ntxCknB3gN7/NZkNUVJTPig0h7shkMkRFRTnvcRQKFRxCCCG8oIIjImVlZUJH6DcpZwekn58QKaCC\nQwjxidbWVmRmZiItLQ3vvPNOr+u9/vrrOHv2bJ/bGjx4sFf7fu+999DZ2XlH+yTco/twCOHZ1atX\nRT/IfCdef/11hIWF4YUXXuj3NoYMGYJLly55vP64ceOwb98+REZG9nufgaC3nz26D4cQwjuj0YjU\n1FTn4LLdbkdaWhra2tqc769YsQLz5s3Dfffdh3Xr1nm87Q8//BCzZs3CsGHDcPz48R7vnThxAtOm\nTUNWVhYKCgpw8+/Bfe3TYrEgMzMTDQ0NePLJJzFr1izU1dV5tM+LFy8iJycHmZmZyMjI6NGt+tpr\nr+E3v/kNli1bhocffhjLli3z+DiPHz+Oxx9/HLNnz8YzzzyD5uZm53uXLl3C/fffj1dffRWPPvoo\nHn/88R6fHTt2LDZv3ozp06dj6tSpPYru//zP/+DRRx/FjBkz8Ktf/QoWi8X53l/+8hesWLECzz33\nHGbMmIFXXnnF47x8ooIjIlIeR5BydkD6+X1Fq9XiwQcfxNdffw0A2LdvHyZNmuS8x0Sr1WLdunXY\ntm0bSktLsWXLFhgMBo+2/ZOf/ARffvkl0tLSbrtC7+c//znWrl2L4uJiPPLII7BarT0y9bbPkJAQ\nlJSUIDY2Fp999hm+/PJLJCYmerTPn/70p1i2bBlKSkqwceNGrFixAq2trc73q6ur8eabb2Lv3r04\ncuQILly44PYYbTYbfvGLX+CDDz7Ajh07MH/+fKxZs6bHOufPn0dycjL27NmDL774osd7MpkMZ86c\nwe7du/GPf/wDQ4YMAQDs378fO3bsQElJCb7++muoVCq8+eabPT574MAB/PrXv8bXX3/t1S8CfKL7\ncAgRoRkbj/lkO18vHe/1Z5566im8//77mD17Nj777DMsXry4x/sKhQK7du3CpUuXoFQq0dDQ0Ouj\nRzzR2toKo9GIqVOnAgAmTZqEkJAQTvdpNBpRV1eH6dOnA+geM/rBD36AQ4cOYcaMGZDJZJgxYwa0\nWq3z/RutvL6cPXsWV65cwfLlywF0P3pFpVL1WGfEiBGYO3dur9t46aWXbntt3759WLhwIYKDgwEA\nS5cuxfLly7F69WoA3YUqKytL9E9HpoIjIlK+l0XK2QHx5e9PofCVyZMnY+XKlbh06RK+++47PPjg\ng873vvvuO/zsZz/DkiVLcM899yAqKgp3Ogwsl/fd0cLFPgHctg2GYXq0gvqzD4VCgSFDhmD79u13\nnO9WDMP0+PutLTYpDMdTlxohpAeZTIb58+dj6dKl+NGPftTjvdLSUsyYMQPPPfccwsPDcenSpTv+\nogsPD0dsbCy+/fZbAMCuXbtgNpu92qdKpUJDQwMAz754tVothg4dipKSEgDAhQsXcOjQoTt+hH1S\nUhKsVit27NjhfM0XheDRRx/Fli1bnF2Nf/rTn5ytM1/tgw9UcEREyuMIUs4OSD+/ry1YsACVlZVY\nuHBhj9fnzZvnLADvvvsuJk+e7Pyiv9mHH36Ixx57DF1dXR7t7+2338bq1asxc+ZMHDlyBGq12qt9\nLlmyBE899RSeeOIJfPLJJx7t8/3338fHH3+MmTNnYvny5Xjvvfd6zIfWn5kgFAoFPv30U2zevBkZ\nGRnIzMzEn/70px7r9LXd3t578MEH8fjjj+Oxxx7D9OnTYbfb8ctf/rLH56QwcwVdFi0iUp4AU8rZ\nAX7z+/tl0US86LJo4iTlL2wpZwekn58QKaCCQwghhBdUcEREyuMIUs4OSD8/IVJABYcQQggvqOCI\niJTHEaScHeA//833VBDCBzH8zFHBIYRn0dHRuHLliii+AEhgYBgGV65cEfRpnwDNNCAqUr60WMrZ\nAX7zK5VKxMXFeTwHmSfa2tp63EMiNZSfe3FxcVAqlYJm4LTgVFZWYuvWrQCAnJwcpKam9rpuaWkp\ndu3aBYVCgQULFvRYt6urCy+++CLmzJmDmTNnchmZ+AGWZWGxM+iwOdBhc8B0/c8OmwMma/ey+frr\nN95rvhaCsv0XMDA0CAPVwd1/hn7/Z0RIEBRy391Yp1QqfXovTm1tLcaMGeOz7fGN8gcGzgoOwzAo\nKipCbm4uAKCgoAApKSm93g1bXFyMDRs2wGKxoKCgAAUFBc73du/ejREjRkjiTto7IeUWgi+zOxgW\n5q6bCoPVgY6u63+6KiLX1zHftE6QXIYwlQIaZRDClHKEKRXQKBXOP9VKBWI1SmhU3a/JZbFo7bTj\nWmcXWjvtqG3uREunHS2dXWjpt
MNktUOrCkKkOggDQoMRGXrLn+rvC1R4SBDkPP+sSvlnB6D8gYKz\ngmMwGKDT6ZxNuBtdCDqdzuX6iYmJqK6uRmtrK5KSkpyvW61WVFZWYtKkST2e/0DEy+Zg0HFTkXDV\nori5xXFrEbHYGYQG3ygQcoQpg3r8PUwpR7Q6GEMHhiJMKYdG2bOwhCkVCFb4dnjSwbBotdjRYu7q\nUYiazF0429zZvWzufr3D5kBEiOuW0s0tqMjQYGhUCt6LEyFC4azgmEwmqNVqFBYWAgDUajWMRmOv\nBSctLQ07d+6E3W5HRkaG8/WSkhLMnDmzx3Mq/JUYxkFYlkVnF9N3i+KWItJhc6CxzQRGoUSHzQGG\nhfOL32XRUAUhMSL4erEIur5u9981KgVCg+W8fwm7O/cKuQxR6mBEqYPdbqvLwaDNYse1TjtaO7tw\n7XohqjfZcKqxw1mYWi12dHYx3cXpekFy1YIaGBqMgeru89dbK18MPzt3gvIHBs4KjkajgdlsxtKl\nS8GyLDZu3Ijw8HCX69bX16OiogKrVq0CAOTl5SEtLQ12ux2nTp3C3LlzceDAAY/2e/M//I2b+aSy\nXFVVdcfbY1hg3L2T0GFzoOzQUVgcwPDRyeiwOVB16iwsDhmi4gehw+bARX0DLIwMwaEamGwOtHZY\nYWUAVdD1bia7FSoFC13UQGhUCrQ3NyBEzuLukcOgC1eh7vxZDJID901Kw+mTxxGisCBEzmLa1Acg\nk8lc57UDUyZ9v+wAkCaS8+/L5egwJU4dOwQAmHnz+0pgysPfL9sZIDn9PrR0duGbo5UwtcsQqh2G\nq+1WlH53ESa7DIxSjWvmLljtDoQpWMQP0GBgaBBs7c0IC2KRNmoEFLZezjct07KHy3zgbPJOhmGQ\nl5eH3NxcsCyLdevWYe3atS7X1ev12Lx5M1atWgWWZbF69Wrk5+fj5MmT2LlzJ7RaLRobG+FwOPDC\nCy/0eKLfzaQ+eSfLsrA5WI8Hunu0Pq53YVntDMKUCqiDFdCovh+v0PRocdz0d9Xtrwf5cHCc+I7V\nzvQYZ7p2vVvvUksn9EYb/nvOKL8f5yTc4GvyTs5aOHK5HPPnz3cWmezsbOd75eXlUKlUzuKg0+mQ\nlJSE9evXg2EYZGRkQKlUIj093bnOgQMHYLVaey02UuFgWOiNVlxutaKuzYK6Nisut1lwtd0Ko8UB\nAM6B7FsLwY3l6LDgHgXl5veF6I4i/FAFyRGnVSJO2/PSVoZlseLvp/HNhTZMGT5AoHSEuEePJ+BI\nm8WOulYLLrd1F5bLbVZcbrWg3mRDtDoYiREhSBygwuCIECRGqDAoQoXvjh7CtB9Ksx9Y6n3YUs//\n8Vfl+MY0AB/Mu9unl2/zRernX+r5Jd/CCQRdDgb6dhsutVm6WyutVmeLhWWBxAgVEgeEYHCECo/e\nFYnBA1RI0KqgDHJ9BVUvLxPi1l1hDlTZFdj7r2uYMSpK6DiEuEQtHDdYlkVLp93ZSqlrvdENZkVj\nhw2xYUokRqgweEBIjz8HhARRfzrh1XcGE147cBEfZo+B0seXhRP/Ri0cntnsDK60d7dO6lq/7war\na7NCIQMSI0Iw+HoX2D06DRIjQqDTKn1+vwch/ZUSr8GwgSHYWdOEJ1JjhY5DyG0CquCwLIsmcxfq\nWq8XlhvjK61WtHR2IV6r6m6lRKgwNkGLx8ZEY3BECMJD+DlNUu4HlnJ2wH/y/3iiDqu/OoeMUVFQ\nKxVCx/KYv5x/0je/LDidXQ5cud7t5bwSrNWCK+1WhATJuwfsrxeW9EFaDI5QIV6rkuRgKyE3Gxml\nxrgELbZ914inx8cLHYeQHvxuDOc/TyvRbrEjIVzV3Q0WoepxNZhG5Zc1lhCnK21WvLj9ND7KTuat\ndU6kjcZw+um/ZichNkxJrRUSsAZFqPDD4QPx1xP1WP6DQULHIcTJ70a8dRLuGrsx1YQUSTk74H/5\nnxofj11nmtHUYRMokXf87fwT1/yu4BBCgKiwYGSOjsKfj/nuIW+E3Cm/G8MRy0wDhAit3WLHkqJq\nvD1nFAZFhAgdh4gYX2M41MIhxE+FhwRhXmosCo/qhY5CCAAqOKIi5X5gKWcH/Df/E6kxqNKb8K8m\nM8+JvOOv55/0RAWHED8WGqzAk+Pi8fERauUQ4dEYDiF+zuZg8JOiGvzHtKG4J14jdBwiQjSGQwjx\nCaVCjsUT4vHR4avwo98viQRRwRERKfcDSzk74P/5Hx4ZCZPNgUOX23lK5B1/P/+kGxUcQgKAQi7D\njyfo8PERPRhq5RCB0BgOIQGCZVm8uP0MnkiNwUMjI4WOQ0SExnAIIT4lk8nw3L0J2HRUDzvjN79n\nEgmhgiMiUu4HlnJ2IHDyj0/QIk6jwlenmzlO5J1AOf+BjgoOIQFmyb06fHrMAIudEToKCTA0hkNI\nAPrdnvO4O0aNnLFxQkchIkBjOIQQzvx4gg5FVQ0wWe1CRyEBhAqOiEi5H1jK2YHAyz9kYAgmDQlH\nUVUDR4m8E2jnP1Bx/sTPyspKbN26FQCQk5OD1NTUXtctLS3Frl27oFAosGDBAue6f/zjH6HX68Ew\nDJ5//nnExVE3ACF36pl0HX7++SnMTY7BQHWw0HFIAOB0DIdhGOTl5SE3NxcAUFBQgDVr1kAmc/1E\nzpUrV2LDhg2wWCwoKChAQUFBj/dPnjyJ8vJyLFu2zOXnaQyHEO+8V14HhmWx4v7BQkchAvKLMRyD\nwQCdTgelUgmlUom4uDgYDL0/gTAxMRHV1dWoqKhAUlLSbe+HhIQgKIjzRhkhAePJcXHYd64FeqNV\n6CgkAHBacEwmE9RqNQoLC1FYWAi1Wg2j0djr+mlpadi5cydKS0tddr3t378fM2bM4DKyoKTcDyzl\n7EDg5h8YGow5yTH4pELYR1EH6vkPNJwWHI1GA7PZjEWLFmHhwoXo6OhAeHi4y3Xr6+tRUVGBVatW\n4eWXX0ZxcTFsNpvz/SNHjiAhIQGDBg3qc583/8OXlZVJarmqqkpUeWg5MJbn3xOLI5fbsW3vN6LI\nQ8vCLPOBtzEclmWxbt06rF271uW6er0emzdvxqpVq8CyLFavXo38/HwolUrU1tairKwMixcv7nN/\nNIZDSP9srazHyfoOrJk+QugoRAB8jeFwOiAil8sxf/58Z5HJzs52vldeXg6VSuUsEDqdDklJSVi/\nfj0YhkFGRgaUSiUA4M0330RUVBTy8/MxePBgLFmyhMvYhAScrOQYbPuuETUNHRgTGyZ0HOKnaKYB\nESkrK8OUKVOEjtEvUs4OUH4A+PJUEw7UtmDDrNsv2OEanX9h+cVVaoQQ6cgYFYVGUxcqrojzIW1E\n+qiFQwhxOnCuBVurGvCHx0f1er8c8T/UwiGE8O6HIwbAwbIou9AmdBTih6jgiAjflyj6kpSzA5T/\nBrlMhiUTE1B45CocPD6kjc5/YKCCQwjpYWKiFgNCg7HnX9eEjkL8DI3hEEJu8129Cev3X8
BH2clQ\nKuj3Un9HYziEEMGkxGkwfGAodtY0CR2F+BEqOCIi5X5gKWcHKL8rz01MwF9P1MNsc/h827ei8x8Y\nqOAQQlwaERWKcQlabDspjoe0EemjMRxCSK+utlvx/744jQ+zkxERQo8G8Vc0hkMIEVxCuAo/HDEQ\nn52oFzoK8QNUcEREyv3AUs4OUP6+PDU+HrvONKOxw+Z+5X6i8x8YqOAQQvoUpQ7GrNFR+LPAD2kj\n0kdjOIQQt4xWO5YU1eD3WUlIjAgROg7xMRrDIYSIhlYVhHmpMdh0VC90FCJhVHBERMr9wFLODlB+\nT8xNiUGV3oSzTWafb5vOf2CggkMI8UhosAILx8Xj4yNXhY5CJIrGcAghHutyMPjJ1hqs/OFQpOk0\nQschPiKaMZyXX34ZpaWl6Orq4jwMIUTcghVyLE7X4aPDV+FHv6sSnrgtOEuXLsW5c+fw0ksvobCw\nEHV1dXzkCkhS7geWcnaA8nvjoZED0dHlwD8v++5R1HT+A4PbuSqGDx+O4cOHw26348iRI1i/fj2i\no6ORlZWFiRMn8pGRECIiCrkMz03UofDIVdw3OBxyehQ18ZBHYzjXrl1DaWkpysrKMGTIEEydOhXH\njx8HACxZsoTzkJ6iMRxC+MGyLH5ZfAaPJ8fg4bsihY5D7hBfYzhuWzivvvoqGhsb8fDDD2PNmjXQ\narUAgPT0dOTm5nIekBAiPjKZDM9NTMBbZZcwdfgABNND2ogH3P6UzJkzB7///e+RlZXlLDY3zJw5\nk7NggUjK/cBSzg5Q/v4Yl6BFvFaFr0433/G26PwHBrctnNTU1F7fe+CBB/r8bGVlJbZu3QoAyMnJ\n6XNbpaWl2LVrFxQKBRYsWOBc15ttEEL4tWRiAvJ212L6qCiEBFErh/TN7U/IhQsXbnutpqbG7YYZ\nhkFRURFeeeUVvPLKKygqKurzMsri4mKsW7cOv/nNb7Bly5Z+bUPqpkyZInSEfpNydoDy99eoGDWS\n48LwxXeNd7QdOv+BwW3B2bhx422v3SgIfTEYDNDpdFAqlVAqlYiLi4PB0Ptss4mJiaiurkZFRQWS\nkpL6tQ1CCP+enaDD1qoGmKx2oaMQkXNbcOTy21fxpJVhMpmgVqtRWFiIwsJCqNVqGI3GXtdPS0vD\nzp07UVpa6uw283YbUiflfmApZwco/50YMiAEk4aEo6iy/4+ipvMfGNwWHIVCgaamJueyXq93WYRu\npdFoYDabsWjRIixcuBAdHR0IDw93uW59fT0qKiqwatUqvPzyyyguLobNZvNqGzfc/A9fVlYmqeWq\nqipR5aFlWvZ0+Zl0Hf5eZcBXB74RRR5a9n6ZD27vw6mursb777+PSZMmgWEYHDx4ECtWrEBKSkqf\nG2YYBnl5ecjNzQXLsli3bh3Wrl3rcl29Xo/Nmzdj1apVYFkWq1evRn5+PoKCgjzeBkD34RAipPe+\nrQPDsFhx/2ChoxAv8XUfjkc3fjY0NODYsWOQyWQYN24cYmNjPdr4iRMnnFeYZWdnIy0tDQBQXl4O\nlUrVozhs27YNp0+fBsMweOCBBzBt2rQ+t+EKFRxChNPa2YWfbK3BO3NHQ6dVCR2HeEFUBUcqpF5w\nysrKJHu1i5SzA5TfVzYf1cNgtOI/pg3z6nNiyd9fUs8vmpkGAKClpQWtra3OiwVaW1sl/cVOCOHG\nj+6JxXN/q8b5a50YHhkqdBwiMm5bOFu2bMGBAwcQHByM8PBwNDQ0YMyYMXjppZf4yugxqbdwCPEH\nW6saUGUwIX/6CKGjEDccDIvXDlxA5oAWcbRwvv32W/zhD3/Avn37MGzYMKjVauzYsYPzYIQQaZoz\nJhrbTjagpqEDY2LDhI5D+vDtpTY0mrqAAfzsz+31zTExMVAqlYiJicGlS5cwZMgQXLlyhY9sAYfv\nSxR9ScrZAcrvS8ogOZ4ZH+/VQ9rElL8/pJq/uKYJWcnRvO3PbcGJjIyEyWTCmDFjsHv3bvzlL3/x\n6+llCCF3bsaoKDSbu1BxxX9v1Ja6ujYLaps7MXU4T80beDCG09nZidDQ7sG/ixcvoqqqCg888AAG\nDhzIS0Bv0BgOIeJRWtuCv1XW453HR0NGD2kTnfe/rUOwQo6f3JvA21Vqbls4N4oNAAwdOhSzZ88W\nZbEhhIjL1OEDwLLAPy60Ch2F3MJiZ7Dn7DU8dncUr/ul+cRFRKr9wIC0swOUnwtymQxL7k1A4RE9\nHEzf3fBizO8NqeU/cK4FY2LDEM/zDbpuC05+fj4fOQghfmjCIC0iQ4Ox++w1oaOQ61iWxfbqRl4v\nFrjBbcGxWq185CCQ9jM1pJwdoPxckV1v5fz5mB42O9PremLN7ykp5T/daIbJ5sDExL4nQuaC24Iz\nduxYlJeX85GFEOKHkuPCMCIyFDtONblfmXBue00TZo+JhlyACzncFpxDhw7hnXfewUsvveT8b+XK\nlXxkCzhS6we+mZSzA5Sfa89NTMBnJ+phtjlcvi/2/O5IJX+bxY7yi23IGMXvxQI3uJ1pYNWqVXzk\nIIT4seGRoRifoMW2kw14Ol0ndJyAtetMM+4fGoGIEI+m0fQ5mi2aEMKLq+1W/OKL0/goO1mwL7xA\nxrAsfvy3aqx+aBjuvmXKIdHch0MIIb6QEK7CtBED8dfjBqGjBKQjde3QqhQYHaMWLIPbXzNee+21\n216TyWTU1cYBKT9TQ8rZAcrPl0Xj47H8f2vwRGosYjVK5+tSyd8bKeQvrm5C1pgYQWd9cFtwsrKy\neiyfPn0aFouFs0CEEP8VpQ7GrLuj8ekxA/5t6hCh4wQMg9GKmoYOvPzIcEFz9GsM56OPPsKSJUu4\nyHNHaAyHEPEzWu1YUlSD32clITEiROg4AeHDw1fR5WDws0mJLt8X7RiOxWJBXV0dF1kIIQFAqwrC\nvNQYbDqiFzpKQLA5GOw63YzZY/ifWeBWbgvOM888g8WLFzv/e/755zF27Fg+sgUcqVzL74qUswOU\nn29zU2JQVW/C2SYzAOnlv5WY8//jfCtGRIWKojXpdgznk08+4SMHISSAhAYrsGhcPD4+chWvzrxL\n6Dh+rbi6CdlpsULHAECXRYuK2K9y6YuUswOUXwiZo6NQ12ZFpd4oyfw3E2v+c81mNHTYMGlIhNBR\nAHhQcM6fP3/bazU1NZyEIYQEjmCFHIvTdfjosJ6eIsyR4pomPHZ3NBRycTwAz23B+fDDD297bcuW\nLZyECXRi7gd2R8rZAcovlIdGDoS5y4GPvvpW6Ch3RIznv8PmwP/VtiJztDDzprnidgxHLr+9Jnn6\n20hlZSW2bt0KAMjJyUFqaqrL9cxmM9544w3ncm1tLTZt2gQAKC0txa5du6BQKLBgwYJet0EIkR6F\nXIZnJ+jwwT868BzLCjKDsb/affYaJiRqEakOFjqKk9uCo1Ao0NTUhOjo7kvq9Hq9yyJ0K4ZhUFRU\nhNzcXABAQUEBUlJSXN7lqlarkZeXBwC4ePEiS
kpKnO8VFxdjw4YNsFgsKCgoQEFBgWdHJkFi7Qf2\nhJSzA5RfSPcPjcCnxzT45kIbpg4fIHScfhHb+WdZFsXVjXhxirhurnVbcLKzs/G73/0OkyZNAsMw\nOHjwIFasWOF2wwaDATqdDkpl9/QVcXFxztf6UlJSgszMTOdyYmIiqqur0draiqSkJLf7JYRIi0zW\n3crZePgq7h8aIZrxBik7oTdBLpfhnvgw9yvzyG1TJTk5Ga+88gqioqIQGxuLNWvWICUlxe2GTSYT\n1Go1CgsLUVhYCLVaDaPR2OdnjEYjmpubMXToUOdraWlp2LlzJ0pLS/2+O02M/cCeknJ2gPILzXax\nCqFBcvzf+Raho/SL2M7/9uomZI2JFnTeNFc8uiw6NjYWGRkZmDFjBmJjPbueW6PRwGw2Y9GiRVi4\ncCE6OjoQHt73I0337NnTY3qF+vp6VFRUYNWqVXj55ZdRXFwMm83W5zZu/ocvKyuT1HJVVZWo8tAy\nLfO1LJMBE0Ka8cdvzsPBsILnkfJyU4cNJ/RGhDWe9urzfHA7l5rdbkdQUM+et87OToSGhva5YYZh\nkJeXh9zcXLAsi3Xr1mHt2rW9ru9wOLBmzRrk5+c7x4j0ej02b96MVatWgWVZrF69Gvn5+c5uulvR\nXGqESBcZsJN9AAAWPUlEQVTLsnhp51nMHBWFGQI9kdIfbD6qR5vFjl88MNjjz4hmLrVbiwTLsi4f\nWXDbhuVyzJ8/H2vXrsW6deuQnZ3tfK+8vBwVFRU91j98+DAmTJjQ44IEnU6HpKQkrF+/Hq+++ioy\nMjJ6LTaEEGmTyWT48QQdPj1mgJ2h+3L6w86wKBHJvGmuuL1o4NYGkEwm8/iy6LFjx7qcd23y5Mm3\nvTZp0iSX25g3b55H+/IHZWXif6ZGb6ScHaD8QruRP02nRbxWid1nmpF5tzi/NF0Ry/k/eLEVCeEq\nDI/suwdKKG5bOA6Ho8e4SWdnJ7q6ujgNRQgJXM9OSMCnxw2wORiho0hO8fWLBcTKbQvnwQcfxOuv\nv445c+bA4XDgiy++wLRp03iIFnjE8BtSf0k5O0D5hXZz/uS4MAwdEIqvTjdjTnKMgKk8J4bzf6nF\ngsutFjwwTBzzprnituDMmDEDWq0We/fuhUwmQ0ZGBu6//34+shFCAtSzE3RYs7sWGaOioAqiOYY9\nUVzThJmjoxCsEO/58ijZ5MmT8atf/QrLly+H1WrF+vXruc4VkPi+RNGXpJwdoPxCuzX/qBg1kmLU\n2HmqSaBE3hH6/Hd2ObDv3DXMEvm4l9sWjtlsxpEjR3Dw4EE0NjZi7NixyMrK4iMbISSAPZuuw+qv\n/oXM0VEIDVYIHUfU9p1rwT3xGsRqxH0Vb6/34ZSVleHgwYPQ6/W49957cezYsR4TbIoR3YdDiH9Z\nt/c8RkWrkTM2TugoosWyLH7++Sksu28QJiT2fXN9bwS/D+cPf/gDlEolfvvb32LRokV0/wshhHfP\npMejqKoBZptD6CiiVd3QAaudxfhBWqGjuNVrwXn77bcxZMgQvP7661i7di3a29thMpn4zBZwhO4H\nvhNSzg5QfqH1ln/owFCkD9Li8+8aeU7kHSHPf3F1E2aPiZbEox16HcOJj4/HvHnzMG/ePFy9ehUH\nDx5Efn4+QkNDkZ6ejrlz5/KZkxASoJ5Jj8cvt5/B48nR0KjcDjsHlJbOLhy63I4V9ycKHcUjbudS\nu1VdXR0OHjyInJwcrjL1G43hEOKf/rP0ImI0Sjw7oe/HmwSav54w4EqbFS/9cKj7lfsg+BhObxIT\nE0VZbAgh/uup9Hhsr25Eu8UudBTRcDAsdtY0I0siN8cC/Sg4hDtS7oeXcnaA8gvNXX6dVoWpwweg\nqLKep0TeEeL8H7rcjoGhQRgVreZ93/1FBYcQIgmLxsXjy9PNaOmkuRwBoLimEVnJ4r7R81Zej+GI\nGY3hEOLf3j14GUFyGX46SRqD5Fy52m7Fi9vP4NMnU6D0wdQ/oh3DIYQQoTw5Lh5fn72G5o7AbuXs\nqGnCjKRInxQbPkkrrZ+Tcj+8lLMDlF9onuaPUgdjRlIk/nrCwHEi7/B5/q12BrvPXsNjIn4MQW+o\n4BBCJCVnbBz2nWtBg8nmfmU/VFrbglHRaiSEq4SO4jUawyGESM6Hh6/CaLXjl1OGCB2Fd7/44jSe\nGh+PSUN899wbGsMhhJBeZN8Ti7LzrdC3W4WOwqszjWa0dtpxbz8n6RQaFRwRkXI/vJSzA5RfaN7m\nDw8JwpzkGHx6TBxjOXyd/+KaRjw2JgoKufjnTXOFCg4hRJLmpcbgn5fbUddmEToKL4xWO7650IaM\nUVFCR+k3GsMhhEjWp8cMuNRqwW8eGiZ0FM79b1UDzjaZ8euHhvl82zSGQwghbjyREoOKK0ZcaOkU\nOgqnGJbFjpomyc0scCsqOCIi5X54KWcHKL/Q+ptfrVQg+55YfFIh7FgO1+f/2BUjVEFyJMeGcbof\nrnH6cInKykps3boVAJCTk4PU1FSX65nN5h6Pr66trcWmTZsAAM3NzXjnnXfgcDgwcuRIPPvss1xG\nJoRITFZyNLb9rRrnms0YGSWdiSy9UXy9dSOTwEPW+sLZGA7DMMjLy0Nubi4AoKCgAGvWrHF7wi5e\nvIiSkhL87Gc/AwC89dZbyMzMxOjRo93uk8ZwCAlM20424MRVE/JnjBA6is81mGz4+een8OcnUxAa\nrOBkH5IfwzEYDNDpdFAqlVAqlYiLi4PB4L7ZW1JSgszMTADdRau+vt6jYkMICVyz747G2SYzzjSa\nhY7ic1+easLDIyM5KzZ84qzgmEwmqNVqFBYWorCwEGq1Gkajsc/PGI1GNDc3Y+jQ7qfXtbe3w2az\nYcOGDcjPz8ehQ4e4iisKUu6Hl3J2gPIL7U7zK4PkeHJcHDYd1fsokXe4Ov9dDgZfnW6W/MUCN3BW\ncDQaDcxmMxYtWoSFCxeio6MD4eF93x27Z8+eHs06jUYDtVqNlStX4uWXX8bnn38Om63v+ZNu/ocv\nKyuT1HJVVZWo8tAyLUtpObzpNM4YWlFd3yGKPL5Y/nDXPzFkYAiGDAjhfH984GUMh2VZrFu3DmvX\nru11fYfDgTVr1iA/Px9y+fd18K233sLixYsRGRmJ3Nxc5ObmQqlUutwGjeEQEthKTjXhQG0LXp+V\nJHQUn/jVjjN4IiUWU4cP4HQ/fI3hcHaVmlwux/z5851FJjs72/leeXk5VCpVj+Jw+PBhTJgwoUex\nAYCnn34aH3zwAcxmMyZPntxrsSGEkOmjovBZZT0q9Uak6bRCx7kj5691Qt9uw+ShvpukU2g004CI\nlJWVYcqUKULH6BcpZwcov9B8mX/32WaUnG7Gfz2WxNtlxFyc///+5jIGhgbhmXSdT7friuSvUiOE\nECE8PDISrZ12VFzp+yIlMeuwOVBa24JZo/3jYoEbqIVDCPE7+8+14POTDXh7zihJ3iy5vboRJ/Qm\n5D4y
nJf9UQuHEEL66cERA9BpZ3DocrvQUbzGsmz3zAISfIS0O1RwRITvSxR9ScrZAcovNF/nl8tk\nWJyuw6ajevDRiePL/FWGDjAMi7E6jc+2KRZUcAghfumBYRFgAXxzsU3oKF4prmlEVnKMJLsC3aGC\nIyJSvspIytkByi80LvLfaOVsPqoHw3Erx1f5r5m7cLTOiOlJkT7ZnthQwSGE+K1JQ8KhCpLj/2pb\nhY7ikZLTzfjhiAEIU0p/3jRXqOCIiJT74aWcHaD8QuMqv0wmw7MTdPikQg8Hw10rxxf5HQyLnaf8\n82KBG6jgEEL82oRBWoSHBGH/uRaho/Tp20ttiA1T+u0zfQC6D4cQEgCOXzXirbLL+HD+GCjk4hyM\n/3XJvzA9KRKP3MX/+A3dh0MIIT4yLkGLmLBg7D57TegoLtW1WVDb3Mn5JJ1Co4IjIlLuh5dydoDy\nC42P/D+eoMOnxwzocjA+3/ad5t9R04SZo6OgVPj3V7J/Hx0hhFyXEq/B4AEq7DojrlaOxc5gz9lr\neOxu/71Y4AYqOCIi5XsppJwdoPxC4yv/4nQd/nLcAJvdt62cO8m//1wLkuPCEKf1/0evUMEhhASM\nu2PDcFdUKHaeahI6CoDr86ZVNyJrTIzQUXhBBUdEpNwPL+XsAOUXGp/5F6fr8FllPSw+bOX0N/+p\nRjM6bA5MSJT2w+I8RQWHEBJQ7opWIzk2DMXVjUJHQXFNE2aPiYbcD+dNc4UKjohIuR9eytkByi80\nvvM/k65DUWUDzDaHT7bXn/xtFju+vdiGjFFRPskgBVRwCCEBZ3hkKMYlaPCFgK2cXWeaMXloBMJD\nggTLwDcqOCIi5X54KWcHKL/QhMj/TLoO2042osMHrRxv8zMsix1++pC1vlDBIYQEpMEDQnDv4HD8\nb1UD7/s+UtcOrUqB0TH+O2+aK1RwRETK/fBSzg5QfqEJlf/p8fHYXt2Idov9jrbjbf7i6iZkjfHP\nh6z1hQoOISRgJYSr8MCwAby2cgxGK2oaOjBt5EDe9ikWVHBERMr98FLODlB+oQmZf9G4eOw41YTW\nzq5+b8Ob/DtPNePRpEiEBAXe1y+nl0dUVlZi69atAICcnBykpqa6XM9sNuONN95wLtfW1mLTpk3O\n5a6uLrz44ouYM2cOZs6cyWVkQkiAidMqMW3EQPytsgHLfzCI033ZHAx2nW7Gm1lJnO5HrDgrOAzD\noKioCLm5uQCAgoICpKSkuOyzVKvVyMvLAwBcvHgRJSUlPd7fvXs3RowY4ff9nVLuh5dydoDyC03o\n/AvHxeGn205h/j2xiFQHe/15T/P/43wrRkSFIjEixOt9+APO2nQGgwE6nQ5KpRJKpRJxcXEwGAxu\nP1dSUoLMzEznstVqRWVlJSZOnAg/elYcIUREosOUeDQpEn89Uc/pfoqrmzAnObAuhb4ZZwXHZDJB\nrVajsLAQhYWFUKvVMBqNfX7GaDSiubkZQ4cOdb5WUlISMN1oUu6Hl3J2gPILTQz5n0yLw95/XUNj\nh83rz3qS/1yzGY0dNvxgcER/4vkFzgqORqOB2WzGokWLsHDhQnR0dCA8PLzPz+zZs6fHY07NZjNO\nnTqFcePGebzfm//hy8rKJLVcVVUlqjy0TMuBtPxdxT+RGtaJLcfqOdn+n/afRGpoh/MR10If763L\nfJCxHPVTMQyDvLw85ObmgmVZrFu3DmvXru11fYfDgTVr1iA/Px9yeXcdrKiowM6dO6HVatHY2AiH\nw4EXXngBiYmJLrexd+9epKenc3E4hJAA0GaxY0lRNd6dOxrxWpXPtmuy2rH4s2psnD+mX2NEXKuo\nqOjxyz5XOLtoQC6XY/78+c4ik52d7XyvvLwcKpWqR3E4fPgwJkyY4Cw2AJCenu5c58CBA7Barb0W\nG0IIuVMRIUGYPSYanx4z4KUfDnX/AQ/tPnsNExO1oiw2fOL0suixY8di7Nixt70+efLk216bNGlS\nn9uaNm2ar2KJVllZmeBX6/SXlLMDlF9oYso//55YPPe3alxps2JQhGetnL7ysyyL4pom/NvUIb6M\nKUmBd+cRIYT0QasKwtyUGPz5mN4n2zuuNyFILkNqXJhPtidlVHBERCy/4fWHlLMDlF9oYsv/RGos\njtQZcanF4tH6feXvnjct2u/vI/QEFRxCCLlFmFKBH90Tg08q7qyV09Rhwwm9EY/cFemjZNJGBUdE\n+L5E0ZeknB2g/EITY/7Hk2NQaTDh/LVOt+v2lv/LU82YNmIg1EqFr+NJEhUcQghxITRYgey0OGw+\n2r9Wjp1hUXK6GbMD7CFrfaGCIyJi68f2hpSzA5RfaGLNnzUmGqcazTjbZO5zPVf5D15sRUK4CsMj\nQ7mKJzlUcAghpBeqIDmeHNu/Vs6NiwXI96jgiIgY+7E9JeXsAOUXmpjzZ94dhdprnahp6Oh1nVvz\nX2qx4HKrBQ8MC9x501yhgkMIIX1QKuRYND4em7xo5RTXNGHm6CgEK+gr9mZ0NkRErP3YnpBydoDy\nC03s+TNGReFquxVVBpPL92/O39nlwL5z1zDrbupOuxUVHEIIcSNILsNT4+Ox6Yje7XO59p1rQVq8\nBrEaJU/ppIMKjoiIuR/bHSlnByi/0KSQ/9G7ItFs7sLxq7e3cm7kZ1kWxdWNyArgh6z1hQoOIYR4\nQCGX4en07rGc3lo51Q0dsDlYjEvQ8pxOGqjgiIjY+7H7IuXsAOUXmlTyTxsxEB02Bw7Xtfd4/Ub+\n7dVNmD0mGnKaN80lKjiEEOIhhVyGZ9Ljsfmo4bZWTktnFw5fbsf0JJo3rTdUcERECv3YvZFydoDy\nC01K+acMHwA7w6D8UpvztbKyMnx1uhlThg2AVsXpY8YkjQoOIYR4QS6TYfEEHTYf1YO53sphWGDn\nqSa6WMANKjgiIpV+bFeknB2g/EKTWv7JQyIQJJej7EIrACB4yD2IDA1GUrRa4GTiRgWHEEK8JJPJ\nsHhCPD45aoCDYVFcQ5dCe4IKjohIqR/7VlLODlB+oUkx/72J4QhTKrDlRD2q9e14cPhAoSOJHhUc\nQgjpB5lMhmevj+WMjeiCMoi+Tt2Rse7maZCQvXv3Ij09XegYhJAAwbIsNh66isdTYiQ9lU1FRQUe\neeQRzvdD1+8RQkg/yWQyLPvBIKFjSAbnBaeyshJbt24FAOTk5CA1NdXlemazGW+88YZzuba2Fps2\nbQIA/PGPf4RerwfDMHj++ecRFxfHdWxBlJWVSe5qnRuknB2g/EKj/IGB04LDMAyKioqQm5sLACgo\nKEBKSgpkLqZ9UKvVyMvLAwBcvHgRJSUlzveWL18OADh58iS2b9+OZcuWcRmbEEIIBzgd5TIYDNDp\ndFAqlVAqlYiLi4PBYHD7uZKSEmRmZt72ekhICIKC/LcXUMq/IUk5O0D5hUb5AwOn394mkwlqtRqF\nhYUAulsxRqMROp2u188YjUY0Nzdj6NCht723f/9+zJo1i6u4h
BBCOMRpC0ej0cBsNmPRokVYuHAh\nOjo6EB4e3udn9uzZ4/JqiSNHjiAhIQGDBvnvAJ0U70W4QcrZAcovNMofGDht4cTHx0Ov//454AaD\nAfHx8b2u73A4UFFRgfz8/B6v19bWorq6GosXL3a7z4qKiv4HFpharZZsfilnByi/0Ch/YOD8PpwT\nJ044r1LLzs5GWloaAKC8vBwqlarHfTPffvstDAYD5s6d22MbL7zwAqKioiCXyzF48GAsWbKEy8iE\nEEI44Fc3fhJCCBEvmouBEEIIL6jgEEII4QUVHEIIIbxQrFmzZo3QIVyprKzEu+++i/379yMmJgax\nsbFer9vb6zU1NXjzzTeh1+sxduxY0R8Hn3n74s2xiCVzf/J4c5xccpXZ22x8H4s3mcVyLL7IzPex\ncJmZ02NhRcjhcLCvvPIKa7VaWavVyv72t79lGYbxeN2+XmdZlj1x4gT7z3/+k928ebPoj4PPvH3x\n5lhYVhyZb+ZpHm+Pk0u3ZvY2mxDH4klmsR3LnWQW6li4yMzHsYiyS82bKXFcravX63t9HQDS0tKg\n0WgkcRx85u2Lt9MUiSHzzTzN09/pmLhwa2ZvswlxLJ5k7u3/T6GO5U4yC3UsXGTm41hEOTGZN1Pi\n9Lbujb97M62Or/niOPjM2xex5/MVMR+nt9nEcCy++v+Tz2PxNrMYjsVXmbk+FlG2cLyZEqe3dfsz\nrY4Yj0MsxJ7PV8R8nN5mE8Ox+Or/Tz6PxdvMYjgWX2Xm+lhE2cLxZkqc3tZlGKbPbbA83O/qi+O4\ngY+8ffF2miJA+My38iRPf46TSzdn9jabUMfiSWZ3/3/eiutjuZPMQh0LF5m5PhbRzjTgzZQ4va3b\n2+t///vfcfz4cbS2tiI5Odn5vB2xHgefefvizbGIJbO7PN78O4ghc1/ZxHAs3mQWy7H4IjPfx8Jl\nZi6PRbQFhxBCiH8R5RgOIYQQ/0MFhxBCCC+o4BBCCOEFFRxCCCG8oIJDCCGEF1RwCCGE8EKUN34S\n4i/a29uxceNG1NfXIyQkBGFhYfj3f/93yGQyoaMRwjsqOIRw6KOPPsL48ePx0EMPAQDMZjMVGxKw\nqEuNEI50dHTg7NmzzmIDdE9uSEigooJDCEcaGhoEe3gbIWJEBYcQQggvqOAQwpHY2FjU19eLbtZs\nQoRCBYcQjoSFhWH06NH4+uuvna/V19cLmIgQYdFs0YRwyGQyYePGjdDr9VAqldBqtXjhhRfo4gES\nkKjgEEII4QV1qRFCCOEFFRxCCCG8oIJDCCGEF1RwCCGE8IIKDiGEEF5QwSGEEMILKjiEEEJ4QQWH\nEEIIL/4/AqL60pT/poMAAAAASUVORK5CYII=\n",
310 | "text/plain": [
311 | ""
312 | ]
313 | },
314 | "metadata": {},
315 | "output_type": "display_data"
316 | }
317 | ],
318 | "source": [
319 | "plt.plot([c.mean_validation_score for c in estimator.grid_scores_], label=\"validation error\")\n",
320 | "plt.xticks(np.arange(len(tuned_parameters)), tuned_parameters); plt.xlabel(\"C\"); plt.ylabel(\"Accuracy\");plt.legend(loc='best');"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 10,
326 | "metadata": {
327 | "collapsed": false
328 | },
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "mean: 0.83410, std: 0.00447, params: {'svm__C': 0.001}\n",
335 | "mean: 0.83868, std: 0.00546, params: {'svm__C': 0.01}\n",
336 | "mean: 0.84051, std: 0.00431, params: {'svm__C': 0.10000000000000001}\n",
337 | "mean: 0.84235, std: 0.00448, params: {'svm__C': 1.0}\n",
338 | "mean: 0.83960, std: 0.00714, params: {'svm__C': 10.0}\n",
339 | "mean: 0.78277, std: 0.04901, params: {'svm__C': 100.0}\n",
340 | "mean: 0.72411, std: 0.04093, params: {'svm__C': 1000.0}\n",
341 | "mean: 0.79652, std: 0.02038, params: {'svm__C': 10000.0}\n"
342 | ]
343 | }
344 | ],
345 | "source": []
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "MAKE PREDICTIONS ON TEST SET
"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 45,
357 | "metadata": {
358 | "collapsed": false
359 | },
360 | "outputs": [
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "accuracy of best SVM = 0.901639344262\n"
366 | ]
367 | }
368 | ],
369 | "source": [
370 | "predictions = estimator.best_estimator_.predict(X_test)\n",
371 | "#linear SVM that performed the best above\n",
372 | "#print(predictions)\n",
373 | "#how accurate was this?\n",
374 | "#do this tomorrow\n",
375 | "acc = float(sum(np.equal(predictions , y_test)))/len(predictions)\n",
376 | "print('accuracy of best SVM = %s' % acc)"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": null,
382 | "metadata": {
383 | "collapsed": true
384 | },
385 | "outputs": [],
386 | "source": []
387 | }
388 | ],
389 | "metadata": {
390 | "kernelspec": {
391 | "display_name": "Python 2",
392 | "language": "python",
393 | "name": "python2"
394 | },
395 | "language_info": {
396 | "codemirror_mode": {
397 | "name": "ipython",
398 | "version": 2
399 | },
400 | "file_extension": ".py",
401 | "mimetype": "text/x-python",
402 | "name": "python",
403 | "nbconvert_exporter": "python",
404 | "pygments_lexer": "ipython2",
405 | "version": "2.7.10"
406 | }
407 | },
408 | "nbformat": 4,
409 | "nbformat_minor": 0
410 | }
411 |
--------------------------------------------------------------------------------
/digit_recoginition/digit_recog_classifier_test_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pylab as pl
3 |
4 | from sklearn import svm, metrics, preprocessing
5 |
6 | import csv
7 |
8 | import time
9 | start_time = time.time()
10 |
11 | from numpy import genfromtxt
12 | my_data = genfromtxt('train.csv', delimiter=',')
13 |
14 | print time.time() - start_time, "seconds" #took ~41 seconds
15 |
16 |
17 |
18 | start_time = time.time()
19 |
20 | images_train = my_data[1:,1:]
21 | images_train = preprocessing.scale(images_train)
22 | targets_train = my_data[1:,0]
23 |
24 | classifier = svm.SVC(kernel = 'poly', C = 100, gamma = 0.001, degree = 3)
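# (Comment added: these polynomial-kernel settings fall inside the grid
# explored in digit_recog_grid_search.py.)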
25 |
26 | # We learn the digits
27 | classifier.fit(images_train, targets_train)
28 |
29 | print time.time() - start_time, "seconds"
30 |
31 |
32 |
33 | my_test_data = genfromtxt('test.csv', delimiter=',')
34 | test = my_test_data[1:,]
35 | test = preprocessing.scale(test)
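# NOTE (added sketch, not in the original script): scaling train and test
# independently, as above, lets test-set statistics shape the transform.
# Fitting one scaler on the training images and reusing it would avoid that:
# scaler = preprocessing.StandardScaler().fit(my_data[1:, 1:])
# images_train = scaler.transform(my_data[1:, 1:])
# test = scaler.transform(my_test_data[1:, ])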
36 | predicted = classifier.predict(test)
37 |
38 | length = len(predicted)
39 |
40 |
41 | with open('pred_test.csv', 'wb') as csvfile:
42 | csv_writer = csv.writer(csvfile)
43 | csv_writer.writerow(['ImageId','Label'])
44 | for y in range(length):
45 | csv_writer.writerow([y+1,int(predicted[y])])
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/digit_recoginition/digit_recog_grid_search.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | ================================================================================
4 | Digit recognition: Support vector machine parameter estimation using grid search
5 | ================================================================================
6 |
7 | Here I implemented a cross-validation workflow. I used scikit-learn's
8 | `sklearn.grid_search.GridSearchCV` to train each classifier on half the
9 | labeled data and used the other half as the cross-validation set to test
10 | the performance of the classifier.
11 |
12 | The classifiers I tested were all support vector machines (SVMs): Gaussian,
13 | linear, and polynomial (degrees 2, 3, and 4) over a range of parameters.
14 |
15 | I tested these classifiers for precision, that is, the positive predictive
16 | value: the proportion of positive predictions that are correct.
17 | """
18 |
19 | from __future__ import print_function
20 |
21 | from sklearn import datasets
22 | from sklearn.cross_validation import train_test_split
23 | from sklearn.grid_search import GridSearchCV
24 | from sklearn.metrics import classification_report
25 | from sklearn.svm import SVC
26 | from sklearn import svm, metrics, preprocessing
27 | import csv
28 | import time
29 |
30 | print(__doc__)
31 |
32 | # Loading the Digits dataset
33 |
34 |
35 | ###
36 | from numpy import genfromtxt
37 |
38 |
39 | my_data = genfromtxt('train.csv', delimiter=',')
40 |
41 |
42 | x_train = my_data[1:,1:]
43 | x_train = preprocessing.scale(x_train)
44 | t_train = my_data[1:,0]
45 |
46 |
47 |
48 | start_time = time.time()
49 | # Split the dataset in two equal parts
50 | x_train, x_cv, t_train, t_cv = train_test_split(
51 | x_train, t_train, test_size=0.5, random_state=0)
52 |
53 | # Set the parameters by cross-validation
54 | tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
55 | 'C': [1, 10]},
56 |                     {'kernel': ['linear'], 'C': [1, 10]},
                    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [2]},
57 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [3]},
58 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly'], 'degree': [4]}]
59 |
60 |
61 | scores = ['precision'] # you can alter this by adding, for example, 'recall'
62 |
63 | for score in scores:
64 | print("# Tuning hyper-parameters for %s" % score)
65 | print()
66 |
67 |     clf = GridSearchCV(SVC(C=1), tuned_parameters, scoring=score)
68 | clf.fit(x_train, t_train)
69 |
70 | print("Best parameters set found on development set:")
71 | print()
72 | print(clf.best_estimator_)
73 | print()
74 | print("Grid scores on development set:")
75 | print()
76 |     for params, mean_score, cv_scores in clf.grid_scores_:
77 |         print("%0.3f (+/-%0.03f) for %s"
78 |               % (mean_score, cv_scores.std() / 2, params))
79 | print()
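    # (Added sketch, following the 50/50 split above: classification_report is
    # imported at the top but never used; evaluating the tuned model on the
    # held-out half would look like this.)
    # t_pred = clf.predict(x_cv)
    # print(classification_report(t_cv, t_pred))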
80 |
81 |
82 | print(time.time()- start_time)
83 |
84 |
94 |
--------------------------------------------------------------------------------
/homesite/Boris_gradient_boost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Homesite"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Using Theano backend.\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "import pandas as pd\n",
27 | "import numpy as np\n",
28 | "import copy\n",
29 | "import csv\n",
30 | "from sklearn import linear_model\n",
31 | "import xgboost as xgb\n",
32 | "from sklearn.ensemble import RandomForestClassifier\n",
33 | "from keras.models import Sequential\n",
34 | "from keras.layers.core import Dense, Dropout, Activation\n",
35 | "from keras.optimizers import SGD\n",
36 | "from sklearn import svm\n",
37 | "from sklearn.decomposition import PCA\n",
38 | "from sklearn.preprocessing import PolynomialFeatures\n"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [],
48 | "source": [
49 | "def ReplaceWithDummies(column, *DataFrames):\n",
50 | " #The purpose of this function is to replace a column of type 'object' with n distict values,\n",
51 | " #common to all DataFrames passed in\n",
52 | " #For example train and test data sets, with n-1 boolean columns as delete the original culmnn\n",
53 | " for df in DataFrames: #Make sure the column is actually in all data frames\n",
54 | " if column not in df.columns:\n",
55 | " print('column not found')\n",
56 | " return None\n",
57 | " size=[]\n",
58 | " for df in DataFrames:\n",
59 | " size.append(df.shape[0])\n",
60 | " \n",
61 | " long_column=[]\n",
62 | " for i in range(len(DataFrames)):\n",
63 | " long_column.append(DataFrames[i][column])\n",
64 | " long_column = pd.concat(long_column)\n",
65 | " dummies = pd.get_dummies(long_column)\n",
66 | " dummies.drop(list(dummies.columns)[0], axis=1, inplace=True) # dropping one column from dummies\n",
67 | " \n",
68 | " Dummies =[] # As list of dummies to append to the list of DataFrames in order \n",
69 | " for s in size:\n",
70 | " Dummies.append(dummies[:s])\n",
71 | " dummies=dummies[s:]\n",
72 | " \n",
73 | " #drop the column that needs replacing\n",
74 | " for df in DataFrames:\n",
75 | " df.drop(column, axis=1, inplace=True)\n",
76 | "\n",
77 | " \n",
78 | " #Now append the dummy variables:\n",
79 | "\n",
80 | " for i,df in enumerate(DataFrames):\n",
81 | " for column_type in Dummies[i]: \n",
82 | " new_name=str(column) +'_'+ str(column_type)\n",
83 | " df[new_name]=Dummies[i][column_type]\n",
84 | " return DataFrames\n",
85 | " \n",
86 | " "
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 23,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "tr=pd.read_csv('train.csv')\n",
98 | "te=pd.read_csv('test.csv')"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 24,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "#Run for local testing\n",
110 | "n= len(tr)\n",
111 | "n = int(n*(float(2)/float(3)))\n",
112 | "train = copy.deepcopy(tr[:n])\n",
113 | "m=int((len(tr)-n)/2)\n",
114 | "validation = copy.deepcopy(tr[n:n+m])\n",
115 | "test = copy.deepcopy(tr[n+m:])\n",
116 | "\n"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 36,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": [
127 | "#Use this for the real thing\n",
128 | "train = tr[:]\n",
129 | "validation = te[:]\n",
130 | "test=te[:]"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 37,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [
140 | {
141 | "name": "stderr",
142 | "output_type": "stream",
143 | "text": [
144 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: \n",
145 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
146 | "\n",
147 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
148 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n",
149 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
150 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
151 | "\n",
152 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
153 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: \n",
154 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
155 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
156 | "\n",
157 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
158 | "/Users/blerner/anaconda/lib/python3.4/site-packages/pandas/core/generic.py:2862: SettingWithCopyWarning: \n",
159 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
160 | "\n",
161 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
162 | " self._update_inplace(new_data)\n",
163 | "/Users/blerner/anaconda/lib/python3.4/site-packages/pandas/core/generic.py:3117: SettingWithCopyWarning: \n",
164 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
165 | "\n",
166 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
167 | " self._update_inplace(new_data)\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "#Converts date to an int. Seems to work better than previous attemopts of using categorical variables.\n",
173 | "\n",
174 | "ALL = [train, validation, test]\n",
175 | "for frame in ALL:\n",
176 | " frame.drop('QuoteNumber', axis=1, inplace=True)\n",
177 | " frame['Original_Quote_Date']= pd.to_datetime(frame['Original_Quote_Date'])\n",
178 | " frame['Original_Quote_Date'] = frame['Original_Quote_Date'].astype(int)\n",
179 | " for c in frame:\n",
180 | " frame[c].fillna(0, inplace=True)\n",
181 | " frame[c].replace(-1, 0, inplace=True)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 43,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "173836"
195 | ]
196 | },
197 | "execution_count": 43,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "len(validation)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 40,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stderr",
215 | "output_type": "stream",
216 | "text": [
217 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n",
218 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
219 | "\n",
220 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "#dropping useless columns\n",
226 | "for c in train.drop('QuoteConversion_Flag', axis=1):\n",
227 | " x=train[c].unique()\n",
228 | " if len(x) < 2:\n",
229 | " for frame in ALL:\n",
230 | " frame.drop(c, axis=1, inplace=True)\n",
231 | "#rescaling\n",
232 | "for c in train.drop('QuoteConversion_Flag', axis=1):\n",
233 | " if train[c].dtype != 'object':\n",
234 | " mean=train[c].mean()\n",
235 | " std = train[c].std()\n",
236 | " if std > 0.0001:\n",
237 | " for frame in ALL:\n",
238 | " frame = (frame[c]-mean)/std"
239 | ]
240 | },
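{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Equivalent sketch with sklearn (an alternative to the rescaling loop above, not to be\n",
"#run in addition to it): StandardScaler fits the mean/std on train and applies that same\n",
"#transform to every frame.\n",
"from sklearn.preprocessing import StandardScaler\n",
"num_cols = [c for c in train if c != 'QuoteConversion_Flag' and train[c].dtype != 'object']\n",
"scaler = StandardScaler().fit(train[num_cols])\n",
"for frame in ALL:\n",
"    frame[num_cols] = scaler.transform(frame[num_cols])"
]
},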
241 | {
242 | "cell_type": "code",
243 | "execution_count": 42,
244 | "metadata": {
245 | "collapsed": false
246 | },
247 | "outputs": [
248 | {
249 | "name": "stderr",
250 | "output_type": "stream",
251 | "text": [
252 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:27: SettingWithCopyWarning: \n",
253 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
254 | "\n",
255 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
256 | "/Users/blerner/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:35: SettingWithCopyWarning: \n",
257 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
258 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
259 | "\n",
260 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
261 | ]
262 | }
263 | ],
264 | "source": [
265 | "#Replacing all categorical variables with dummy variables\n",
266 | "for column in train:\n",
267 | " if train[column].dtype == 'object':\n",
268 | " [train, validation, test] = ReplaceWithDummies(column, train, validation, test)\n",
269 | " "
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": [
280 | "#Run this for partial\n",
281 | "\n",
282 | "X_train=train.drop('QuoteConversion_Flag', axis=1)\n",
283 | "X_test=test.drop('QuoteConversion_Flag', axis=1)\n",
284 | "X_validation=test.drop('QuoteConversion_Flag', axis=1)\n",
285 | "y_train=train['QuoteConversion_Flag']\n",
286 | "y_test=test['QuoteConversion_Flag']\n",
287 | "y_validation=validation['QuoteConversion_Flag']"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 44,
293 | "metadata": {
294 | "collapsed": true
295 | },
296 | "outputs": [],
297 | "source": [
298 | "#Run this for full\n",
299 | "X_train=train.drop('QuoteConversion_Flag', axis=1)\n",
300 | "y_train=train['QuoteConversion_Flag']\n",
301 | "X_validation= validation"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 49,
307 | "metadata": {
308 | "collapsed": false
309 | },
310 | "outputs": [
311 | {
312 | "data": {
313 | "text/plain": [
314 | "173836"
315 | ]
316 | },
317 | "execution_count": 49,
318 | "metadata": {},
319 | "output_type": "execute_result"
320 | }
321 | ],
322 | "source": [
323 | "len(X_validation)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "metadata": {
330 | "collapsed": true
331 | },
332 | "outputs": [],
333 | "source": [
334 | "#Support Vector machine\n",
335 | "model_svc = svm.SVC()\n",
336 | "model_svc.fit(X_train, y_train)\n",
337 | "print(sum(y_train))\n",
338 | "print(1-sum(abs(np.array(y_train)-np.array(model_svc.predict(X_train))))/float(len(y_train)))"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 48,
344 | "metadata": {
345 | "collapsed": false
346 | },
347 | "outputs": [
348 | {
349 | "data": {
350 | "text/plain": [
351 | "(260753, 601)"
352 | ]
353 | },
354 | "execution_count": 48,
355 | "metadata": {},
356 | "output_type": "execute_result"
357 | }
358 | ],
359 | "source": [
360 | "X_train.shape\n"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 19,
366 | "metadata": {
367 | "collapsed": false
368 | },
369 | "outputs": [],
370 | "source": [
371 | "#Poly\n",
372 | "pca = PCA(n_components=550) #Instantiate the model & set parameters\n",
373 | "pca.fit(X_train); #Fit the model\n",
374 | "X_train_red = pca.transform(X_train)\n",
375 | "X_validation_red = pca.transform(X_validation)"
376 | ]
377 | },
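{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Optional sanity check: how much of the variance do the retained components explain?\n",
"print(pca.explained_variance_ratio_.sum())"
]
},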
378 | {
379 | "cell_type": "code",
380 | "execution_count": 11,
381 | "metadata": {
382 | "collapsed": true
383 | },
384 | "outputs": [],
385 | "source": [
386 | "#introduce interaction terms\n",
387 | "poly = PolynomialFeatures()\n",
388 | "poly.fit(X_train_red)\n",
389 | "X_train_poly=poly.transform(X_train_red)\n",
390 | "X_validation_poly = poly.transform(X_validation_red)"
391 | ]
392 | },
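{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Sanity check: degree-2 PolynomialFeatures on d inputs yields (d+1)(d+2)/2 columns\n",
"#(1 bias + d linear + d(d+1)/2 quadratic terms). The (260753, 1326) shape below\n",
"#corresponds to d=50, which suggests that output came from a run with a 50-component PCA.\n",
"d = X_train_red.shape[1]\n",
"print((d + 1) * (d + 2) // 2)"
]
},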
393 | {
394 | "cell_type": "code",
395 | "execution_count": 12,
396 | "metadata": {
397 | "collapsed": false
398 | },
399 | "outputs": [
400 | {
401 | "data": {
402 | "text/plain": [
403 | "(260753, 1326)"
404 | ]
405 | },
406 | "execution_count": 12,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "X_train_poly.shape"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {
419 | "collapsed": false
420 | },
421 | "outputs": [],
422 | "source": [
423 | "X_train_red.shape"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": null,
429 | "metadata": {
430 | "collapsed": false
431 | },
432 | "outputs": [],
433 | "source": [
434 | "#Logistioc regression\n",
435 | "model = linear_model.LogisticRegression(C=0.1)\n",
436 | "model.fit(X_train, y_train)\n",
437 | "predictions=model.predict_proba(X_validation)[:,1]\n",
438 | "#print(sum(y_train))\n",
439 | "#print(1-sum(abs(np.array(y_train)-np.array(model.predict(X_train))))/float(len(y_train)))\n",
440 | "#p1=model.predict(validation)\n",
441 | "#print(sum(y_validation))\n",
442 | "#print(1-sum(abs(np.array(y_validation)-np.array(model.predict(X_validation))))/float(len(y_validation)))\n"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": 50,
448 | "metadata": {
449 | "collapsed": false
450 | },
451 | "outputs": [],
452 | "source": [
453 | "#Gradient boosting \n",
454 | "model_xgb = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n",
455 | "bst = xgb.train({'objective':'reg:logistic'},dtrain=model_xgb)\n",
456 | "predictions = bst.predict(xgb.DMatrix(X_validation))\n",
457 | "#predictions = bst.predict(xgb.DMatrix(X_train))\n",
458 | "#predictions_binary = []\n",
459 | "#for x in list(predictions):\n",
460 | "# if x>=0.5:\n",
461 | "# predictions_binary.append(1)\n",
462 | "# else:\n",
463 | "# predictions_binary.append(0)\n",
464 | "\n",
465 | "#print(sum(y_train))\n",
466 | "#print(1-sum(abs(np.array(y_train)-np.array(predictions_binary)))/float(len(y_train)))"
467 | ]
468 | },
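{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Optional local check for the partial split (assumes y_validation exists): the Homesite\n",
"#competition is scored on AUC (area under the ROC curve), so this is more informative\n",
"#than raw accuracy on these imbalanced labels.\n",
"from sklearn.metrics import roc_auc_score\n",
"print(roc_auc_score(y_validation, predictions))"
]
},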
469 | {
470 | "cell_type": "code",
471 | "execution_count": 51,
472 | "metadata": {
473 | "collapsed": false
474 | },
475 | "outputs": [
476 | {
477 | "data": {
478 | "text/plain": [
479 | "173836"
480 | ]
481 | },
482 | "execution_count": 51,
483 | "metadata": {},
484 | "output_type": "execute_result"
485 | }
486 | ],
487 | "source": [
488 | "len(predictions)"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": false
496 | },
497 | "outputs": [],
498 | "source": [
499 | "predictions = bst.predict(xgb.DMatrix(X_validation))\n",
500 | "#predictions_binary = []\n",
501 | "#for x in list(predictions):\n",
502 | "# if x>=0.5:\n",
503 | "# predictions_binary.append(1)\n",
504 | "# else:\n",
505 | "# predictions_binary.append(0)\n",
506 | "#print(sum(y_train))\n",
507 | "#print(1-sum(abs(np.array(y_validation)-np.array(predictions_binary)))/float(len(y_validation)))\n",
508 | "#p2=predictions_binary"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": null,
514 | "metadata": {
515 | "collapsed": false
516 | },
517 | "outputs": [],
518 | "source": [
519 | "len(p1)"
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "collapsed": false
527 | },
528 | "outputs": [],
529 | "source": [
530 | "#Let's try random forest\n",
531 | "\n",
532 | "rfc_model = RandomForestClassifier(n_estimators = 10, n_jobs=-1)\n",
533 | "rfc_model.fit(X_train,y_train)\n",
534 | "print(sum(y_train))\n",
535 | "print(1-sum(abs(np.array(y_train)-np.array(rfc_model.predict(X_train))))/float(len(y_train)))\n",
536 | "#predict2=rfc_model.predict(validation)\n",
537 | "#print(sum(y_validation))\n",
538 | "#print(1-sum(abs(np.array(y_validation)-np.array(rfc_model.predict(X_validation))))/float(len(y_validation)))\n",
539 | "p3=rfc_model.predict(X_validation)"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": null,
545 | "metadata": {
546 | "collapsed": false
547 | },
548 | "outputs": [],
549 | "source": [
550 | "#Not run for full\n",
551 | "ensemble_train =list(np.logical_or(np.array(p1),np.array(p2), np.array(p3)))\n",
552 | "ensemble_train = int(ensemble_train==1)\n",
553 | "print(1-sum(abs(np.array(y_train)-np.array(ensemble_train)))/float(len(y_train)))\n",
554 | "\n",
555 | "ensemble_validation =list(np.logical_or(np.array(model.predict(X_validation)),np.array(rfc_model.predict(X_validation))))\n",
556 | "ensemble_validation = int(ensemble_validation==1)\n",
557 | "print(1-sum(abs(np.array(y_validation)-np.array(ensemble_validation)))/float(len(y_validation)))"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": null,
563 | "metadata": {
564 | "collapsed": false
565 | },
566 | "outputs": [],
567 | "source": [
568 | "ensemble_predict =np.logical_or(np.array(p1),np.array(p2), np.array(p3)).astype(int)\n",
569 | "len(ensemble_predict)\n"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 53,
575 | "metadata": {
576 | "collapsed": false
577 | },
578 | "outputs": [],
579 | "source": [
580 | "#creates the ouput to be submitted\n",
581 | "output =pd.DataFrame()\n",
582 | "output[\"QuoteNumber\"] = te[\"QuoteNumber\"]\n",
583 | "output[\"QuoteConversion_Flag\"] = predictions\n",
584 | "output.to_csv(\"output_boost_newdate.csv\", index=False)\n",
585 | "\n",
586 | "\n"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": null,
592 | "metadata": {
593 | "collapsed": false
594 | },
595 | "outputs": [],
596 | "source": [
597 | "#attempt at neural network: not working right now\n",
598 | "\n",
599 | "model = Sequential()\n",
600 | "\n",
601 | "# Dense(64) is a fully-connected layer with 64 hidden units.\n",
602 | "# in the first layer, you must specify the expected input data shape:\n",
603 | "# here, 20-dimensional vectors.\n",
604 | "model.add(Dense(64, input_dim=584, init='uniform'))\n",
605 | "#model.add(Activation('tanh'))\n",
606 | "#model.add(Dropout(0.5))\n",
607 | "#model.add(Dense(64, init='uniform'))\n",
608 | "#model.add(Activation('tanh'))\n",
609 | "#model.add(Dropout(0.5))\n",
610 | "#model.add(Dense(2, init='uniform'))\n",
611 | "#model.add(Activation('softmax'))\n",
612 | "\n",
613 | "#sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)\n",
614 | "model.compile(loss='mean_squared_error', optimizer='sgd')\n",
615 | "\n",
616 | "model.fit(X_train1, y_train1)"
617 | ]
618 | },
664 | {
665 | "cell_type": "markdown",
666 | "metadata": {},
667 | "source": [
668 | "https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark/discussion"
669 | ]
670 | }
707 | ],
708 | "metadata": {
709 | "kernelspec": {
710 | "display_name": "Python 3",
711 | "language": "python",
712 | "name": "python3"
713 | },
714 | "language_info": {
715 | "codemirror_mode": {
716 | "name": "ipython",
717 | "version": 3
718 | },
719 | "file_extension": ".py",
720 | "mimetype": "text/x-python",
721 | "name": "python",
722 | "nbconvert_exporter": "python",
723 | "pygments_lexer": "ipython3",
724 | "version": "3.4.3"
725 | }
726 | },
727 | "nbformat": 4,
728 | "nbformat_minor": 0
729 | }
730 |
--------------------------------------------------------------------------------
/homesite/initial_foray_insurance_grad_boosting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# INITIAL FORAY INTO INSURANCE DATA"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "(260753, 299)\n"
22 | ]
23 | },
24 | {
25 | "data": {
26 | "text/html": [
27 | "\n",
28 | "
\n",
29 | " \n",
30 | " \n",
31 | " | \n",
32 | " QuoteNumber | \n",
33 | " Original_Quote_Date | \n",
34 | " QuoteConversion_Flag | \n",
35 | " Field6 | \n",
36 | " Field7 | \n",
37 | " Field8 | \n",
38 | " Field9 | \n",
39 | " Field10 | \n",
40 | " Field11 | \n",
41 | " Field12 | \n",
42 | " ... | \n",
43 | " GeographicField59A | \n",
44 | " GeographicField59B | \n",
45 | " GeographicField60A | \n",
46 | " GeographicField60B | \n",
47 | " GeographicField61A | \n",
48 | " GeographicField61B | \n",
49 | " GeographicField62A | \n",
50 | " GeographicField62B | \n",
51 | " GeographicField63 | \n",
52 | " GeographicField64 | \n",
53 | "
\n",
54 | " \n",
55 | " \n",
56 | " \n",
57 | " 0 | \n",
58 | " 1 | \n",
59 | " 2013-08-16 | \n",
60 | " 0 | \n",
61 | " B | \n",
62 | " 23 | \n",
63 | " 0.9403 | \n",
64 | " 0.0006 | \n",
65 | " 965 | \n",
66 | " 1.0200 | \n",
67 | " N | \n",
68 | " ... | \n",
69 | " 9 | \n",
70 | " 9 | \n",
71 | " -1 | \n",
72 | " 8 | \n",
73 | " -1 | \n",
74 | " 18 | \n",
75 | " -1 | \n",
76 | " 10 | \n",
77 | " N | \n",
78 | " CA | \n",
79 | "
\n",
80 | " \n",
81 | " 1 | \n",
82 | " 2 | \n",
83 | " 2014-04-22 | \n",
84 | " 0 | \n",
85 | " F | \n",
86 | " 7 | \n",
87 | " 1.0006 | \n",
88 | " 0.0040 | \n",
89 | " 548 | \n",
90 | " 1.2433 | \n",
91 | " N | \n",
92 | " ... | \n",
93 | " 10 | \n",
94 | " 10 | \n",
95 | " -1 | \n",
96 | " 11 | \n",
97 | " -1 | \n",
98 | " 17 | \n",
99 | " -1 | \n",
100 | " 20 | \n",
101 | " N | \n",
102 | " NJ | \n",
103 | "
\n",
104 | " \n",
105 | " 2 | \n",
106 | " 4 | \n",
107 | " 2014-08-25 | \n",
108 | " 0 | \n",
109 | " F | \n",
110 | " 7 | \n",
111 | " 1.0006 | \n",
112 | " 0.0040 | \n",
113 | " 548 | \n",
114 | " 1.2433 | \n",
115 | " N | \n",
116 | " ... | \n",
117 | " 15 | \n",
118 | " 18 | \n",
119 | " -1 | \n",
120 | " 21 | \n",
121 | " -1 | \n",
122 | " 11 | \n",
123 | " -1 | \n",
124 | " 8 | \n",
125 | " N | \n",
126 | " NJ | \n",
127 | "
\n",
128 | " \n",
129 | " 3 | \n",
130 | " 6 | \n",
131 | " 2013-04-15 | \n",
132 | " 0 | \n",
133 | " J | \n",
134 | " 10 | \n",
135 | " 0.9769 | \n",
136 | " 0.0004 | \n",
137 | " 1,165 | \n",
138 | " 1.2665 | \n",
139 | " N | \n",
140 | " ... | \n",
141 | " 6 | \n",
142 | " 5 | \n",
143 | " -1 | \n",
144 | " 10 | \n",
145 | " -1 | \n",
146 | " 9 | \n",
147 | " -1 | \n",
148 | " 21 | \n",
149 | " N | \n",
150 | " TX | \n",
151 | "
\n",
152 | " \n",
153 | " 4 | \n",
154 | " 8 | \n",
155 | " 2014-01-25 | \n",
156 | " 0 | \n",
157 | " E | \n",
158 | " 23 | \n",
159 | " 0.9472 | \n",
160 | " 0.0006 | \n",
161 | " 1,487 | \n",
162 | " 1.3045 | \n",
163 | " N | \n",
164 | " ... | \n",
165 | " 18 | \n",
166 | " 22 | \n",
167 | " -1 | \n",
168 | " 10 | \n",
169 | " -1 | \n",
170 | " 11 | \n",
171 | " -1 | \n",
172 | " 12 | \n",
173 | " N | \n",
174 | " IL | \n",
175 | "
\n",
176 | " \n",
177 | "
\n",
178 | "
5 rows × 299 columns
\n",
179 | "
"
180 | ],
181 | "text/plain": [
182 | " QuoteNumber Original_Quote_Date QuoteConversion_Flag Field6 Field7 \\\n",
183 | "0 1 2013-08-16 0 B 23 \n",
184 | "1 2 2014-04-22 0 F 7 \n",
185 | "2 4 2014-08-25 0 F 7 \n",
186 | "3 6 2013-04-15 0 J 10 \n",
187 | "4 8 2014-01-25 0 E 23 \n",
188 | "\n",
189 | " Field8 Field9 Field10 Field11 Field12 ... \\\n",
190 | "0 0.9403 0.0006 965 1.0200 N ... \n",
191 | "1 1.0006 0.0040 548 1.2433 N ... \n",
192 | "2 1.0006 0.0040 548 1.2433 N ... \n",
193 | "3 0.9769 0.0004 1,165 1.2665 N ... \n",
194 | "4 0.9472 0.0006 1,487 1.3045 N ... \n",
195 | "\n",
196 | " GeographicField59A GeographicField59B GeographicField60A \\\n",
197 | "0 9 9 -1 \n",
198 | "1 10 10 -1 \n",
199 | "2 15 18 -1 \n",
200 | "3 6 5 -1 \n",
201 | "4 18 22 -1 \n",
202 | "\n",
203 | " GeographicField60B GeographicField61A GeographicField61B \\\n",
204 | "0 8 -1 18 \n",
205 | "1 11 -1 17 \n",
206 | "2 21 -1 11 \n",
207 | "3 10 -1 9 \n",
208 | "4 10 -1 11 \n",
209 | "\n",
210 | " GeographicField62A GeographicField62B GeographicField63 \\\n",
211 | "0 -1 10 N \n",
212 | "1 -1 20 N \n",
213 | "2 -1 8 N \n",
214 | "3 -1 21 N \n",
215 | "4 -1 12 N \n",
216 | "\n",
217 | " GeographicField64 \n",
218 | "0 CA \n",
219 | "1 NJ \n",
220 | "2 NJ \n",
221 | "3 TX \n",
222 | "4 IL \n",
223 | "\n",
224 | "[5 rows x 299 columns]"
225 | ]
226 | },
227 | "execution_count": 1,
228 | "metadata": {},
229 | "output_type": "execute_result"
230 | }
231 | ],
232 | "source": [
233 | "#data from this kaggle comp.: https://www.kaggle.com/c/homesite-quote-conversion\n",
234 | "#I NEED TO ADD MORE COMMENTS, I KNOW!\n",
235 | "import numpy as np\n",
236 | "import pandas as pd\n",
237 | "import matplotlib.pyplot as plt\n",
238 | "%matplotlib inline\n",
239 | "pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier\n",
240 | "##check out tutorial here:\n",
241 | "##http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%201%20-%20Reading%20from%20a%20CSV.ipynb\n",
242 | "df_train = pd.read_csv('train.csv')\n",
243 | "print np.shape(df_train)\n",
244 | "df_train.head()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": null,
250 | "metadata": {
251 | "collapsed": false
252 | },
253 | "outputs": [],
254 | "source": [
255 | "##CHOOSE A SUBSET TO WORK WITH INITIALLY\n",
256 | "# df_train = df_train[0:10000]\n",
257 | "df_train.head()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 2,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/html": [
270 | "\n",
271 | "
\n",
272 | " \n",
273 | " \n",
274 | " | \n",
275 | " QuoteNumber | \n",
276 | " Original_Quote_Date | \n",
277 | " Field6 | \n",
278 | " Field7 | \n",
279 | " Field8 | \n",
280 | " Field9 | \n",
281 | " Field10 | \n",
282 | " Field11 | \n",
283 | " Field12 | \n",
284 | " CoverageField1A | \n",
285 | " ... | \n",
286 | " GeographicField59A | \n",
287 | " GeographicField59B | \n",
288 | " GeographicField60A | \n",
289 | " GeographicField60B | \n",
290 | " GeographicField61A | \n",
291 | " GeographicField61B | \n",
292 | " GeographicField62A | \n",
293 | " GeographicField62B | \n",
294 | " GeographicField63 | \n",
295 | " GeographicField64 | \n",
296 | "
\n",
297 | " \n",
298 | " \n",
299 | " \n",
300 | " 0 | \n",
301 | " 3 | \n",
302 | " 2014-08-12 | \n",
303 | " E | \n",
304 | " 16 | \n",
305 | " 0.9364 | \n",
306 | " 0.0006 | \n",
307 | " 1,487 | \n",
308 | " 1.3045 | \n",
309 | " N | \n",
310 | " 4 | \n",
311 | " ... | \n",
312 | " 1 | \n",
313 | " 1 | \n",
314 | " -1 | \n",
315 | " 1 | \n",
316 | " -1 | \n",
317 | " 20 | \n",
318 | " -1 | \n",
319 | " 25 | \n",
320 | " Y | \n",
321 | " IL | \n",
322 | "
\n",
323 | " \n",
324 | " 1 | \n",
325 | " 5 | \n",
326 | " 2013-09-07 | \n",
327 | " F | \n",
328 | " 11 | \n",
329 | " 0.9919 | \n",
330 | " 0.0038 | \n",
331 | " 564 | \n",
332 | " 1.1886 | \n",
333 | " N | \n",
334 | " 8 | \n",
335 | " ... | \n",
336 | " 10 | \n",
337 | " 10 | \n",
338 | " -1 | \n",
339 | " 5 | \n",
340 | " -1 | \n",
341 | " 5 | \n",
342 | " -1 | \n",
343 | " 21 | \n",
344 | " N | \n",
345 | " NJ | \n",
346 | "
\n",
347 | " \n",
348 | " 2 | \n",
349 | " 7 | \n",
350 | " 2013-03-29 | \n",
351 | " F | \n",
352 | " 15 | \n",
353 | " 0.8945 | \n",
354 | " 0.0038 | \n",
355 | " 564 | \n",
356 | " 1.0670 | \n",
357 | " N | \n",
358 | " 11 | \n",
359 | " ... | \n",
360 | " 10 | \n",
361 | " 11 | \n",
362 | " -1 | \n",
363 | " 20 | \n",
364 | " -1 | \n",
365 | " 22 | \n",
366 | " -1 | \n",
367 | " 11 | \n",
368 | " N | \n",
369 | " NJ | \n",
370 | "
\n",
371 | " \n",
372 | " 3 | \n",
373 | " 9 | \n",
374 | " 2015-03-21 | \n",
375 | " K | \n",
376 | " 21 | \n",
377 | " 0.8870 | \n",
378 | " 0.0004 | \n",
379 | " 1,113 | \n",
380 | " 1.2665 | \n",
381 | " Y | \n",
382 | " 14 | \n",
383 | " ... | \n",
384 | " 8 | \n",
385 | " 8 | \n",
386 | " -1 | \n",
387 | " 13 | \n",
388 | " -1 | \n",
389 | " 8 | \n",
390 | " -1 | \n",
391 | " 21 | \n",
392 | " N | \n",
393 | " TX | \n",
394 | "
\n",
395 | " \n",
396 | " 4 | \n",
397 | " 10 | \n",
398 | " 2014-12-10 | \n",
399 | " B | \n",
400 | " 25 | \n",
401 | " 0.9153 | \n",
402 | " 0.0007 | \n",
403 | " 935 | \n",
404 | " 1.0200 | \n",
405 | " N | \n",
406 | " 4 | \n",
407 | " ... | \n",
408 | " 7 | \n",
409 | " 7 | \n",
410 | " -1 | \n",
411 | " 3 | \n",
412 | " -1 | \n",
413 | " 22 | \n",
414 | " -1 | \n",
415 | " 21 | \n",
416 | " N | \n",
417 | " CA | \n",
418 | "
\n",
419 | " \n",
420 | "
\n",
421 | "
5 rows × 298 columns
\n",
422 | "
"
423 | ],
424 | "text/plain": [
425 | " QuoteNumber Original_Quote_Date Field6 Field7 Field8 Field9 Field10 \\\n",
426 | "0 3 2014-08-12 E 16 0.9364 0.0006 1,487 \n",
427 | "1 5 2013-09-07 F 11 0.9919 0.0038 564 \n",
428 | "2 7 2013-03-29 F 15 0.8945 0.0038 564 \n",
429 | "3 9 2015-03-21 K 21 0.8870 0.0004 1,113 \n",
430 | "4 10 2014-12-10 B 25 0.9153 0.0007 935 \n",
431 | "\n",
432 | " Field11 Field12 CoverageField1A ... GeographicField59A \\\n",
433 | "0 1.3045 N 4 ... 1 \n",
434 | "1 1.1886 N 8 ... 10 \n",
435 | "2 1.0670 N 11 ... 10 \n",
436 | "3 1.2665 Y 14 ... 8 \n",
437 | "4 1.0200 N 4 ... 7 \n",
438 | "\n",
439 | " GeographicField59B GeographicField60A GeographicField60B \\\n",
440 | "0 1 -1 1 \n",
441 | "1 10 -1 5 \n",
442 | "2 11 -1 20 \n",
443 | "3 8 -1 13 \n",
444 | "4 7 -1 3 \n",
445 | "\n",
446 | " GeographicField61A GeographicField61B GeographicField62A \\\n",
447 | "0 -1 20 -1 \n",
448 | "1 -1 5 -1 \n",
449 | "2 -1 22 -1 \n",
450 | "3 -1 8 -1 \n",
451 | "4 -1 22 -1 \n",
452 | "\n",
453 | " GeographicField62B GeographicField63 GeographicField64 \n",
454 | "0 25 Y IL \n",
455 | "1 21 N NJ \n",
456 | "2 11 N NJ \n",
457 | "3 21 N TX \n",
458 | "4 21 N CA \n",
459 | "\n",
460 | "[5 rows x 298 columns]"
461 | ]
462 | },
463 | "execution_count": 2,
464 | "metadata": {},
465 | "output_type": "execute_result"
466 | }
467 | ],
468 | "source": [
469 | "X_test = pd.read_csv('test.csv')\n",
470 | "# X_test = X_test[0:5000]\n",
471 | "X_test.head()"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "WE FIRST DEAL WITH MISSING VALUES
"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 3,
484 | "metadata": {
485 | "collapsed": false
486 | },
487 | "outputs": [
488 | {
489 | "data": {
490 | "text/plain": [
491 | "(434589, 1489)"
492 | ]
493 | },
494 | "execution_count": 3,
495 | "metadata": {},
496 | "output_type": "execute_result"
497 | }
498 | ],
499 | "source": [
500 | "from sklearn.decomposition import PCA #import principal component analysis\n",
501 | "from sklearn.preprocessing import Imputer\n",
502 | "from sklearn.preprocessing import scale\n",
503 | "df_train_nt = df_train.drop('QuoteConversion_Flag', 1)\n",
504 | "from sklearn.decomposition import PCA #import principal component analysis\n",
505 | "from sklearn.preprocessing import Imputer\n",
506 | "from sklearn.preprocessing import scale\n",
507 | "# df_train_nt = df_train.drop('QuoteConversion_Flag', 1)\n",
508 | "frames = [df_train_nt , X_test]\n",
509 | "X = pd.concat( frames )\n",
510 | "X_hot = pd.get_dummies( X )\n",
511 | "imp = Imputer(missing_values='NaN', strategy='mean', axis=0)\n",
512 | "imp.fit( X_hot )\n",
513 | "X_hot_imp = imp.transform( X_hot )\n",
514 | "df_all = scale(X_hot_imp) #scaled data\n",
515 | "np.shape(df_all)"
516 | ]
517 | },
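{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Caveat sketch (optional): fitting the imputer/scaler on train+test together lets test\n",
"#statistics leak into the preprocessing. A leak-free variant fits on the training rows\n",
"#only and then transforms everything:\n",
"from sklearn.preprocessing import StandardScaler\n",
"n_tr = len(df_train_nt)\n",
"imp_tr = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(X_hot[:n_tr])\n",
"X_imp = imp_tr.transform(X_hot)\n",
"scaler = StandardScaler().fit(X_imp[:n_tr])\n",
"df_all_noleak = scaler.transform(X_imp)"
]
},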
518 | {
519 | "cell_type": "code",
520 | "execution_count": 31,
521 | "metadata": {
522 | "collapsed": false
523 | },
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "(260753,)\n"
530 | ]
531 | }
532 | ],
533 | "source": [
534 | "# # X_train = df_all[0:10000]\n",
535 | "# X_train = df_all[0:260753]\n",
536 | "# # X_test = df_all[10000:]\n",
537 | "X_test = df_all[260753:]\n",
538 | "# print np.shape(X_train)\n",
539 | "# print np.shape(X_test)\n",
540 | "# # y_train = df_train['QuoteConversion_Flag'][0:10000]\n",
541 | "# y_train = df_train['QuoteConversion_Flag'][0:260753]\n",
542 | "X_train = df_all[0:260753]\n",
543 | "y_train = df_train['QuoteConversion_Flag'][0:260753]\n",
544 | "print np.shape(y_train)"
545 | ]
546 | },
547 | {
548 | "cell_type": "markdown",
549 | "metadata": {
550 | "collapsed": true
551 | },
552 | "source": [
553 | "# LET'S TRY SOME GRADIENT BOOSTING!"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": null,
559 | "metadata": {
560 | "collapsed": false
561 | },
562 | "outputs": [],
563 | "source": [
564 | "# import xgboost as xgb\n",
565 | "\n",
566 | "# #also see here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md\n",
567 | "# # hacking this: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py\n",
568 | "# dtrain = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n",
569 | "# param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'reg:logistic'}\n",
570 | "# num_round = 2\n",
571 | "# print ('running cross validation')\n",
572 | "# bst_cv = xgb.cv(param, dtrain, num_round, nfold=5,\n",
573 | "# metrics={'error'}, seed = 0)\n",
574 | "\n",
575 | "# print 'done'\n",
576 | "\n",
577 | "#you actually wasnt to use GridSearchCV, I think:\n",
578 | "#https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": null,
584 | "metadata": {
585 | "collapsed": false
586 | },
587 | "outputs": [],
588 | "source": [
589 | "# type(bst_cv)\n",
590 | "# print bst_cv\n",
591 | "# # predictions = bst_cv.predict(xgb.DMatrix(X_test))"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 38,
597 | "metadata": {
598 | "collapsed": false
599 | },
600 | "outputs": [],
601 | "source": [
602 | "#Gradient boosting a la BL\n",
603 | "import xgboost as xgb\n",
604 | "model_xgb = xgb.DMatrix(np.array(X_train), label=np.array(y_train))\n",
605 | "bst = xgb.train({'max_depth':10 , 'n_estimators': 50 , 'objective':'reg:logistic'},dtrain=model_xgb)\n",
606 | "predictions = bst.predict(xgb.DMatrix(X_test))"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 33,
612 | "metadata": {
613 | "collapsed": false
614 | },
615 | "outputs": [
616 | {
617 | "name": "stdout",
618 | "output_type": "stream",
619 | "text": [
620 | "(173836,)\n",
621 | "[ 0.04766377 0.1459558 0.12043708 ..., 0.45562789 0.04766377\n",
622 | " 0.2594822 ]\n"
623 | ]
624 | }
625 | ],
626 | "source": [
627 | "print np.shape(predictions)\n",
628 | "print predictions"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 39,
634 | "metadata": {
635 | "collapsed": false
636 | },
637 | "outputs": [],
638 | "source": [
639 | "X_test_q = pd.read_csv('test.csv')\n",
640 | "output =pd.DataFrame()\n",
641 | "output[\"QuoteNumber\"] = X_test_q[\"QuoteNumber\"]\n",
642 | "output[\"QuoteConversion_Flag\"] = predictions\n",
643 | "output.to_csv(\"pred_test_6.csv\", index=False)"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 20,
649 | "metadata": {
650 | "collapsed": false
651 | },
652 | "outputs": [
653 | {
654 | "name": "stderr",
655 | "output_type": "stream",
656 | "text": [
657 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
658 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n",
659 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/grid_search.py:43: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
660 | " DeprecationWarning)\n"
661 | ]
662 | }
663 | ],
664 | "source": [
665 | "from sklearn.cross_validation import KFold, train_test_split\n",
666 | "from sklearn.metrics import confusion_matrix, mean_squared_error\n",
667 | "from sklearn.grid_search import GridSearchCV"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 27,
673 | "metadata": {
674 | "collapsed": false
675 | },
676 | "outputs": [
677 | {
678 | "name": "stdout",
679 | "output_type": "stream",
680 | "text": [
681 | "0.100004890254\n",
682 | "0.0794442980374\n",
683 | "Parameter optimization\n"
684 | ]
685 | }
686 | ],
687 | "source": [
688 | "#taking lead from here: https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py\n",
689 | "rng = np.random.RandomState(31337)\n",
690 | "kf = KFold(y_train.shape[0], n_folds=2, shuffle=True, random_state=rng)\n",
691 | "for train_index, test_index in kf:\n",
692 | " model_xgb = xgb.DMatrix(np.array(X_train[train_index]), label=np.array(y_train[train_index]))\n",
693 | " bst = xgb.train({'max_depth':6 , 'n_estimators': 150 , 'objective':'reg:logistic'},dtrain=model_xgb)\n",
694 | " predictions = bst.predict(xgb.DMatrix(X_train[test_index]))\n",
695 | " actuals = y_train[test_index]\n",
696 | " print(mean_squared_error(actuals, predictions))\n",
697 | "\n",
698 | "print(\"Parameter optimization\")"
699 | ]
700 | },
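{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#The competition is scored on AUC, so checking that alongside MSE may be more informative\n",
"#(uses actuals/predictions left over from the last fold of the loop above):\n",
"from sklearn.metrics import roc_auc_score\n",
"print roc_auc_score(actuals, predictions)"
]
},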
701 | {
702 | "cell_type": "code",
703 | "execution_count": 37,
704 | "metadata": {
705 | "collapsed": false
706 | },
707 | "outputs": [
708 | {
709 | "name": "stdout",
710 | "output_type": "stream",
711 | "text": [
712 | "Fitting 3 folds for each of 15 candidates, totalling 45 fits\n",
713 | "0.644338929872\n",
714 | "{'n_estimators': 50, 'max_depth': 10}\n"
715 | ]
716 | },
717 | {
718 | "name": "stderr",
719 | "output_type": "stream",
720 | "text": [
721 | "[Parallel(n_jobs=1)]: Done 45 out of 45 | elapsed: 897.3min finished\n"
722 | ]
723 | }
724 | ],
725 | "source": [
726 | "xgb_model = xgb.XGBRegressor()\n",
727 | "clf = GridSearchCV(xgb_model,\n",
728 | " {'max_depth': [2,4,6,8,10],\n",
729 | " 'n_estimators': [50,100,200]}, verbose=1)\n",
730 | "clf.fit(X_train,y_train)\n",
731 | "print(clf.best_score_)\n",
732 | "print(clf.best_params_)"
733 | ]
734 | }
744 | ],
745 | "metadata": {
746 | "kernelspec": {
747 | "display_name": "Python 2",
748 | "language": "python",
749 | "name": "python2"
750 | },
751 | "language_info": {
752 | "codemirror_mode": {
753 | "name": "ipython",
754 | "version": 2
755 | },
756 | "file_extension": ".py",
757 | "mimetype": "text/x-python",
758 | "name": "python",
759 | "nbconvert_exporter": "python",
760 | "pygments_lexer": "ipython2",
761 | "version": "2.7.10"
762 | }
763 | },
764 | "nbformat": 4,
765 | "nbformat_minor": 0
766 | }
767 |
--------------------------------------------------------------------------------
/paribas/README.md:
--------------------------------------------------------------------------------
1 | for this kaggle comp.: https://www.kaggle.com/c/bnp-paribas-cardif-claims-management
--------------------------------------------------------------------------------
/paribas/boosting_in_barbados.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.\n",
12 | "#Import libraries:\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "import xgboost as xgb\n",
16 | "import time\n",
17 | "#load data:\n",
18 | "train = pd.read_csv(\"train.csv\")\n",
19 | "target = train['target']\n",
20 | "#drop targets & (unique row) IDs from training data\n",
21 | "train = train.drop(['ID','target'],axis=1)\n",
22 | "test = pd.read_csv(\"test.csv\")\n",
23 | "IDs = test['ID'].values\n",
24 | "test = test.drop(['ID'],axis=1)"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "# PREPROCESSING"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "#impute both numerical & categorical features a la\n",
43 | "#http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn\n",
44 | "\n",
45 | "from sklearn.base import TransformerMixin\n",
46 | "\n",
47 | "class DataFrameImputer(TransformerMixin):\n",
48 | "\n",
49 | " def __init__(self):\n",
50 | " \"\"\"Impute missing values.\n",
51 | "\n",
52 | " Columns of dtype object are imputed with the most frequent value \n",
53 | " in column.\n",
54 | "\n",
55 | " Columns of other types are imputed with mean of column.\n",
56 | "\n",
57 | " \"\"\"\n",
58 | " def fit(self, X, y=None):\n",
59 | "\n",
60 | " self.fill = pd.Series([X[c].value_counts().index[0]\n",
61 | " if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],\n",
62 | " index=X.columns)\n",
63 | "\n",
64 | " return self\n",
65 | "\n",
66 | " def transform(self, X, y=None):\n",
67 | " return X.fillna(self.fill)\n",
68 | " \n",
69 | "xtrain = DataFrameImputer().fit_transform( train )\n",
70 | "xtest = DataFrameImputer().fit_transform( test )"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 6,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "#factorize categorical columns:\n",
82 | "for column in xtrain:\n",
83 | " if xtrain[column].dtype == 'O':\n",
84 | "# print pd.factorize(xtrain[column])\n",
85 | " xtrain[column] = pd.factorize(xtrain[column])[0]\n",
86 | " \n",
87 | "for column in xtest:\n",
88 | " if xtest[column].dtype == 'O':\n",
89 | "# print pd.factorize(xtrain[column])\n",
90 | " xtest[column] = pd.factorize(xtest[column])[0]"
91 | ]
92 | },
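{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Caveat: factorizing train and test separately can map the same category to different\n",
"#integers. A consistent sketch (to run instead of the cell above, not after it) encodes\n",
"#test with the categories learned from train; unseen test categories become -1:\n",
"for column in xtrain:\n",
"    if xtrain[column].dtype == 'O':\n",
"        codes, uniques = pd.factorize(xtrain[column])\n",
"        xtrain[column] = codes\n",
"        xtest[column] = uniques.get_indexer(xtest[column])"
]
},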
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "Next up: scaling/transforms/get_dummies/dimensionality reduction"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "# GRADIENT BOOSTING & CROSS VALIDATION"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 8,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [
114 | {
115 | "name": "stderr",
116 | "output_type": "stream",
117 | "text": [
118 | "/Users/hugobowne-anderson/repos/scikit-learn/sklearn/cross_validation.py:42: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
119 | " \"This module will be removed in 0.20.\", DeprecationWarning)\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "#check this out: http://xgboost.readthedocs.org/en/latest/model.html\n",
125 | "from sklearn.cross_validation import KFold, train_test_split\n",
126 | "X = xtrain.values\n",
127 | "y = target.values\n",
128 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1 , random_state=0)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 9,
134 | "metadata": {
135 | "collapsed": true
136 | },
137 | "outputs": [
138 | {
139 | "name": "stderr",
140 | "output_type": "stream",
141 | "text": [
142 | "Will train until validation_0 error hasn't decreased in 50 rounds.\n",
143 | "[0]\tvalidation_0-logloss:0.660512\n",
144 | "[1]\tvalidation_0-logloss:0.633833\n",
145 | "[2]\tvalidation_0-logloss:0.611452\n",
146 | "[3]\tvalidation_0-logloss:0.592876\n",
147 | "[4]\tvalidation_0-logloss:0.577408\n",
148 | "[5]\tvalidation_0-logloss:0.564382\n",
149 | "[6]\tvalidation_0-logloss:0.553405\n",
150 | "[7]\tvalidation_0-logloss:0.543860\n",
151 | "[8]\tvalidation_0-logloss:0.536023\n",
152 | "[9]\tvalidation_0-logloss:0.528992\n",
153 | "[10]\tvalidation_0-logloss:0.523194\n",
154 | "[11]\tvalidation_0-logloss:0.518142\n",
155 | "[12]\tvalidation_0-logloss:0.514029\n",
156 | "[13]\tvalidation_0-logloss:0.510511\n",
157 | "[14]\tvalidation_0-logloss:0.507166\n",
158 | "[15]\tvalidation_0-logloss:0.504451\n",
159 | "[16]\tvalidation_0-logloss:0.502286\n",
160 | "[17]\tvalidation_0-logloss:0.500022\n",
161 | "[18]\tvalidation_0-logloss:0.498165\n",
162 | "[19]\tvalidation_0-logloss:0.496556\n",
163 | "[20]\tvalidation_0-logloss:0.495312\n",
164 | "[21]\tvalidation_0-logloss:0.493927\n",
165 | "[22]\tvalidation_0-logloss:0.492970\n",
166 | "[23]\tvalidation_0-logloss:0.491965\n",
167 | "[24]\tvalidation_0-logloss:0.491137\n",
168 | "[25]\tvalidation_0-logloss:0.490338\n",
169 | "[26]\tvalidation_0-logloss:0.489665\n",
170 | "[27]\tvalidation_0-logloss:0.489089\n",
171 | "[28]\tvalidation_0-logloss:0.488625\n",
172 | "[29]\tvalidation_0-logloss:0.488127\n",
173 | "[30]\tvalidation_0-logloss:0.487602\n",
174 | "[31]\tvalidation_0-logloss:0.487334\n",
175 | "[32]\tvalidation_0-logloss:0.486997\n",
176 | "[33]\tvalidation_0-logloss:0.486487\n",
177 | "[34]\tvalidation_0-logloss:0.486237\n",
178 | "[35]\tvalidation_0-logloss:0.485890\n",
179 | "[36]\tvalidation_0-logloss:0.485579\n",
180 | "[37]\tvalidation_0-logloss:0.485430\n",
181 | "[38]\tvalidation_0-logloss:0.485202\n",
182 | "[39]\tvalidation_0-logloss:0.484802\n",
183 | "[40]\tvalidation_0-logloss:0.484583\n",
184 | "[41]\tvalidation_0-logloss:0.484348\n",
185 | "[42]\tvalidation_0-logloss:0.484242\n",
186 | "[43]\tvalidation_0-logloss:0.483940\n",
187 | "[44]\tvalidation_0-logloss:0.483843\n",
188 | "[45]\tvalidation_0-logloss:0.483686\n",
189 | "[46]\tvalidation_0-logloss:0.483566\n",
190 | "[47]\tvalidation_0-logloss:0.483306\n",
191 | "[48]\tvalidation_0-logloss:0.482803\n",
192 | "[49]\tvalidation_0-logloss:0.482678\n",
193 | "[50]\tvalidation_0-logloss:0.482645\n",
194 | "[51]\tvalidation_0-logloss:0.482599\n",
195 | "[52]\tvalidation_0-logloss:0.482483\n",
196 | "[53]\tvalidation_0-logloss:0.482222\n",
197 | "[54]\tvalidation_0-logloss:0.482145\n",
198 | "[55]\tvalidation_0-logloss:0.482033\n",
199 | "[56]\tvalidation_0-logloss:0.481891\n",
200 | "[57]\tvalidation_0-logloss:0.481790\n",
201 | "[58]\tvalidation_0-logloss:0.481366\n",
202 | "[59]\tvalidation_0-logloss:0.481314\n",
203 | "[60]\tvalidation_0-logloss:0.481271\n",
204 | "[61]\tvalidation_0-logloss:0.481124\n",
205 | "[62]\tvalidation_0-logloss:0.481000\n",
206 | "[63]\tvalidation_0-logloss:0.480625\n",
207 | "[64]\tvalidation_0-logloss:0.480605\n",
208 | "[65]\tvalidation_0-logloss:0.480526\n",
209 | "[66]\tvalidation_0-logloss:0.480480\n",
210 | "[67]\tvalidation_0-logloss:0.480338\n",
211 | "[68]\tvalidation_0-logloss:0.480286\n",
212 | "[69]\tvalidation_0-logloss:0.480209\n",
213 | "[70]\tvalidation_0-logloss:0.480185\n",
214 | "[71]\tvalidation_0-logloss:0.480126\n",
215 | "[72]\tvalidation_0-logloss:0.480102\n",
216 | "[73]\tvalidation_0-logloss:0.479896\n",
217 | "[74]\tvalidation_0-logloss:0.479858\n",
218 | "[75]\tvalidation_0-logloss:0.479808\n",
219 | "[76]\tvalidation_0-logloss:0.479728\n",
220 | "[77]\tvalidation_0-logloss:0.479648\n",
221 | "[78]\tvalidation_0-logloss:0.479611\n",
222 | "[79]\tvalidation_0-logloss:0.479599\n",
223 | "[80]\tvalidation_0-logloss:0.479577\n",
224 | "[81]\tvalidation_0-logloss:0.479537\n",
225 | "[82]\tvalidation_0-logloss:0.479479\n",
226 | "[83]\tvalidation_0-logloss:0.479466\n",
227 | "[84]\tvalidation_0-logloss:0.479452\n",
228 | "[85]\tvalidation_0-logloss:0.479426\n",
229 | "[86]\tvalidation_0-logloss:0.479434\n",
230 | "[87]\tvalidation_0-logloss:0.479411\n",
231 | "[88]\tvalidation_0-logloss:0.479377\n",
232 | "[89]\tvalidation_0-logloss:0.479222\n",
233 | "[90]\tvalidation_0-logloss:0.479141\n",
234 | "[91]\tvalidation_0-logloss:0.479086\n",
235 | "[92]\tvalidation_0-logloss:0.479091\n",
236 | "[93]\tvalidation_0-logloss:0.479068\n",
237 | "[94]\tvalidation_0-logloss:0.479094\n",
238 | "[95]\tvalidation_0-logloss:0.479089\n",
239 | "[96]\tvalidation_0-logloss:0.479038\n",
240 | "[97]\tvalidation_0-logloss:0.479036\n",
241 | "[98]\tvalidation_0-logloss:0.478962\n",
242 | "[99]\tvalidation_0-logloss:0.478884\n"
243 | ]
244 | },
245 | {
246 | "data": {
247 | "text/plain": [
248 | "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n",
249 | " gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
250 | " min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n",
251 | " objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n",
252 | " scale_pos_weight=1, seed=0, silent=True, subsample=1)"
253 | ]
254 | },
255 | "execution_count": 9,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "# Early-stopping\n",
262 | "#http://xgboost.readthedocs.org/en/latest/python/python_intro.html#early-stopping\n",
263 | "#Also see https://github.com/dmlc/xgboost/blob/master/demo/guide-python/sklearn_examples.py (Jamie Hall et al.)\n",
264 | "clf = xgb.XGBClassifier()\n",
265 | "clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric=\"logloss\",\n",
266 | " eval_set=[(X_test, y_test)])"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 14,
272 | "metadata": {
273 | "collapsed": false
274 | },
275 | "outputs": [],
276 | "source": [
277 | "preds = clf.predict_proba(xtest.values, ntree_limit=clf.best_iteration)[:,1]"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 16,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [],
287 | "source": [
288 | "import csv\n",
289 | "predictions_file = open(\"xgboost_predictions.csv\", \"w\")\n",
290 | "open_file_object = csv.writer(predictions_file)\n",
291 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n",
292 | "open_file_object.writerows(zip(IDs, preds))\n",
293 | "predictions_file.close()"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {
299 | "collapsed": true
300 | },
301 | "source": [
302 | "This above performed okay: logloss = -0.5252. But I think we need to increase num_rounds and at least try to change preprocessing:"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "# TESTING ANOTHER APPROACH"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "Loading & preprocessing:"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 53,
322 | "metadata": {
323 | "collapsed": false
324 | },
325 | "outputs": [
326 | {
327 | "name": "stdout",
328 | "output_type": "stream",
329 | "text": [
330 | "Load data...\n",
331 | "Clearing...\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n",
337 | "print('Load data...')\n",
338 | "train = pd.read_csv(\"train.csv\")\n",
339 | "target = train['target']\n",
340 | "train = train.drop(['ID','target'],axis=1)\n",
341 | "test = pd.read_csv(\"test.csv\")\n",
342 | "ids = test['ID'].values\n",
343 | "test = test.drop(['ID'],axis=1)\n",
344 | "#\n",
345 | "print('Clearing...')\n",
346 | "for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):\n",
347 | " if train_series.dtype == 'O':\n",
348 | " #for objects: factorize\n",
349 | " train[train_name], tmp_indexer = pd.factorize(train[train_name])\n",
350 | " test[test_name] = tmp_indexer.get_indexer(test[test_name])\n",
351 | " #but now we have -1 values (NaN)\n",
352 | " else:\n",
353 | " #for int or float: fill NaN\n",
354 | " tmp_len = len(train[train_series.isnull()])\n",
355 | " if tmp_len>0:\n",
356 | " train.loc[train_series.isnull(), train_name] = train_series.mean()\n",
357 | " #and Test\n",
358 | " tmp_len = len(test[test_series.isnull()])\n",
359 | " if tmp_len>0:\n",
360 | " test.loc[test_series.isnull(), test_name] = train_series.mean() #TODO"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "A little function to report best scores (from cross validation):"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 60,
373 | "metadata": {
374 | "collapsed": true
375 | },
376 | "outputs": [],
377 | "source": [
378 | "from operator import itemgetter\n",
379 | "# Utility function to report best scores\n",
380 | "def report(grid_scores, n_top=3):\n",
381 | " top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]\n",
382 | " for i, score in enumerate(top_scores):\n",
383 | " print(\"Model with rank: {0}\".format(i + 1))\n",
384 | " print(\"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n",
385 | " score.mean_validation_score,\n",
386 | " np.std(score.cv_validation_scores)))\n",
387 | " print(\"Parameters: {0}\".format(score.parameters))\n",
388 | " print(\"\")"
389 | ]
390 | },
391 | {
392 | "cell_type": "markdown",
393 | "metadata": {},
394 | "source": [
395 | "Now we perform a randomizedsearchCV over a number of parameters (using xgb.XGBClassifier()) --\n",
396 | "I do this because I don't know how to do it with xgb.train() --\n",
397 | "Important question: what is the relation between these two xgb.train() & xgb.XGBClassifier()?\n",
398 | "This is important because I can only do hyperparameter tuning on the latter AND I can only alter num_rounds on the former (which is necessary for a good model, it seems). Any thoughts?"
399 | ]
400 | },
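{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#My current understanding, as a sketch (worth verifying against the xgboost docs):\n",
"#xgb.XGBClassifier is the scikit-learn wrapper around the same booster that xgb.train\n",
"#builds, and its n_estimators plays the role of xgb.train's num_boost_round. So the\n",
"#wrapper can both be tuned with sklearn search tools AND be given many rounds.\n",
"params = {'max_depth': 6, 'learning_rate': 0.1, 'objective': 'binary:logistic'}\n",
"n_rounds = 150\n",
"\n",
"#native API: rounds are a separate argument\n",
"bst_native = xgb.train(params, xgb.DMatrix(train.values, target.values), num_boost_round=n_rounds)\n",
"\n",
"#sklearn wrapper: rounds are the n_estimators hyperparameter\n",
"clf_wrapper = xgb.XGBClassifier(n_estimators=n_rounds, max_depth=6, learning_rate=0.1)\n",
"clf_wrapper.fit(train.values, target.values)"
]
},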
401 | {
402 | "cell_type": "code",
403 | "execution_count": 65,
404 | "metadata": {
405 | "collapsed": false
406 | },
407 | "outputs": [
408 | {
409 | "name": "stdout",
410 | "output_type": "stream",
411 | "text": [
412 | "5031.24653196\n",
413 | "Model with rank: 1\n",
414 | "Mean validation score: -0.515 (std: 0.000)\n",
415 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.80000000000000004, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 11}\n",
416 | "\n",
417 | "Model with rank: 2\n",
418 | "Mean validation score: -0.516 (std: 0.000)\n",
419 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.40000000000000002, 'learning_rate': 0.01, 'colsample_bytree': 1.0, 'max_depth': 12}\n",
420 | "\n",
421 | "Model with rank: 3\n",
422 | "Mean validation score: -0.516 (std: 0.000)\n",
423 | "Parameters: {'objective': 'binary:logistic', 'subsample': 0.5, 'learning_rate': 0.01, 'colsample_bytree': 0.90000000000000002, 'max_depth': 13}\n",
424 | "\n",
425 | "None\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "# X = train.values\n",
431 | "# y = target.values\n",
432 | "#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/16627/help-with-xgboost-sklearn-randomized-grid-search\n",
433 | "# -*- coding: utf-8 -*-\n",
434 | "\"\"\"\n",
435 | "\"\"\"\n",
436 | "t0 = time.time()\n",
437 | "#http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html\n",
438 | "#http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.RandomizedSearchCV.html\n",
439 | "param_grid = {'max_depth': range(4,15),\n",
440 | "# 'min_child_weight': [1,40],\n",
441 | " 'objective':['binary:logistic'],\n",
442 | "# 'n_estimators':[5],\n",
443 | " 'learning_rate':[0.01], #this is same as eta\n",
444 | " 'subsample': np.arange(0.1,1.1,0.1),\n",
445 | " 'colsample_bytree': np.arange(0.1,1.1,0.1),\n",
446 | " #'scale_pos_weight': [0.5, 1]\n",
447 | " #'model__eta':[0.01,0.02],\n",
448 | " #'model__scale_pos_weight':[0.8,1.0]\n",
449 | " #'model__silent':[1],\n",
450 | " }\n",
451 | "\n",
452 | "\n",
453 | "from sklearn.grid_search import GridSearchCV, RandomizedSearchCV\n",
454 | "from sklearn import metrics\n",
455 | "\n",
456 | "xgb_model = xgb.XGBClassifier()\n",
457 | "n_iter_search=20\n",
458 | "random_search = RandomizedSearchCV(xgb_model, param_distributions=param_grid,\n",
459 | " n_iter=n_iter_search, scoring =\"log_loss\")\n",
460 | "\n",
461 | "# start = time()\n",
462 | "# training and y_training are \n",
463 | "# small dataset and target variable that I generated from the training dataset\n",
464 | "random_search.fit(train, target) \n",
465 | "t1 = time.time()\n",
466 | "total_time = t1 - t0\n",
467 | "print total_time\n",
468 | "\n",
469 | "print report(random_search.grid_scores_)\n",
470 | "xgb_model_best = xgb.XGBClassifier()\n",
471 | "xgb_model_best.set_params(**random_search.best_params_)\n",
472 | "#http://stackoverflow.com/questions/34674797/xgboost-xgbclassifier-defaults-in-python\n",
473 | "xgb_model_best.fit(X , y)\n",
474 | "preds = xgb_model_best.predict_proba(xtest.values)[:,1]\n",
475 | "#also see this! https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings/forums/t/18494/gridsearchcv-on-xgboost/105272"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 66,
481 | "metadata": {
482 | "collapsed": true
483 | },
484 | "outputs": [],
485 | "source": [
486 | "import csv\n",
487 | "predictions_file = open(\"xgb_rgs_larger_predictions.csv\", \"w\")\n",
488 | "open_file_object = csv.writer(predictions_file)\n",
489 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n",
490 | "open_file_object.writerows(zip(IDs, preds))\n",
491 | "predictions_file.close()"
492 | ]
493 | },
494 | {
495 | "cell_type": "markdown",
496 | "metadata": {},
497 | "source": [
498 | "This above performed ok (logloss = -.53791) but not as well as other people's xgbtrain() w/ a large num_rounds. For example, see:\n",
499 | "https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 67,
505 | "metadata": {
506 | "collapsed": true
507 | },
508 | "outputs": [],
509 | "source": [
510 | "#https://www.kaggle.com/mpearmain/homesite-quote-conversion/xgboost-benchmark\n",
511 | "#https://www.kaggle.com/c/springleaf-marketing-response/forums/t/17089/beating-the-benchmark/96855\n",
512 | "#https://github.com/lenguyenthedat/kaggle-for-fun/blob/master/springleaf-marketing-response/springleaf-xgb-native.py"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "So now I'll try using the best parameters for xgb.XGBClassifier() in xgb.train() AND make num_boost_round = 200."
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {
526 | "collapsed": true
527 | },
528 | "outputs": [],
529 | "source": [
530 | "#cf https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n",
531 | "t0 = time.time()\n",
532 | "xgtrain = xgb.DMatrix(train.values, target.values)\n",
533 | "xgtest = xgb.DMatrix(test.values)\n",
534 | "\n",
535 | "#Now let's fit the model\n",
536 | "print('Fit the model...')\n",
537 | "boost_round = 2000 #1800 CHANGE THIS BEFORE START\n",
538 | "clf = xgb.train(random_search.best_params_,xgtrain,num_boost_round=boost_round,verbose_eval=True,maximize=False)\n",
539 | "\n",
540 | "#Make predict\n",
541 | "print('Predict...')\n",
542 | "preds = clf.predict(xgtest, ntree_limit=clf.best_iteration )\n",
543 | "##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py\n",
544 | "t1 = time.time()\n",
545 | "total_time = t1 - t0\n",
546 | "print total_time"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": null,
552 | "metadata": {
553 | "collapsed": true
554 | },
555 | "outputs": [],
556 | "source": [
557 | "import csv\n",
558 | "predictions_file = open(\"xgb_rgs_more_rounds_predictions.csv\", \"w\")\n",
559 | "open_file_object = csv.writer(predictions_file)\n",
560 | "open_file_object.writerow([\"ID\", \"PredictedProb\"])\n",
561 | "open_file_object.writerows(zip(IDs, preds))\n",
562 | "predictions_file.close()"
563 | ]
564 | },
565 | {
566 | "cell_type": "markdown",
567 | "metadata": {},
568 | "source": [
569 | "This performed well: logloss = -0.45991 . "
570 | ]
571 | }
572 | ],
573 | "metadata": {
574 | "kernelspec": {
575 | "display_name": "Python 2",
576 | "language": "python",
577 | "name": "python2"
578 | },
579 | "language_info": {
580 | "codemirror_mode": {
581 | "name": "ipython",
582 | "version": 2
583 | },
584 | "file_extension": ".py",
585 | "mimetype": "text/x-python",
586 | "name": "python",
587 | "nbconvert_exporter": "python",
588 | "pygments_lexer": "ipython2",
589 | "version": "2.7.11"
590 | }
591 | },
592 | "nbformat": 4,
593 | "nbformat_minor": 0
594 | }
595 |
--------------------------------------------------------------------------------
/paribas/stratified_CV_with_xgboost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#I want to get to know gradient boosting methods (in particular, the xgboost library) and i am also currently in barbados.\n",
12 | "#Import libraries:\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "import xgboost as xgb\n",
16 | "import time"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Stratified CV w/ XGBoost"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "Loading & preprocessing:"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "Load data...\n",
45 | "Clearing...\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "#https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146/code\n",
51 | "print('Load data...')\n",
52 | "train = pd.read_csv(\"train.csv\")\n",
53 | "target = train['target']\n",
54 | "train = train.drop(['ID','target'],axis=1)\n",
55 | "test = pd.read_csv(\"test.csv\")\n",
56 | "ids = test['ID'].values\n",
57 | "test = test.drop(['ID'],axis=1)\n",
58 | "#\n",
59 | "print('Clearing...')\n",
60 | "for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()):\n",
61 | " if train_series.dtype == 'O':\n",
62 | " #for objects: factorize\n",
63 | " train[train_name], tmp_indexer = pd.factorize(train[train_name])\n",
64 | " test[test_name] = tmp_indexer.get_indexer(test[test_name])\n",
65 | " #but now we have -1 values (NaN)\n",
66 | " else:\n",
67 | " #for int or float: fill NaN\n",
68 | " tmp_len = len(train[train_series.isnull()])\n",
69 | " if tmp_len>0:\n",
70 | " train.loc[train_series.isnull(), train_name] = train_series.mean()\n",
71 | " #and Test\n",
72 | " tmp_len = len(test[test_series.isnull()])\n",
73 | " if tmp_len>0:\n",
74 | " test.loc[test_series.isnull(), test_name] = train_series.mean() #TODO"
75 | ]
76 | },
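77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "Aside: what the factorize/get_indexer pair above does, on a toy example -- factorize learns integer codes on train, get_indexer applies them to test, and unseen labels (and NaN) map to -1:"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {
88 | "collapsed": true
89 | },
90 | "outputs": [],
91 | "source": [
92 | "codes, index = pd.factorize(pd.Series(['a', 'b', 'a', None]))\n",
93 | "print(codes) #[ 0  1  0 -1]\n",
94 | "print(index.get_indexer(['b', 'a', 'c'])) #[ 1  0 -1]"
95 | ]
96 | },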
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "collapsed": false
82 | },
83 | "outputs": [],
84 | "source": [
85 | "#https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19083/best-practices-for-parameter-tuning-on-models/\n",
86 | "#https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 48,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "Fit the model...\n",
101 | "408.922273874\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "t0 = time.time()\n",
107 | "xgtrain = xgb.DMatrix(train.values, target.values)\n",
108 | "xgtest = xgb.DMatrix(test.values)\n",
109 | "\n",
110 | "params = {'objective': 'binary:logistic', \n",
111 | " 'subsample': 1, \n",
112 | " 'eta': 0.1, \n",
113 | " 'colsample_bytree': 0.9, \n",
114 | " 'max_depth': 10,\n",
115 | " 'min_child_weight' : 5,\n",
116 | " 'silent':1}\n",
117 | "\n",
118 | "#Now let's fit the model\n",
119 | "print('Fit the model...')\n",
120 | "num_round = 50 #1800 CHANGE THIS BEFORE START\n",
121 | "clf = xgb.cv(params,xgtrain,num_boost_round=num_round,metrics={'logloss'}, nfold = 5 ,\n",
122 | " seed = 0 ,maximize=False)\n",
123 | "\n",
124 | "#i have attempted this with argument stratified = 1 and get the following error:\n",
125 | "#TypeError: cv() got an unexpected keyword argument 'stratified'\n",
126 | "\n",
127 | "\n",
128 | "#Make predict\n",
129 | "# print('Predict...')\n",
130 | "##check here for eval metrics + https://github.com/dmlc/xgboost/blob/master/demo/guide-python/evals_result.py\n",
131 | "t1 = time.time()\n",
132 | "total_time = t1 - t0\n",
133 | "print total_time"
134 | ]
135 | },
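136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "Aside: the stratified keyword only landed in later xgboost releases. On a version without it, stratified folds can be rolled by hand with sklearn's StratifiedKFold; a sketch (each fold trains for num_round rounds with the params above):"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "collapsed": true
148 | },
149 | "outputs": [],
150 | "source": [
151 | "from sklearn.cross_validation import StratifiedKFold\n",
152 | "from sklearn.metrics import log_loss\n",
153 | "\n",
154 | "fold_scores = []\n",
155 | "for tr_idx, va_idx in StratifiedKFold(target.values, n_folds=5, shuffle=True, random_state=0):\n",
156 | "    dtr = xgb.DMatrix(train.values[tr_idx], target.values[tr_idx])\n",
157 | "    dva = xgb.DMatrix(train.values[va_idx])\n",
158 | "    bst = xgb.train(params, dtr, num_boost_round=num_round)\n",
159 | "    fold_scores.append(log_loss(target.values[va_idx], bst.predict(dva)))\n",
160 | "print(np.mean(fold_scores))"
161 | ]
162 | },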
136 | {
137 | "cell_type": "code",
138 | "execution_count": 49,
139 | "metadata": {
140 | "collapsed": true
141 | },
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/html": [
146 | "\n",
147 | "
\n",
148 | " \n",
149 | " \n",
150 | " | \n",
151 | " test-logloss-mean | \n",
152 | " test-logloss-std | \n",
153 | " train-logloss-mean | \n",
154 | " train-logloss-std | \n",
155 | "
\n",
156 | " \n",
157 | " \n",
158 | " \n",
159 | " 0 | \n",
160 | " 0.657933 | \n",
161 | " 0.000591 | \n",
162 | " 0.655429 | \n",
163 | " 0.000550 | \n",
164 | "
\n",
165 | " \n",
166 | " 1 | \n",
167 | " 0.628350 | \n",
168 | " 0.000739 | \n",
169 | " 0.623086 | \n",
170 | " 0.000943 | \n",
171 | "
\n",
172 | " \n",
173 | " 2 | \n",
174 | " 0.603884 | \n",
175 | " 0.001104 | \n",
176 | " 0.595802 | \n",
177 | " 0.001063 | \n",
178 | "
\n",
179 | " \n",
180 | " 3 | \n",
181 | " 0.583370 | \n",
182 | " 0.001012 | \n",
183 | " 0.572541 | \n",
184 | " 0.000836 | \n",
185 | "
\n",
186 | " \n",
187 | " 4 | \n",
188 | " 0.566755 | \n",
189 | " 0.000924 | \n",
190 | " 0.553194 | \n",
191 | " 0.000422 | \n",
192 | "
\n",
193 | " \n",
194 | " 5 | \n",
195 | " 0.552614 | \n",
196 | " 0.000855 | \n",
197 | " 0.536283 | \n",
198 | " 0.000151 | \n",
199 | "
\n",
200 | " \n",
201 | " 6 | \n",
202 | " 0.540211 | \n",
203 | " 0.000855 | \n",
204 | " 0.521392 | \n",
205 | " 0.000222 | \n",
206 | "
\n",
207 | " \n",
208 | " 7 | \n",
209 | " 0.529721 | \n",
210 | " 0.000880 | \n",
211 | " 0.508403 | \n",
212 | " 0.000270 | \n",
213 | "
\n",
214 | " \n",
215 | " 8 | \n",
216 | " 0.520790 | \n",
217 | " 0.000867 | \n",
218 | " 0.497102 | \n",
219 | " 0.000514 | \n",
220 | "
\n",
221 | " \n",
222 | " 9 | \n",
223 | " 0.513212 | \n",
224 | " 0.000905 | \n",
225 | " 0.487175 | \n",
226 | " 0.000640 | \n",
227 | "
\n",
228 | " \n",
229 | " 10 | \n",
230 | " 0.506994 | \n",
231 | " 0.000959 | \n",
232 | " 0.478506 | \n",
233 | " 0.000730 | \n",
234 | "
\n",
235 | " \n",
236 | " 11 | \n",
237 | " 0.501447 | \n",
238 | " 0.001094 | \n",
239 | " 0.470861 | \n",
240 | " 0.000729 | \n",
241 | "
\n",
242 | " \n",
243 | " 12 | \n",
244 | " 0.496789 | \n",
245 | " 0.001237 | \n",
246 | " 0.464108 | \n",
247 | " 0.000729 | \n",
248 | "
\n",
249 | " \n",
250 | " 13 | \n",
251 | " 0.492689 | \n",
252 | " 0.001184 | \n",
253 | " 0.457947 | \n",
254 | " 0.000677 | \n",
255 | "
\n",
256 | " \n",
257 | " 14 | \n",
258 | " 0.489190 | \n",
259 | " 0.001212 | \n",
260 | " 0.452237 | \n",
261 | " 0.000786 | \n",
262 | "
\n",
263 | " \n",
264 | " 15 | \n",
265 | " 0.486248 | \n",
266 | " 0.001257 | \n",
267 | " 0.447101 | \n",
268 | " 0.000871 | \n",
269 | "
\n",
270 | " \n",
271 | " 16 | \n",
272 | " 0.483681 | \n",
273 | " 0.001239 | \n",
274 | " 0.442427 | \n",
275 | " 0.000849 | \n",
276 | "
\n",
277 | " \n",
278 | " 17 | \n",
279 | " 0.481431 | \n",
280 | " 0.001276 | \n",
281 | " 0.438261 | \n",
282 | " 0.000909 | \n",
283 | "
\n",
284 | " \n",
285 | " 18 | \n",
286 | " 0.479627 | \n",
287 | " 0.001380 | \n",
288 | " 0.434358 | \n",
289 | " 0.000855 | \n",
290 | "
\n",
291 | " \n",
292 | " 19 | \n",
293 | " 0.477896 | \n",
294 | " 0.001453 | \n",
295 | " 0.430595 | \n",
296 | " 0.000764 | \n",
297 | "
\n",
298 | " \n",
299 | " 20 | \n",
300 | " 0.476538 | \n",
301 | " 0.001381 | \n",
302 | " 0.427351 | \n",
303 | " 0.000817 | \n",
304 | "
\n",
305 | " \n",
306 | " 21 | \n",
307 | " 0.475284 | \n",
308 | " 0.001424 | \n",
309 | " 0.424263 | \n",
310 | " 0.000826 | \n",
311 | "
\n",
312 | " \n",
313 | " 22 | \n",
314 | " 0.474239 | \n",
315 | " 0.001395 | \n",
316 | " 0.421472 | \n",
317 | " 0.000912 | \n",
318 | "
\n",
319 | " \n",
320 | " 23 | \n",
321 | " 0.473294 | \n",
322 | " 0.001422 | \n",
323 | " 0.418848 | \n",
324 | " 0.000894 | \n",
325 | "
\n",
326 | " \n",
327 | " 24 | \n",
328 | " 0.472517 | \n",
329 | " 0.001478 | \n",
330 | " 0.416437 | \n",
331 | " 0.000846 | \n",
332 | "
\n",
333 | " \n",
334 | " 25 | \n",
335 | " 0.471818 | \n",
336 | " 0.001503 | \n",
337 | " 0.414173 | \n",
338 | " 0.000894 | \n",
339 | "
\n",
340 | " \n",
341 | " 26 | \n",
342 | " 0.471244 | \n",
343 | " 0.001540 | \n",
344 | " 0.412018 | \n",
345 | " 0.000970 | \n",
346 | "
\n",
347 | " \n",
348 | " 27 | \n",
349 | " 0.470718 | \n",
350 | " 0.001583 | \n",
351 | " 0.410244 | \n",
352 | " 0.001135 | \n",
353 | "
\n",
354 | " \n",
355 | " 28 | \n",
356 | " 0.470290 | \n",
357 | " 0.001598 | \n",
358 | " 0.408401 | \n",
359 | " 0.001012 | \n",
360 | "
\n",
361 | " \n",
362 | " 29 | \n",
363 | " 0.469917 | \n",
364 | " 0.001631 | \n",
365 | " 0.406792 | \n",
366 | " 0.001054 | \n",
367 | "
\n",
368 | " \n",
369 | " 30 | \n",
370 | " 0.469591 | \n",
371 | " 0.001632 | \n",
372 | " 0.405117 | \n",
373 | " 0.001180 | \n",
374 | "
\n",
375 | " \n",
376 | " 31 | \n",
377 | " 0.469236 | \n",
378 | " 0.001656 | \n",
379 | " 0.403349 | \n",
380 | " 0.000978 | \n",
381 | "
\n",
382 | " \n",
383 | " 32 | \n",
384 | " 0.468996 | \n",
385 | " 0.001689 | \n",
386 | " 0.401859 | \n",
387 | " 0.000776 | \n",
388 | "
\n",
389 | " \n",
390 | " 33 | \n",
391 | " 0.468792 | \n",
392 | " 0.001664 | \n",
393 | " 0.400643 | \n",
394 | " 0.000672 | \n",
395 | "
\n",
396 | " \n",
397 | " 34 | \n",
398 | " 0.468562 | \n",
399 | " 0.001643 | \n",
400 | " 0.399290 | \n",
401 | " 0.000633 | \n",
402 | "
\n",
403 | " \n",
404 | " 35 | \n",
405 | " 0.468300 | \n",
406 | " 0.001700 | \n",
407 | " 0.397969 | \n",
408 | " 0.000679 | \n",
409 | "
\n",
410 | " \n",
411 | " 36 | \n",
412 | " 0.468079 | \n",
413 | " 0.001699 | \n",
414 | " 0.396654 | \n",
415 | " 0.000702 | \n",
416 | "
\n",
417 | " \n",
418 | " 37 | \n",
419 | " 0.467950 | \n",
420 | " 0.001706 | \n",
421 | " 0.395548 | \n",
422 | " 0.000823 | \n",
423 | "
\n",
424 | " \n",
425 | " 38 | \n",
426 | " 0.467791 | \n",
427 | " 0.001684 | \n",
428 | " 0.394453 | \n",
429 | " 0.000800 | \n",
430 | "
\n",
431 | " \n",
432 | " 39 | \n",
433 | " 0.467616 | \n",
434 | " 0.001657 | \n",
435 | " 0.393437 | \n",
436 | " 0.000796 | \n",
437 | "
\n",
438 | " \n",
439 | " 40 | \n",
440 | " 0.467466 | \n",
441 | " 0.001650 | \n",
442 | " 0.392171 | \n",
443 | " 0.000925 | \n",
444 | "
\n",
445 | " \n",
446 | " 41 | \n",
447 | " 0.467381 | \n",
448 | " 0.001702 | \n",
449 | " 0.391102 | \n",
450 | " 0.001146 | \n",
451 | "
\n",
452 | " \n",
453 | " 42 | \n",
454 | " 0.467253 | \n",
455 | " 0.001774 | \n",
456 | " 0.389948 | \n",
457 | " 0.000796 | \n",
458 | "
\n",
459 | " \n",
460 | " 43 | \n",
461 | " 0.467122 | \n",
462 | " 0.001733 | \n",
463 | " 0.389037 | \n",
464 | " 0.000645 | \n",
465 | "
\n",
466 | " \n",
467 | " 44 | \n",
468 | " 0.467043 | \n",
469 | " 0.001747 | \n",
470 | " 0.387892 | \n",
471 | " 0.000712 | \n",
472 | "
\n",
473 | " \n",
474 | " 45 | \n",
475 | " 0.466965 | \n",
476 | " 0.001757 | \n",
477 | " 0.386938 | \n",
478 | " 0.000627 | \n",
479 | "
\n",
480 | " \n",
481 | " 46 | \n",
482 | " 0.466865 | \n",
483 | " 0.001787 | \n",
484 | " 0.385881 | \n",
485 | " 0.000904 | \n",
486 | "
\n",
487 | " \n",
488 | " 47 | \n",
489 | " 0.466820 | \n",
490 | " 0.001841 | \n",
491 | " 0.384970 | \n",
492 | " 0.000891 | \n",
493 | "
\n",
494 | " \n",
495 | " 48 | \n",
496 | " 0.466724 | \n",
497 | " 0.001895 | \n",
498 | " 0.384210 | \n",
499 | " 0.000894 | \n",
500 | "
\n",
501 | " \n",
502 | " 49 | \n",
503 | " 0.466667 | \n",
504 | " 0.001911 | \n",
505 | " 0.383509 | \n",
506 | " 0.000787 | \n",
507 | "
\n",
508 | " \n",
509 | "
\n",
510 | "
"
511 | ],
512 | "text/plain": [
513 | " test-logloss-mean test-logloss-std train-logloss-mean train-logloss-std\n",
514 | "0 0.657933 0.000591 0.655429 0.000550\n",
515 | "1 0.628350 0.000739 0.623086 0.000943\n",
516 | "2 0.603884 0.001104 0.595802 0.001063\n",
517 | "3 0.583370 0.001012 0.572541 0.000836\n",
518 | "4 0.566755 0.000924 0.553194 0.000422\n",
519 | "5 0.552614 0.000855 0.536283 0.000151\n",
520 | "6 0.540211 0.000855 0.521392 0.000222\n",
521 | "7 0.529721 0.000880 0.508403 0.000270\n",
522 | "8 0.520790 0.000867 0.497102 0.000514\n",
523 | "9 0.513212 0.000905 0.487175 0.000640\n",
524 | "10 0.506994 0.000959 0.478506 0.000730\n",
525 | "11 0.501447 0.001094 0.470861 0.000729\n",
526 | "12 0.496789 0.001237 0.464108 0.000729\n",
527 | "13 0.492689 0.001184 0.457947 0.000677\n",
528 | "14 0.489190 0.001212 0.452237 0.000786\n",
529 | "15 0.486248 0.001257 0.447101 0.000871\n",
530 | "16 0.483681 0.001239 0.442427 0.000849\n",
531 | "17 0.481431 0.001276 0.438261 0.000909\n",
532 | "18 0.479627 0.001380 0.434358 0.000855\n",
533 | "19 0.477896 0.001453 0.430595 0.000764\n",
534 | "20 0.476538 0.001381 0.427351 0.000817\n",
535 | "21 0.475284 0.001424 0.424263 0.000826\n",
536 | "22 0.474239 0.001395 0.421472 0.000912\n",
537 | "23 0.473294 0.001422 0.418848 0.000894\n",
538 | "24 0.472517 0.001478 0.416437 0.000846\n",
539 | "25 0.471818 0.001503 0.414173 0.000894\n",
540 | "26 0.471244 0.001540 0.412018 0.000970\n",
541 | "27 0.470718 0.001583 0.410244 0.001135\n",
542 | "28 0.470290 0.001598 0.408401 0.001012\n",
543 | "29 0.469917 0.001631 0.406792 0.001054\n",
544 | "30 0.469591 0.001632 0.405117 0.001180\n",
545 | "31 0.469236 0.001656 0.403349 0.000978\n",
546 | "32 0.468996 0.001689 0.401859 0.000776\n",
547 | "33 0.468792 0.001664 0.400643 0.000672\n",
548 | "34 0.468562 0.001643 0.399290 0.000633\n",
549 | "35 0.468300 0.001700 0.397969 0.000679\n",
550 | "36 0.468079 0.001699 0.396654 0.000702\n",
551 | "37 0.467950 0.001706 0.395548 0.000823\n",
552 | "38 0.467791 0.001684 0.394453 0.000800\n",
553 | "39 0.467616 0.001657 0.393437 0.000796\n",
554 | "40 0.467466 0.001650 0.392171 0.000925\n",
555 | "41 0.467381 0.001702 0.391102 0.001146\n",
556 | "42 0.467253 0.001774 0.389948 0.000796\n",
557 | "43 0.467122 0.001733 0.389037 0.000645\n",
558 | "44 0.467043 0.001747 0.387892 0.000712\n",
559 | "45 0.466965 0.001757 0.386938 0.000627\n",
560 | "46 0.466865 0.001787 0.385881 0.000904\n",
561 | "47 0.466820 0.001841 0.384970 0.000891\n",
562 | "48 0.466724 0.001895 0.384210 0.000894\n",
563 | "49 0.466667 0.001911 0.383509 0.000787"
564 | ]
565 | },
566 | "execution_count": 49,
567 | "metadata": {},
568 | "output_type": "execute_result"
569 | }
570 | ],
571 | "source": [
572 | "clf"
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "# Some notes on xgb.train() "
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 89,
585 | "metadata": {
586 | "collapsed": false
587 | },
588 | "outputs": [],
589 | "source": [
590 | "from sklearn.cross_validation import KFold, train_test_split\n",
591 | "X_train, X_test, y_train, y_test = train_test_split(train, target, test_size = 0.05 ,random_state=0)"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 90,
597 | "metadata": {
598 | "collapsed": true
599 | },
600 | "outputs": [],
601 | "source": [
602 | "xgtrains = xgb.DMatrix(X_train.values, y_train.values)\n",
603 | "xgtest = xgb.DMatrix(X_test.values, y_test.values)\n",
604 | "# xgtest = xgb.DMatrix(test.values)\n",
605 | "params = {'objective': 'binary:logistic', \n",
606 | " 'subsample': 1, \n",
607 | " 'eta': 0.1, \n",
608 | " 'colsample_bytree': 0.9, \n",
609 | " 'max_depth': 10,\n",
610 | " 'min_child_weight' : 5,\n",
611 | " 'silent':1}"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 91,
617 | "metadata": {
618 | "collapsed": true
619 | },
620 | "outputs": [
621 | {
622 | "name": "stderr",
623 | "output_type": "stream",
624 | "text": [
625 | "Will train until logloss error hasn't decreased in 10 rounds.\n",
626 | "[0]\tlogloss-error:0.235963\n",
627 | "[1]\tlogloss-error:0.232639\n",
628 | "[2]\tlogloss-error:0.232290\n",
629 | "[3]\tlogloss-error:0.231065\n",
630 | "[4]\tlogloss-error:0.229491\n",
631 | "[5]\tlogloss-error:0.230191\n",
632 | "[6]\tlogloss-error:0.227567\n",
633 | "[7]\tlogloss-error:0.226517\n",
634 | "[8]\tlogloss-error:0.225993\n",
635 | "[9]\tlogloss-error:0.226867\n",
636 | "[10]\tlogloss-error:0.226517\n",
637 | "[11]\tlogloss-error:0.227917\n",
638 | "[12]\tlogloss-error:0.226168\n",
639 | "[13]\tlogloss-error:0.227567\n",
640 | "[14]\tlogloss-error:0.226692\n",
641 | "[15]\tlogloss-error:0.227042\n",
642 | "[16]\tlogloss-error:0.227742\n",
643 | "[17]\tlogloss-error:0.227567\n",
644 | "[18]\tlogloss-error:0.226692\n",
645 | "Stopping. Best iteration:\n",
646 | "[8]\tlogloss-error:0.225993\n",
647 | "\n"
648 | ]
649 | }
650 | ],
651 | "source": [
652 | "clft = xgb.train(params,xgtrains,num_boost_round=num_round,\n",
653 | " evals= [(xgtest,'logloss')] , early_stopping_rounds = 10,\n",
654 | " verbose_eval=True)"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 92,
660 | "metadata": {
661 | "collapsed": true
662 | },
663 | "outputs": [],
664 | "source": [
665 | "#see here:\n",
666 | "#https://www.kaggle.com/ashhafez/springleaf-marketing-response/xgb-learning-rate-eta-decay/run/78945/code\n",
667 | "#http://discuss.analyticsvidhya.com/t/how-to-predict-class-labels-using-xgboost-in-python-when-objective-function-is-binary-logistic/7809"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": null,
673 | "metadata": {
674 | "collapsed": true
675 | },
676 | "outputs": [],
677 | "source": []
678 | }
679 | ],
680 | "metadata": {
681 | "kernelspec": {
682 | "display_name": "Python 2",
683 | "language": "python",
684 | "name": "python2"
685 | },
686 | "language_info": {
687 | "codemirror_mode": {
688 | "name": "ipython",
689 | "version": 2
690 | },
691 | "file_extension": ".py",
692 | "mimetype": "text/x-python",
693 | "name": "python",
694 | "nbconvert_exporter": "python",
695 | "pygments_lexer": "ipython2",
696 | "version": "2.7.11"
697 | }
698 | },
699 | "nbformat": 4,
700 | "nbformat_minor": 0
701 | }
702 |
--------------------------------------------------------------------------------
/wine_quality/README.txt:
--------------------------------------------------------------------------------
1 | #dataset is from here: http://archive.ics.uci.edu/ml/datasets/Wine+Quality
2 | #there are many more great data sets there!
--------------------------------------------------------------------------------
/wine_quality/ipython_notebooks/box_cox.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sept 29 2015
4 |
5 | @author: hugobowne-anderson
6 | @email: hugobowne at gmail dot com
7 | """
8 |
9 | from scipy import stats
10 | import pandas as pd
11 |
12 | def box_cox(df, lmbda=None, alpha=None):
13 | """
14 | Performs a Box-Cox Transformation on all columns (features) of a pandas
15 | dataframe. Currently, there is some ambiguity as to how to deal with
16 | non-positive values & I need to check this out: at the moment, I just centre
17 | the data so that min(value) > 0, for all features, as necessitated by
18 | the very nature of the Box-Cox Transformation.
19 | """
20 | df_tr = pd.DataFrame(columns=df.columns) #initialize empty data frame with same features as df
21 | for val in list(df.columns):
22 | df_tr[val] = stats.boxcox(df[val] - min(df[val]) + 0.1,lmbda, alpha)[0] #populate dataframe with transformed data
23 | return df_tr
24 |
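26 | if __name__ == '__main__':
27 |     #Aside / usage sketch (assumption: the UCI red-wine csv sits one directory up, as in wine_main.py)
28 |     wine = pd.read_csv('../winequality-red.csv', sep=';')
29 |     print(box_cox(wine.drop('quality', axis=1)).head())
30 |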
--------------------------------------------------------------------------------
/wine_quality/ipython_notebooks/yeo_johnson.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sept 29 2015
4 |
5 | @author: hugobowne-anderson
6 | @email: hugobowne at gmail dot com
7 | """
8 |
9 | from scipy import stats
10 | import numpy as np
11 | import pandas as pd
12 | import math as math
13 |
14 | def yeo_johnson(x, lmbda=0):
15 |     """
16 |     Performs a Yeo-Johnson Transformation on a numpy array.
17 |     Arguments:
18 |     x : input array. Should be 1-dimensional.
19 |     lmbda : {scalar}, optional.
20 |     Note: a RuntimeWarning may be thrown during execution (both branches of the
21 |     piecewise definition are evaluated before the invalid one is masked out),
22 |     but the result is not affected. The transformation follows the original paper:
23 |     Yeo, In-Kwon and Johnson, Richard (2000). A new family of power transformations
24 |     to improve normality or symmetry. Biometrika, 87, 954-959.
25 |     """
26 | #The Yeo-Johnson Transform is defined differently for differing values of lambda
27 | if lmbda == 0:
28 | #as transform is defined piecewise, I compute it using the sum of relational
29 | #operators: for this reason, I 1st define the 2 functions
30 | A1 = np.log(abs(x+1))
31 | A1[A1 == -np.inf] = 0 #subtlety: if value = -inf , then term will not be used
32 | # BUT I do need to set it to 0 so that it IS unused below
33 | A2 = (np.power(1-x , 2) - 1)/2
34 | A2[np.isnan(A2)] = 0#subtlety: if value = NaN , then term will not be used
35 | # BUT I do need to set it to 0 so that it IS unused below
36 | x_yj = (x>=0)*A1 - (x<0)*A2
37 | elif lmbda == 2:
38 | #as transform is defined piecewise, I compute it using the sum of relational
39 | #operators: for this reason, I 1st define the 2 functions
40 | B1 = (np.power(x+1 , 2) - 1)/2
41 | B1[np.isnan(B1)] = 0#subtlety: if value = NaN , then term will not be used
42 | # BUT I do need to set it to 0 so that it IS unused below
43 | B2 = np.log(abs(1-x))
44 | B2[B2==-np.inf] = 0#subtlety: if value = -inf , then term will not be used
45 | # BUT I do need to set it to 0 so that it IS unused below
46 | x_yj = (x>=0)*B1 - (x<0)*B2
47 | else:
48 | #as transform is defined piecewise, I compute it using the sum of relational
49 | #operators: for this reason, I 1st define the 2 functions
50 | C1 = (np.power(x+1 , lmbda) - 1)/lmbda
51 | C1[np.isnan(C1)] = 0#subtlety: if value = NaN , then term will not be used
52 | # BUT I do need to set it to 0 so that it IS unused below
53 | C2 = (np.power(1-x , 2-lmbda) - 1)/(2 - lmbda)
54 | C2[np.isnan(C2)] = 0#subtlety: if value = NaN , then term will not be used
55 | # BUT I do need to set it to 0 so that it IS unused below
56 |         x_yj = (x>=0)*C1 - (x<0)*C2 #minus sign: the x < 0 branch of the transform is negated, as in the lmbda == 0 case above
57 |
58 | return x_yj
59 |
60 | def dfyeo_johnson(df, lmbda=0 ):
61 | """
62 | Performs a Yeo-Johnson Transformation on all columns (features)of a dataframe
63 | """
64 | df_yj = pd.DataFrame(columns=df.columns) #initialize empty data frame with same features as df
65 | for val in list(df.columns):
66 |         df_yj[val] = yeo_johnson(df[val], lmbda) #populate dataframe with transformed data (forwarding lmbda)
67 | return df_yj
68 |
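69 | if __name__ == '__main__':
70 |     #Aside / sanity-check sketch: lmbda = 1 is the identity transform
71 |     #(x >= 0: ((x+1)**1 - 1)/1 = x;  x < 0: -(((1-x)**1 - 1)/1) = x)
72 |     s = np.arange(10, dtype=float) - 5
73 |     print(np.allclose(yeo_johnson(s, lmbda=1), s)) #expect: True
74 |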
--------------------------------------------------------------------------------
/wine_quality/ipython_notebooks/yjscratch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Sep 29 13:53:59 2015
4 |
5 | @author: hugobowne-anderson
6 | """
7 |
8 | import numpy as np
9 | import math as math
10 | import yeo_johnson as yj
11 |
12 | s = np.arange(10, dtype=float) - 5 #float dtype avoids silent integer division in the transform (Python 2)
13 |
14 | math.log(math.exp(1)) #sanity check: 1.0
15 |
16 | print(yj.yeo_johnson(s, lmbda=2)) #call via the module alias; a bare yeo_johnson was a NameError
17 |
--------------------------------------------------------------------------------
/wine_quality/python/wine_classifier.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | from sklearn.grid_search import GridSearchCV
5 | from sklearn.cross_validation import cross_val_score
6 | from sklearn.metrics import classification_report
7 |
8 | class WineClassifier(object):
9 | """
10 | Use classification (not regression) for wine quality.
11 | """
12 | def __init__(self, X_train, y_train, X_valid, y_valid, pipeline, param_grid):
13 | """
14 | Set the data sets.
15 | """
16 | self.X_train = X_train
17 | self.y_train = y_train
18 | self.X_valid = X_valid
19 | self.y_valid = y_valid
20 | self.pipeline = pipeline
21 | self.param_grid = param_grid
22 |
23 | def train(self, verbose=1, n_jobs=-1, scoring='accuracy', cv=10):
24 | """
25 | Train the classifier by grid search
26 | """
27 | if len(self.param_grid) != 0:
28 | grid_search = GridSearchCV(self.pipeline, param_grid=self.param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs, scoring=scoring)
29 | grid_search.fit(self.X_train, self.y_train)
30 | if verbose > 1:
31 | print( ('Best score %s with parameters %s') % (grid_search.best_score_, grid_search.best_params_))
32 | self.pipeline = grid_search.best_estimator_
33 | else:
34 | if verbose > 1:
35 | scores = cross_val_score(self.pipeline, self.X_train, self.y_train, cv=cv)
36 | print(('Best score %s') % (scores.mean()))
37 | self.pipeline.fit(self.X_train, self.y_train)
38 |
39 | def classification_report(self, print_stdout=True):
40 | """
41 |         Evaluate the classifier on the validation set.
42 | """
43 | report = classification_report(self.y_valid, self.pipeline.predict(self.X_valid))
44 | if print_stdout: print(report)
45 | return(report)
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/wine_quality/python/wine_data.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | import pandas as pd
5 | import numpy as np
6 |
7 | class WineData(object):
8 | """docstring for WineData"""
9 | def __init__(self, path_to_red, path_to_white):
10 | self.path_to_red = path_to_red
11 | self.path_to_white = path_to_white
12 |
13 | def _load(self, path_to_data):
14 | """
15 | Loads the data from data
16 | """
17 | data = np.array(pd.read_csv(path_to_data, header=0, sep=';'))
18 | X = data[:,:-1]
19 | y = data[:,-1]
20 | return X, y
21 |
22 | def load_red(self):
23 | """
24 | Loads the red wine data
25 | """
26 | return self._load(self.path_to_red)
27 |
28 | def load_white(self):
29 | """
30 | Loads the white wine data
31 | """
32 | return self._load(self.path_to_white)
--------------------------------------------------------------------------------
/wine_quality/python/wine_explore.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | from matplotlib import pyplot as plt
5 | from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler
6 | from sklearn.decomposition import PCA
7 | from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding, SpectralEmbedding, MDS
8 |
9 | import pandas as pd
10 |
11 | def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
12 |     """
13 |     Plot data transformed into two dimensions by the chosen embedding
14 |     (pca, isomap, lle, tsne, spectral, mds, or 'gallery' for all of them).
15 |     PCA, for instance, picks a new basis whose first dimension carries the
16 |     maximal variance and each following dimension the maximal remaining
17 |     variance, which should spread the observed n-dimensional data maximally.
18 |     These embeddings are unsupervised and do not consider the target values.
19 |     """
20 | if (scale):
21 | scaler = StandardScaler()
22 | X = scaler.fit_transform(X)
23 |
24 | if (normalize):
25 | normalizer = Normalizer(norm='l2')
26 | X = normalizer.fit_transform(X)
27 |
28 |     if (embedding == 'pca'): #use == for string comparison ('is' tests identity)
29 | pca = PCA(n_components=2)
30 | X_transformed = pca.fit_transform(X)
31 |     elif (embedding == 'isomap'):
32 | isomap = Isomap(n_components=2, n_neighbors=20)
33 | X_transformed = isomap.fit_transform(X)
34 |     elif (embedding == 'lle'):
35 | lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
36 | X_transformed = lle.fit_transform(X)
37 |     elif (embedding == 'tsne'):
38 | t_sne = TSNE(n_components=2)
39 | X_transformed = t_sne.fit_transform(X)
40 |     elif (embedding == 'spectral'):
41 | se = SpectralEmbedding(n_components=2)
42 | X_transformed = se.fit_transform(X)
43 |     elif (embedding == 'mds'):
44 | mds = MDS(n_components=2)
45 | X_transformed = mds.fit_transform(X)
46 |     elif (embedding == 'gallery'):
47 | plt.figure(1)
48 |
49 | plt.subplot(231)
50 | plt.title('pca')
51 | X_t = PCA(n_components=2).fit_transform(X)
52 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
53 |
54 | plt.subplot(232)
55 | plt.title('isomap')
56 | X_t = Isomap(n_neighbors=20).fit_transform(X)
57 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
58 |
59 | plt.subplot(233)
60 | plt.title('lle')
61 | X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
62 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
63 |
64 | plt.subplot(234)
65 | plt.title('tsne')
66 | X_t = TSNE().fit_transform(X)
67 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
68 |
69 | plt.subplot(235)
70 | plt.title('spectral')
71 | X_t = SpectralEmbedding().fit_transform(X)
72 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
73 |
74 | plt.subplot(236)
75 | plt.title('mds')
76 | X_t = MDS().fit_transform(X)
77 | plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)
78 |
79 | plt.suptitle('Gallery transforms ' + title)
80 |
81 | return plt
82 | else:
83 |         raise ValueError("Choose between pca, isomap, lle, tsne, spectral, mds and gallery")
84 |
85 | plt.title(title + ' ' + embedding + ' plot')
86 | sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
87 | plt.colorbar(sc)
88 | return plt
89 |
90 | def pairs(X, y, title):
91 | """
92 | Quick and dirty version of pairs.
93 | """
94 | df = pd.DataFrame(X)
95 | df[df.shape[1]] = y
96 | plt.title(title + ' Pairwise plot')
97 | axes = pd.tools.plotting.scatter_matrix(df, alpha=0.2)
98 | return plt
99 |
--------------------------------------------------------------------------------
/wine_quality/python/wine_main.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | from wine_data import WineData
5 | from wine_preprocesser import WinePreprocesser
6 | from wine_explore import plot2d, pairs
7 | from wine_classifier import WineClassifier
8 |
9 | from time import time
10 | import numpy as np
11 |
12 | from sklearn.pipeline import Pipeline
13 | from sklearn.cross_validation import train_test_split
14 | from sklearn.preprocessing import StandardScaler
15 | from sklearn.naive_bayes import GaussianNB
16 | from sklearn.neighbors import KNeighborsClassifier
17 | from sklearn.svm import LinearSVC
18 | from sklearn.linear_model import LogisticRegression
19 | from sklearn.decomposition import PCA
20 |
21 | # Load data and preprocess (everything you don't put in the pipeline)
22 | data = WineData('../winequality-red.csv', '../winequality-white.csv')
23 |
24 | print('Preprocessing.')
25 | t0 = time()
26 | wp = WinePreprocesser(data)
27 | #wp.add_divided_features(replace_inf_with_absmax=True)
28 | wp.polynomial_expansion(rank=2)
29 | wp.remove_low_variance_features(variance_threshold=0)
30 | X_red, y_red = wp.get_red()
31 | X_white, y_white = wp.get_white()
32 | print('Preprocessing. Done in %fs' % (time()-t0))
33 | ###############################
34 | # Explore data
35 | # 1. Plot in 2d, color code classes:
36 | # -> no simple low dimension linear separation
37 | # 2. Plot pairs
38 | # -> correlation: transform data or use regularized methods
39 | # -> non-normally distributed features: Box-Cox transform
40 | ###############################
41 | do_plot = False
42 | if (do_plot):
43 | plot2d(X_red, y_red, embedding='gallery', title='Red wine').show()#.savefig('../data/red_whine_2d_gallery.png')
44 | plot2d(X_white, y_white, embedding='gallery', title='White wine').show()#.savefig('../data/white_whine_2d_gallery.png')
45 | pairs(X_red, y_red, 'Red wine')
46 | pairs(X_white, y_white, 'White wine')
47 |
48 | ###############################
49 | # Classification
50 | # Prepare data
51 | ###############################
52 | #X = X_white
53 | #y = y_white
54 | X = X_red
55 | y = y_red
56 | X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, random_state=23, test_size=0.2)
57 |
58 | ###############################
59 | # Classify on transformed dataset.
60 | ###############################
61 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('cls', GaussianNB())])
62 | cls_nb = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10)})
63 | cls_nb.train(verbose=1, n_jobs=-1, scoring='f1_micro')
64 | cls_nb.classification_report()
65 |
66 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('nn', KNeighborsClassifier())])
67 | cls_nn = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10), 'nn__n_neighbors': [1, 2, 4, 8, 32, 64]})
68 | cls_nn.train(verbose=1, n_jobs=1, scoring='f1_micro') # crashes with n_jobs > 1
69 | cls_nn.classification_report()
70 |
71 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('svc', LinearSVC())])
72 | cls_svc = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10), 'svc__C': 10. ** np.arange(-3, 4)})
73 | cls_svc.train(verbose=1, n_jobs=1, scoring='f1_micro')
74 | cls_svc.classification_report()
75 |
76 | pipeline = Pipeline([('scale', StandardScaler()), ('trans', PCA()), ('logistic', LogisticRegression(multi_class='multinomial', solver='lbfgs'))])
77 | cls_log = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'trans__n_components': np.arange(2,X_train.shape[1]+1, 10), 'logistic__C': 10. ** np.arange(-3, 4)})
78 | cls_log.train(verbose=1, n_jobs=1, scoring='f1_micro') # Not sure why, but multi_class logistic regression crashes with multithreading.
79 | cls_log.classification_report()
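80 |
81 | #Aside (added sketch, hypothetical, not part of the original runs): the same
82 | #pattern extends to other estimators, e.g. a random forest baseline.
83 | from sklearn.ensemble import RandomForestClassifier
84 | pipeline = Pipeline([('scale', StandardScaler()), ('cls', RandomForestClassifier(random_state=23))])
85 | cls_rf = WineClassifier(X_train, y_train, X_holdout, y_holdout, pipeline, param_grid={'cls__n_estimators': [100, 300]})
86 | cls_rf.train(verbose=1, n_jobs=1, scoring='f1_micro')
87 | cls_rf.classification_report()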
--------------------------------------------------------------------------------
/wine_quality/python/wine_preprocesser.py:
--------------------------------------------------------------------------------
1 | __author__ = "Fernando Carrillo"
2 | __email__ = "fernando at carrillo.at"
3 |
4 | import numpy as np
5 | from sklearn.preprocessing import PolynomialFeatures
6 |
7 | class WinePreprocesser(object):
8 | """docstring for WinePreprocesser"""
9 | def __init__(self, wine_data):
10 | self.X_red, self.y_red = wine_data.load_red()
11 | self.X_white, self.y_white = wine_data.load_white()
12 |
13 | def _divide_features(self, X, replace_inf_with_absmax):
14 | """
15 | Divide 1 by feature value.
16 | """
17 | # Do the division
18 | nf = np.divide(1, X)
19 | # Replace inf by nan or by the maximal absolute value
20 |         for i in np.arange(nf.shape[1]):
21 |             if np.isinf(nf[:,i]).any(): #also catches -inf, unlike `np.inf in ...`
22 |                 a = nf[:,i]
23 |                 if replace_inf_with_absmax:
24 |                     a[np.isinf(a)] = a[np.isfinite(a)][np.argmax(abs(a[np.isfinite(a)]))] #largest-magnitude finite value
25 |                 else:
26 |                     a[np.isinf(a)] = np.nan
27 |                 nf[:,i] = a
28 |         return(nf)
29 |
30 |
31 | def add_divided_features(self, replace_inf_with_absmax=True):
32 | """
33 | For each feature y_i add 1/y_i
34 | """
35 | X_red_divided = self._divide_features(X=self.X_red, replace_inf_with_absmax=replace_inf_with_absmax)
36 | self.X_red = np.concatenate((self.X_red, X_red_divided), axis=1)
37 | X_white_divided = self._divide_features(X=self.X_white, replace_inf_with_absmax=replace_inf_with_absmax)
38 | self.X_white = np.concatenate((self.X_white, X_white_divided), axis=1)
39 |
40 |     def polynomial_expansion(self, rank=2):
41 |         """
42 |         Expand the features with polynomials of degree `rank`.
43 |         """
44 |         pf = PolynomialFeatures(degree=rank)
45 |         self.X_red = pf.fit_transform(self.X_red)
46 |         self.X_white = pf.fit_transform(self.X_white)
47 |
48 | def _remove_low_var(self, X, variance_threshold):
49 | """
50 | Remove features with variance below threshold.
51 | """
52 | remove_index = []
53 | for col in range(X.shape[1]):
54 | if np.var(X[:,col]) < variance_threshold:
55 | remove_index.append(col)
56 | return(np.delete(X, remove_index, 1))
57 |
58 | def remove_low_variance_features(self, variance_threshold=0):
59 | """
60 | Remove features with variance below threshold.
61 | """
62 | self.X_red = self._remove_low_var(self.X_red, variance_threshold)
63 | self.X_white = self._remove_low_var(self.X_white, variance_threshold)
64 |
65 | def yeo_johnson_transform(self):
66 | """
67 | Implement yeo johnson transform
68 | """
69 | raise NotImplementedError
70 |
71 | def get_red(self):
72 | """
73 | Returns X, y of red wine data
74 | """
75 | return self.X_red, self.y_red
76 |
77 | def get_white(self):
78 | """
79 | Returns X, y of white wine data
80 | """
81 | return self.X_white, self.y_white
82 |
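83 | #Aside (added sketch, not part of the original design): the unimplemented
84 | #yeo_johnson_transform above could delegate to ipython_notebooks/yeo_johnson.py,
85 | #assuming that module is on the import path:
86 | def yeo_johnson_transform_via_module(preprocesser, lmbda=0):
87 |     import pandas as pd
88 |     from yeo_johnson import dfyeo_johnson
89 |     preprocesser.X_red = dfyeo_johnson(pd.DataFrame(preprocesser.X_red), lmbda).values
90 |     preprocesser.X_white = dfyeo_johnson(pd.DataFrame(preprocesser.X_white), lmbda).values
91 |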
--------------------------------------------------------------------------------