├── input ├── ch01-titanic │ └── .gitkeep ├── ch03 │ ├── multi_table_product.csv │ ├── time_series_events.csv │ ├── time_series_wide.csv │ ├── multi_table_log.csv │ └── multi_table_train.csv ├── readme.md └── sample-data │ ├── input_preprocess.py │ └── input_create.py ├── ch04-model-interface ├── input │ └── .gitkeep ├── model │ └── .gitkeep ├── submission │ └── .gitkeep ├── readme.md └── code │ ├── model.py │ ├── model_xgb.py │ ├── run.py │ ├── util.py │ ├── model_nn.py │ └── runner.py ├── misc ├── cover.jpg └── cover_small.jpg ├── ch02 ├── ch02-05-custom-function.py ├── ch02-03-optimize.py ├── ch02-04-optimize-cv.py ├── ch02-02-custom-usage.py └── ch02-01-metrics.py ├── ch03 ├── ch03-03-multi_tables.py ├── ch03-06-reduction-mnist.py ├── ch03-04-time_series.py ├── ch03-05-reduction.py ├── ch03-01-numerical.py └── ch03-02-categorical.py ├── ch06 ├── ch06-02-hopt_xgb.py ├── ch06-05-embedded.py ├── ch06-04-filter.py ├── ch06-06-wrapper.py ├── ch06-01-hopt.py └── ch06-03-hopt_nn.py ├── LICENSE ├── ch04 ├── ch04-05-run_linear.py ├── ch04-03-run_lgb.py ├── ch04-02-run_xgb.py ├── ch04-04-run_nn.py └── ch04-01-introduction.py ├── ch07 ├── ch07-03-adversarial.py ├── ch07-02-blending.py ├── models.py └── ch07-01-stacking.py ├── ch05 ├── ch05-02-timeseries.py └── ch05-01-validation.py ├── readme.md └── ch01 └── ch01-01-titanic.py /input/ch01-titanic/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ch04-model-interface/input/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ch04-model-interface/model/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ch04-model-interface/submission/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /misc/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmalins/kagglebook/HEAD/misc/cover.jpg -------------------------------------------------------------------------------- /misc/cover_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmalins/kagglebook/HEAD/misc/cover_small.jpg -------------------------------------------------------------------------------- /input/ch03/multi_table_product.csv: -------------------------------------------------------------------------------- 1 | product_id,product_category,price 2 | P1,C1,550 3 | P2,C1,100 4 | P3,C2,100 5 | P4,C3,100 6 | P5,C1,200 7 | P6,C5,1000 8 | P7,C5,1500 9 | P8,C4,300 10 | P9,C1,200 11 | P10,C1,370 12 | P11,C1,350 13 | P12,C1,300 14 | P13,C2,70 15 | P14,C3,300 16 | P15,C1,400 17 | P16,C1,600 18 | P17,C3,700 19 | P18,C3,600 20 | P19,C5,2500 21 | P20,C5,4000 22 | -------------------------------------------------------------------------------- /input/ch03/time_series_events.csv: -------------------------------------------------------------------------------- 1 | date,event 2 | 2018/1/3,sale 3 | 2018/1/3,conpon 4 | 2018/1/4,points 5 | 2018/1/5,points 6 | 2018/5/3,sale 7 | 2018/5/4,sale 8 | 2018/5/5,sale 9 | 2018/5/6,points 10 | 2018/5/7,points 11 | 2018/5/8,points 12 
| 2018/7/1,conpon 13 | 2018/8/13,points 14 | 2018/8/14,points 15 | 2018/8/15,points 16 | 2018/8/16,points 17 | 2018/8/17,points 18 | 2018/8/30,points 19 | 2018/8/31,points 20 | 2018/9/1,conpon 21 | 2018/10/30,points 22 | 2018/10/31,points 23 | 2018/12/30,sale 24 | 2018/12/30,points 25 | 2018/12/31,points 26 | -------------------------------------------------------------------------------- /ch04-model-interface/readme.md: -------------------------------------------------------------------------------- 1 | ### Chapter 4 section on "class and directory structures for competitions": sample code 2 | 3 | This is the sample code for the section in chapter 4 on "class and directory structures for competitions". 4 | 5 | Input data is from the Kaggle competition [Otto Group Product Classification Challenge](https://www.kaggle.com/c/otto-group-product-classification-challenge/). 6 | The code shows the process of training and making predictions using xgboost and keras. 7 | Refer to https://github.com/puyokw/kaggle_Otto/ to understand the parameters and modelling method. 8 | 9 | Execute the code using the following steps: 10 | 11 | 1. Download the [Data](https://www.kaggle.com/c/otto-group-product-classification-challenge/data) and save it in the `input` folder. 12 | 2. With the `code` folder as the current directory, execute ```python run.py```. 13 | -------------------------------------------------------------------------------- /ch02/ch02-05-custom-function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | # ----------------------------------- 6 | # Optimizing MAE by approximating the metric with a custom objective function 7 | # ----------------------------------- 8 | 9 | # Fair function 10 | def fair(preds, dtrain): 11 | x = preds - dtrain.get_label() # Get residual 12 | c = 1.0 # Parameter of Fair function 13 | den = abs(x) + c # Calculate denominator of gradient formula 14 | grad = c * x / den # Gradient 15 | hess = c * c / den ** 2 # Second derivative 16 | return grad, hess 17 | 18 | 19 | # Pseudo-Huber function 20 | def pseudo_huber(preds, dtrain): 21 | d = preds - dtrain.get_label() # Get residual 22 | delta = 1.0 # Parameter of Pseudo-Huber function 23 | scale = 1 + (d / delta) ** 2 24 | scale_sqrt = np.sqrt(scale) 25 | grad = d / scale_sqrt # Gradient 26 | hess = 1 / scale / scale_sqrt # Second derivative 27 | return grad, hess 28 | -------------------------------------------------------------------------------- /ch03/ch03-03-multi_tables.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Merging data 6 | # ----------------------------------- 7 | # Load the data 8 | train = pd.read_csv('../input/ch03/multi_table_train.csv') 9 | product_master = pd.read_csv('../input/ch03/multi_table_product.csv') 10 | user_log = pd.read_csv('../input/ch03/multi_table_log.csv') 11 | 12 | # ----------------------------------- 13 | # Suppose we have data frames in the format shown in the diagram 14 | # train : Training data (UserID, ProductID, Target value columns etc.) 15 | # product_master: Product data (ProductID, Product information columns etc.) 16 | # user_log : User actions log data (UserID, Columns recording user action data etc.)
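# In addition to the per-user row count appended below, other per-user statistics can be
# merged onto the training data in the same way. The sketch below is only illustrative:
# the numeric column name 'value' is hypothetical (it is not part of the sample data),
# so the code is guarded to run only if such a column actually exists in user_log.
if 'value' in user_log.columns:
    user_log_stats = user_log.groupby('user_id')['value'].agg(['mean', 'max', 'sum']).reset_index()
    user_log_stats.columns = ['user_id', 'value_mean', 'value_max', 'value_sum']
    train = train.merge(user_log_stats, on='user_id', how='left')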
17 | 18 | # Combine the product data and training data 19 | train = train.merge(product_master, on='product_id', how='left') 20 | 21 | # Count the log rows for each user, and append the count to the training data 22 | user_log_agg = user_log.groupby('user_id').size().reset_index().rename(columns={0: 'user_count'}) 23 | train = train.merge(user_log_agg, on='user_id', how='left') 24 | -------------------------------------------------------------------------------- /ch06/ch06-02-hopt_xgb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hyperopt import hp 3 | 4 | # ----------------------------------- 5 | # Example of xgboost parameter space 6 | # ----------------------------------- 7 | 8 | # Baseline parameters 9 | params = { 10 | 'booster': 'gbtree', 11 | 'objective': 'binary:logistic', 12 | 'eta': 0.1, 13 | 'gamma': 0.0, 14 | 'alpha': 0.0, 15 | 'lambda': 1.0, 16 | 'min_child_weight': 1, 17 | 'max_depth': 5, 18 | 'subsample': 0.8, 19 | 'colsample_bytree': 0.8, 20 | 'random_state': 71, 21 | } 22 | 23 | # Parameter search space 24 | param_space = { 25 | 'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)), 26 | 'max_depth': hp.quniform('max_depth', 3, 9, 1), 27 | 'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05), 28 | 'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05), 29 | 'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)), 30 | # If there is enough leeway, tune alpha and lambda as well 31 | # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)), 32 | # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0)), 33 | } 34 | -------------------------------------------------------------------------------- /ch02/ch02-03-optimize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Optimal threshold 6 | # ----------------------------------- 7 | from sklearn.metrics import f1_score 8 | from scipy.optimize import minimize 9 | 10 | # Preparations for creating sample data 11 | rand = np.random.RandomState(seed=71) 12 | train_y_prob = np.linspace(0, 1.0, 10000) 13 | 14 | # Assume that the true and predicted values are train_y and train_pred_prob, respectively 15 | train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob) 16 | train_pred_prob = np.clip(train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0) 17 | 18 | # When the threshold is 0.5, F1 is 0.722 19 | init_threshold = 0.5 20 | init_score = f1_score(train_y, train_pred_prob >= init_threshold) 21 | print(init_threshold, init_score) 22 | 23 | 24 | # Objective function for optimization 25 | def f1_opt(x): 26 | return -f1_score(train_y, train_pred_prob >= x) 27 | 28 | 29 | # Use the scipy.optimize minimize() function to find the optimal threshold 30 | # F1 is 0.756 when obtained with the optimal threshold 31 | result = minimize(f1_opt, x0=np.array([0.5]), method='Nelder-Mead') 32 | best_threshold = result['x'].item() 33 | best_score = f1_score(train_y, train_pred_prob >= best_threshold) 34 | print(best_threshold, best_score) 35 | -------------------------------------------------------------------------------- /ch04-model-interface/code/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from abc import ABCMeta, abstractmethod 4 | from typing import Optional 5 | 6
| 7 | class Model(metaclass=ABCMeta): 8 | 9 | def __init__(self, run_fold_name: str, params: dict) -> None: 10 | """Constructor 11 | 12 | :param run_fold_name: concatenation of run name and fold number 13 | :param params: hyperparameters 14 | """ 15 | self.run_fold_name = run_fold_name 16 | self.params = params 17 | self.model = None 18 | 19 | @abstractmethod 20 | def train(self, tr_x: pd.DataFrame, tr_y: pd.Series, 21 | va_x: Optional[pd.DataFrame] = None, 22 | va_y: Optional[pd.Series] = None) -> None: 23 | """Perform model training and save trained model 24 | 25 | :param tr_x: Training data features 26 | :param tr_y: Training data target values 27 | :param va_x: Validation data features 28 | :param va_y: Validation data target values 29 | """ 30 | pass 31 | 32 | @abstractmethod 33 | def predict(self, te_x: pd.DataFrame) -> np.array: 34 | """Return predictions from trained model 35 | 36 | :param te_x: Validation data or test data features 37 | :return: Predictions 38 | """ 39 | pass 40 | 41 | @abstractmethod 42 | def save_model(self) -> None: 43 | """Save the model """ 44 | pass 45 | 46 | @abstractmethod 47 | def load_model(self) -> None: 48 | """Load the model""" 49 | pass 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, ghmagazine 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /input/readme.md: -------------------------------------------------------------------------------- 1 | ## Input data 2 | 3 | ### Sample data 4 | 5 | #### Sample data overview 6 | 7 | * Input data for sample code after chapter 2 8 | * Use data from Kaggle competition [Prudential Life Insurance Assessment](https://www.kaggle.com/c/prudential-life-insurance-assessment) 9 | as a reference. Data was made artificially to simulate insurance underwriting data. 
The data construction was simple, so its structure is simpler than real life data. 10 | * Total of training and test data is 10000 lines 11 | 12 | #### Sample data items 13 | 14 | | Column name | Notes | 15 | |:----|:-------| 16 | | age | | 17 | | gender | | 18 | | height | | 19 | | weight | | 20 | | product | product type | 21 | | amount | insurance premium | 22 | | date | application date | 23 | | medical_info_a1/a2/a3 | medical information - continuous variable | 24 | | medical_info_b1/b2/b3 | medical information - continuous and catergorical variables | 25 | | medical_info_c1/c2 | medical information - continuous and catergorical variables | 26 | | medical_keyword_1-10 | medical information - binary variable | 27 | | target | target values (binary) | 28 | 29 | 30 | ### Input data used in chapter 1 (ch01-titanic) 31 | 32 | * From Kaggle competition [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic), save following [data](https://www.kaggle.com/c/titanic/data) 33 | (save into folders as follows: ch01-titanic/train.csv, ch01-titanic/test.csv) 34 | 35 | 36 | ### Input data used in chapter 3 (ch03) 37 | 38 | * Data for explanations on how to combine different tables 39 | * Data for explanations on how to process time series data 40 | 41 | -------------------------------------------------------------------------------- /ch04-model-interface/code/model_xgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import xgboost as xgb 6 | 7 | from model import Model 8 | from util import Util 9 | 10 | 11 | class ModelXGB(Model): 12 | 13 | def train(self, tr_x, tr_y, va_x=None, va_y=None): 14 | 15 | # Set the data 16 | validation = va_x is not None 17 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 18 | if validation: 19 | dvalid = xgb.DMatrix(va_x, label=va_y) 20 | 21 | # Set the hyperparameters 22 | params = dict(self.params) 23 | num_round = params.pop('num_round') 24 | 25 | # Train 26 | if validation: 27 | early_stopping_rounds = params.pop('early_stopping_rounds') 28 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 29 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist, 30 | early_stopping_rounds=early_stopping_rounds) 31 | else: 32 | watchlist = [(dtrain, 'train')] 33 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 34 | 35 | def predict(self, te_x): 36 | dtest = xgb.DMatrix(te_x) 37 | return self.model.predict(dtest, ntree_limit=self.model.best_ntree_limit) 38 | 39 | def save_model(self): 40 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.model') 41 | os.makedirs(os.path.dirname(model_path), exist_ok=True) 42 | # To prevent loss of best_ntree_limit model, save model using pickle 43 | Util.dump(self.model, model_path) 44 | 45 | def load_model(self): 46 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.model') 47 | self.model = Util.load(model_path) 48 | -------------------------------------------------------------------------------- /ch04/ch04-05-run_linear.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | # Load one-hot encoded data 10 | 11 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 12 | train_x = train.drop(['target'], axis=1) 13 | train_y = train['target'] 14 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 15 | 16 | # Split the training data into training and validation data 17 | from sklearn.model_selection import KFold 18 | 19 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 20 | tr_idx, va_idx = list(kf.split(train_x))[0] 21 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 22 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 23 | 24 | # ----------------------------------- 25 | # Linear model implementation 26 | # ----------------------------------- 27 | from sklearn.linear_model import LogisticRegression 28 | from sklearn.metrics import log_loss 29 | from sklearn.preprocessing import StandardScaler 30 | 31 | # Data scaling 32 | scaler = StandardScaler() 33 | tr_x = scaler.fit_transform(tr_x) 34 | va_x = scaler.transform(va_x) 35 | test_x = scaler.transform(test_x) 36 | 37 | # Construction and training of linear model 38 | model = LogisticRegression(C=1.0) 39 | model.fit(tr_x, tr_y) 40 | 41 | # Check score for validation data 42 | # Use predict_proba() to output probabilities. (predict() outputs binary class predictions) 43 | va_pred = model.predict_proba(va_x) 44 | score = log_loss(va_y, va_pred) 45 | print(f'logloss: {score:.4f}') 46 | 47 | # Predictions 48 | pred = model.predict(test_x) 49 | -------------------------------------------------------------------------------- /ch06/ch06-05-embedded.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # --------------------------------- 5 | # Importance of random forest features 6 | # --------------------------------- 7 | # train_x is training data, train_y is target values 8 | # Cannot deal with missing values so read data with missing values already imputed 9 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 10 | train_x = train.drop(['target'], axis=1) 11 | train_y = train['target'] 12 | # --------------------------------- 13 | from sklearn.ensemble import RandomForestClassifier 14 | 15 | # Random forest 16 | clf = RandomForestClassifier(n_estimators=10, random_state=71) 17 | clf.fit(train_x, train_y) 18 | fi = clf.feature_importances_ 19 | 20 | # Output in order to top importance 21 | idx = np.argsort(fi)[::-1] 22 | top_cols, top_importances = train_x.columns.values[idx][:5], fi[idx][:5] 23 | print('random forest importance') 24 | print(top_cols, top_importances) 25 | 26 | # --------------------------------- 27 | # Importance of xgboost features 28 | # --------------------------------- 29 | # train_x is training data, train_y is target values 30 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 31 | train_x = train.drop(['target'], axis=1) 32 | train_y = train['target'] 33 | # --------------------------------- 34 | import xgboost as xgb 35 | 36 | # xgboost 37 | dtrain = xgb.DMatrix(train_x, label=train_y) 38 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 39 | num_round = 50 40 | model = xgb.train(params, dtrain, num_round) 41 | 42 | # 
Output in order to top importance 43 | fscore = model.get_score(importance_type='total_gain') 44 | fscore = sorted([(k, v) for k, v in fscore.items()], key=lambda tpl: tpl[1], reverse=True) 45 | print('xgboost importance') 46 | print(fscore[:5]) 47 | -------------------------------------------------------------------------------- /ch04-model-interface/code/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from model_nn import ModelNN 5 | from model_xgb import ModelXGB 6 | from runner import Runner 7 | from util import Submission 8 | 9 | if __name__ == '__main__': 10 | 11 | params_xgb = { 12 | 'objective': 'multi:softprob', 13 | 'eval_metric': 'mlogloss', 14 | 'num_class': 9, 15 | 'max_depth': 12, 16 | 'eta': 0.1, 17 | 'min_child_weight': 10, 18 | 'subsample': 0.9, 19 | 'colsample_bytree': 0.8, 20 | 'silent': 1, 21 | 'random_state': 71, 22 | 'num_round': 10000, 23 | 'early_stopping_rounds': 10, 24 | } 25 | 26 | params_xgb_all = dict(params_xgb) 27 | params_xgb_all['num_round'] = 350 28 | 29 | params_nn = { 30 | 'layers': 3, 31 | # Setting so this sample code executes quickly 32 | 'nb_epoch': 5, # 1000 33 | 'patience': 10, 34 | 'dropout': 0.5, 35 | 'units': 512, 36 | } 37 | 38 | # Specify features 39 | features = [f'feat_{i}' for i in range(1, 94)] 40 | 41 | # Train and predict using xgboost 42 | runner = Runner('xgb1', ModelXGB, features, params_xgb) 43 | runner.run_train_cv() 44 | runner.run_predict_cv() 45 | Submission.create_submission('xgb1') 46 | 47 | # Train and predict using neural network 48 | runner = Runner('nn1', ModelNN, features, params_nn) 49 | runner.run_train_cv() 50 | runner.run_predict_cv() 51 | Submission.create_submission('nn1') 52 | 53 | ''' 54 | # (For reference) Train and predict using xgboost on all training data 55 | runner = Runner('xgb1-train-all', ModelXGB, features, params_xgb_all) 56 | runner.run_train_all() 57 | runner.run_test_all() 58 | Submission.create_submission('xgb1-train-all') 59 | ''' 60 | -------------------------------------------------------------------------------- /ch02/ch02-04-optimize-cv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Optimization of out-of-fold threshold 6 | # ----------------------------------- 7 | from scipy.optimize import minimize 8 | from sklearn.metrics import f1_score 9 | from sklearn.model_selection import KFold 10 | 11 | # Prepartions for creating sample data 12 | rand = np.random.RandomState(seed=71) 13 | train_y_prob = np.linspace(0, 1.0, 10000) 14 | 15 | # Assume that the true and predicted values are train_y and train_pred_prob, respectively 16 | train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob) 17 | train_pred_prob = np.clip(train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0) 18 | 19 | # Find thresholds using cross validation framework 20 | thresholds = [] 21 | scores_tr = [] 22 | scores_va = [] 23 | 24 | kf = KFold(n_splits=4, random_state=71, shuffle=True) 25 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_pred_prob)): 26 | tr_pred_prob, va_pred_prob = train_pred_prob[tr_idx], train_pred_prob[va_idx] 27 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 28 | 29 | # Objective function for optimization 30 | def f1_opt(x): 31 | return -f1_score(tr_y, tr_pred_prob >= x) 32 | 33 | # Optimize thresholds with training data, 
and evaluate with validation data 34 | result = minimize(f1_opt, x0=np.array([0.5]), method='Nelder-Mead') 35 | threshold = result['x'].item() 36 | score_tr = f1_score(tr_y, tr_pred_prob >= threshold) 37 | score_va = f1_score(va_y, va_pred_prob >= threshold) 38 | print(threshold, score_tr, score_va) 39 | 40 | thresholds.append(threshold) 41 | scores_tr.append(score_tr) 42 | scores_va.append(score_va) 43 | 44 | # Apply mean of the fold thresholds to the test data 45 | threshold_test = np.mean(thresholds) 46 | print(threshold_test) 47 | -------------------------------------------------------------------------------- /ch04/ch04-03-run_lgb.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (also possible to use numpy arrays)) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split the training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # ----------------------------------- 24 | # lightgbm implementation 25 | # ----------------------------------- 26 | import lightgbm as lgb 27 | from sklearn.metrics import log_loss 28 | 29 | # Change the features and target values into format suitable for lightgbm 30 | lgb_train = lgb.Dataset(tr_x, tr_y) 31 | lgb_eval = lgb.Dataset(va_x, va_y) 32 | 33 | # Set the hyperparameters 34 | params = {'objective': 'binary', 'seed': 71, 'verbose': 0, 'metrics': 'binary_logloss'} 35 | num_round = 100 36 | 37 | # Perform training 38 | # Specify categorical features as a parameter 39 | # Pass the validation data to the model, and monitor how the score changes during training 40 | categorical_features = ['product', 'medical_info_b2', 'medical_info_b3'] 41 | model = lgb.train(params, lgb_train, num_boost_round=num_round, 42 | categorical_feature=categorical_features, 43 | valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval]) 44 | 45 | # Check score for validation data 46 | va_pred = model.predict(va_x) 47 | score = log_loss(va_y, va_pred) 48 | print(f'logloss: {score:.4f}') 49 | 50 | # Predictions 51 | pred = model.predict(test_x) 52 | -------------------------------------------------------------------------------- /ch07/ch07-03-adversarial.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # Data creation (just random data) 8 | rand = np.random.RandomState(71) 9 | train_x = pd.DataFrame(rand.uniform(0.0, 1.0, (10000, 2)), columns=['model1', 'model2']) 10 | adv_train = pd.Series(rand.uniform(0.0, 1.0, 10000)) 11 | w = np.array([0.3, 0.7]).reshape(1, -1) 12 | train_y = pd.Series((train_x.values * w).sum(axis=1) > 0.5) 13 | 14 | # --------------------------------- 15 | # adversarial stochastic blending 16 | # ---------------------------------- 17 | # Use adversarial validation to calculate weights for averaging predicted values from models 18 | # train_x: Predicted probabilities from each model (actually using results that have been ordered) 19 | # train_y: Target values 20 | # adv_train: Values that represent likelihood that training data was also test data 21 | 22 | from scipy.optimize import minimize 23 | from sklearn.metrics import roc_auc_score 24 | 25 | n_sampling = 50 # Number of times to sample 26 | frac_sampling = 0.5 # Fraction of training data to take when sampling 27 | 28 | 29 | def score(x, data_x, data_y): 30 | # Use AUC as evaluation metric 31 | y_prob = data_x['model1'] * x + data_x['model2'] * (1 - x) 32 | return -roc_auc_score(data_y, y_prob) 33 | 34 | 35 | # Repeatedly use sampling to calculate weights for weighted averaging 36 | results = [] 37 | for i in range(n_sampling): 38 | # Perform sampling 39 | seed = i 40 | idx = pd.Series(np.arange(len(train_y))).sample(frac=frac_sampling, replace=False, 41 | random_state=seed, weights=adv_train) 42 | x_sample = train_x.iloc[idx] 43 | y_sample = train_y.iloc[idx] 44 | 45 | # Want to use sampling data to find most optimum weights for weighted averaging 46 | # As there are constraints use the COBYLA algorithm 47 | init_x = np.array(0.5) 48 | constraints = ( 49 | {'type': 'ineq', 'fun': lambda x: x}, 50 | {'type': 'ineq', 'fun': lambda x: 1.0 - x}, 51 | ) 52 | result = minimize(score, x0=init_x, 53 | args=(x_sample, y_sample), 54 | constraints=constraints, 55 | method='COBYLA') 56 | results.append((result.x, 1.0 - result.x)) 57 | 58 | # Weights for model1 and model2 weighted averaging 59 | results = np.array(results) 60 | w_model1, w_model2 = results.mean(axis=0) 61 | -------------------------------------------------------------------------------- /ch04/ch04-02-run_xgb.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split the training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # ----------------------------------- 24 | # xgboost implementation 25 | # ----------------------------------- 26 | import xgboost as xgb 27 | from sklearn.metrics import log_loss 28 | 29 | # Change the features and target values into format suitable for xgboost 30 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 31 | dvalid = xgb.DMatrix(va_x, label=va_y) 32 | dtest = xgb.DMatrix(test_x) 33 | 34 | # Set the hyperparameters 35 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 36 | num_round = 50 37 | 38 | # Train the model 39 | # Pass the validation data to the model, and monitor how the score changes during training 40 | # In watchlist put the training and validation data 41 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 42 | model = xgb.train(params, dtrain, num_round, evals=watchlist) 43 | 44 | # Check the score using the validation data 45 | va_pred = model.predict(dvalid) 46 | score = log_loss(va_y, va_pred) 47 | print(f'logloss: {score:.4f}') 48 | 49 | # Output prediction (not a binary value but a probability) 50 | pred = model.predict(dtest) 51 | 52 | # ----------------------------------- 53 | # Monitor the scores for the training and validation data 54 | # ----------------------------------- 55 | # Monitor the logless metric, set number of early stopping rounds to 20 56 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71, 57 | 'eval_metric': 'logloss'} 58 | num_round = 500 59 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 60 | model = xgb.train(params, dtrain, num_round, evals=watchlist, 61 | early_stopping_rounds=20) 62 | 63 | # Use the optimal decision tree to make predictions 64 | pred = model.predict(dtest, ntree_limit=model.best_ntree_limit) 65 | -------------------------------------------------------------------------------- /ch04-model-interface/code/util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.externals import joblib 8 | 9 | 10 | class Util: 11 | 12 | @classmethod 13 | def dump(cls, value, path): 14 | os.makedirs(os.path.dirname(path), exist_ok=True) 15 | joblib.dump(value, path, compress=True) 16 | 17 | @classmethod 18 | def load(cls, path): 19 | return joblib.load(path) 20 | 21 | 22 | class Logger: 23 | 24 | def __init__(self): 25 | self.general_logger = logging.getLogger('general') 26 | self.result_logger = logging.getLogger('result') 27 | stream_handler = logging.StreamHandler() 28 | file_general_handler = logging.FileHandler('../model/general.log') 29 | file_result_handler = logging.FileHandler('../model/result.log') 30 | if 
len(self.general_logger.handlers) == 0: 31 | self.general_logger.addHandler(stream_handler) 32 | self.general_logger.addHandler(file_general_handler) 33 | self.general_logger.setLevel(logging.INFO) 34 | self.result_logger.addHandler(stream_handler) 35 | self.result_logger.addHandler(file_result_handler) 36 | self.result_logger.setLevel(logging.INFO) 37 | 38 | def info(self, message): 39 | # Output time to console and log 40 | self.general_logger.info('[{}] - {}'.format(self.now_string(), message)) 41 | 42 | def result(self, message): 43 | self.result_logger.info(message) 44 | 45 | def result_ltsv(self, dic): 46 | self.result(self.to_ltsv(dic)) 47 | 48 | def result_scores(self, run_name, scores): 49 | # Output calculation results to console and results log 50 | dic = dict() 51 | dic['name'] = run_name 52 | dic['score'] = np.mean(scores) 53 | for i, score in enumerate(scores): 54 | dic[f'score{i}'] = score 55 | self.result(self.to_ltsv(dic)) 56 | 57 | def now_string(self): 58 | return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 59 | 60 | def to_ltsv(self, dic): 61 | return '\t'.join(['{}:{}'.format(key, value) for key, value in dic.items()]) 62 | 63 | 64 | class Submission: 65 | 66 | @classmethod 67 | def create_submission(cls, run_name): 68 | submission = pd.read_csv('../input/sampleSubmission.csv') 69 | pred = Util.load(f'../model/pred/{run_name}-test.pkl') 70 | for i in range(pred.shape[1]): 71 | submission[f'Class_{i + 1}'] = pred[:, i] 72 | submission.to_csv(f'../submission/{run_name}.csv', index=False) 73 | -------------------------------------------------------------------------------- /ch07/ch07-02-blending.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Data for neural network 16 | train_nn = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 17 | train_x_nn = train_nn.drop(['target'], axis=1) 18 | train_y_nn = train_nn['target'] 19 | test_x_nn = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 20 | 21 | # --------------------------------- 22 | # Ensemble using predictions from hold-out data 23 | # ---------------------------------- 24 | from sklearn.metrics import log_loss 25 | from sklearn.model_selection import KFold 26 | 27 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 28 | tr_idx, va_index = list(kf.split(train_x))[0] 29 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_index] 30 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_index] 31 | tr_x_nn, va_x_nn = train_x_nn.iloc[tr_idx], train_x_nn.iloc[va_index] 32 | 33 | # Assume Model1_1, Model1_2 and Model2 are defined in models.py 34 | # For each class train using fit and output prediction probabilities using predict 35 | from models import Model1Xgb, Model1NN, Model2Linear 36 | 37 | # First level model 38 | # Train using training data, output predictions for hold-out and test data 39 | model_1a = Model1Xgb() 40 | model_1a.fit(tr_x, tr_y, va_x, va_y) 41 | va_pred_1a = model_1a.predict(va_x) 42 | test_pred_1a = model_1a.predict(test_x) 43 | 44 | model_1b = Model1NN() 45 | model_1b.fit(tr_x_nn, tr_y, va_x_nn, va_y) 46 | va_pred_1b = model_1b.predict(va_x_nn) 47 | test_pred_1b = model_1b.predict(test_x_nn) 48 | 49 | # Score when using hold-out data 50 | print(f'logloss: {log_loss(va_y, va_pred_1a, eps=1e-7):.4f}') 51 | print(f'logloss: {log_loss(va_y, va_pred_1b, eps=1e-7):.4f}') 52 | 53 | # Make predictions from hold-out and test data a feature and create data frame 54 | va_x_2 = pd.DataFrame({'pred_1a': va_pred_1a, 'pred_1b': va_pred_1b}) 55 | test_x_2 = pd.DataFrame({'pred_1a': test_pred_1a, 'pred_1b': test_pred_1b}) 56 | 57 | # Second level model 58 | # Trained using all hold-out data so cannot evaluate score 59 | # In order to score, a method further cross validating hold-out data can be considered 60 | model2 = Model2Linear() 61 | model2.fit(va_x_2, va_y, None, None) 62 | pred_test_2 = model2.predict(test_x_2) 63 | -------------------------------------------------------------------------------- /ch07/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Dropout 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | # Suppress tensorflow warnings 10 | import os 11 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 12 | import tensorflow as tf 13 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 14 | 15 | 16 | # xgboost model 17 | class Model1Xgb: 18 | 19 | def __init__(self): 20 | self.model = None 21 | 22 | def fit(self, tr_x, tr_y, va_x, va_y): 23 | params = {'objective': 'binary:logistic', 
'silent': 1, 'random_state': 71, 24 | 'eval_metric': 'logloss'} 25 | num_round = 10 26 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 27 | dvalid = xgb.DMatrix(va_x, label=va_y) 28 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 29 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 30 | 31 | def predict(self, x): 32 | data = xgb.DMatrix(x) 33 | pred = self.model.predict(data) 34 | return pred 35 | 36 | 37 | # Neural network model 38 | class Model1NN: 39 | 40 | def __init__(self): 41 | self.model = None 42 | self.scaler = None 43 | 44 | def fit(self, tr_x, tr_y, va_x, va_y): 45 | self.scaler = StandardScaler() 46 | self.scaler.fit(tr_x) 47 | 48 | batch_size = 128 49 | epochs = 10 50 | 51 | tr_x = self.scaler.transform(tr_x) 52 | va_x = self.scaler.transform(va_x) 53 | model = Sequential() 54 | model.add(Dense(256, activation='relu', input_shape=(tr_x.shape[1],))) 55 | model.add(Dropout(0.2)) 56 | model.add(Dense(256, activation='relu')) 57 | model.add(Dropout(0.2)) 58 | model.add(Dense(1, activation='sigmoid')) 59 | 60 | model.compile(loss='binary_crossentropy', optimizer='adam') 61 | 62 | history = model.fit(tr_x, tr_y, 63 | batch_size=batch_size, epochs=epochs, 64 | verbose=1, validation_data=(va_x, va_y)) 65 | self.model = model 66 | 67 | def predict(self, x): 68 | x = self.scaler.transform(x) 69 | pred = self.model.predict_proba(x).reshape(-1) 70 | return pred 71 | 72 | 73 | # Linear model 74 | class Model2Linear: 75 | 76 | def __init__(self): 77 | self.model = None 78 | self.scaler = None 79 | 80 | def fit(self, tr_x, tr_y, va_x, va_y): 81 | self.scaler = StandardScaler() 82 | self.scaler.fit(tr_x) 83 | tr_x = self.scaler.transform(tr_x) 84 | self.model = LogisticRegression(solver='lbfgs', C=1.0) 85 | self.model.fit(tr_x, tr_y) 86 | 87 | def predict(self, x): 88 | x = self.scaler.transform(x) 89 | pred = self.model.predict_proba(x)[:, 1] 90 | return pred 91 | -------------------------------------------------------------------------------- /ch02/ch02-02-custom-usage.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y contains the target values, test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | from sklearn.model_selection import KFold 16 | 17 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 18 | tr_idx, va_idx = list(kf.split(train_x))[0] 19 | 20 | # Split the training data into training and validation data 21 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 22 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 23 | 24 | # ----------------------------------- 25 | # Examples of custom metrics and objective functions in xgboost 26 | # (Reference) https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py 27 | # ----------------------------------- 28 | import xgboost as xgb 29 | from sklearn.metrics import log_loss 30 | 31 | # Convert features and target values into xgboost data structure 32 | # Training features and target values are tr_x and tr_y, validation features and target values are va_x and va_y 33 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 34 | dvalid = xgb.DMatrix(va_x, label=va_y) 35 | 36 | 37 | # Custom objective function (logloss in this case, which is equivalent to xgboost's 'binary:logistic') 38 | def logregobj(preds, dtrain): 39 | labels = dtrain.get_label() # Get labels of true values 40 | preds = 1.0 / (1.0 + np.exp(-preds)) # Sigmoid function 41 | grad = preds - labels # Gradient 42 | hess = preds * (1.0 - preds) # Second derivative 43 | return grad, hess 44 | 45 | 46 | # Custom metric (error rate in this case) 47 | def evalerror(preds, dtrain): 48 | labels = dtrain.get_label() # Get labels of true values 49 | return 'custom-error', float(sum(labels != (preds > 0.0))) / len(labels) 50 | 51 | 52 | # Set hyperparameters 53 | params = {'silent': 1, 'random_state': 71} 54 | num_round = 50 55 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 56 | 57 | # Train the model 58 | bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror) 59 | 60 | # Unlike when binary:logistic is specified as the objective function, 61 | # the output values are not probabilities, so they need to be converted 62 | pred_val = bst.predict(dvalid) 63 | pred = 1.0 / (1.0 + np.exp(-pred_val)) 64 | logloss = log_loss(va_y, pred) 65 | print(logloss) 66 | 67 | # For reference, results from the normal training method 68 | params = {'silent': 1, 'random_state': 71, 'objective': 'binary:logistic'} 69 | bst = xgb.train(params, dtrain, num_round, watchlist) 70 | 71 | pred = bst.predict(dvalid) 72 | logloss = log_loss(va_y, pred) 73 | print(logloss) 74 | -------------------------------------------------------------------------------- /ch05/ch05-02-timeseries.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc.
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # As time-series data assume a period variable is set that changes with time 16 | train_x['period'] = np.arange(0, len(train_x)) // (len(train_x) // 4) 17 | train_x['period'] = np.clip(train_x['period'], 0, 3) 18 | test_x['period'] = 4 19 | 20 | # ----------------------------------- 21 | # Hold-out method for time-series data 22 | # ----------------------------------- 23 | # Partition using the period variable as the basis (0 to 3 are the training data, 4 is the test data) 24 | # Here for within the training data period 3 is used for validation and periods 0 to 2 are used for training 25 | is_tr = train_x['period'] < 3 26 | is_va = train_x['period'] == 3 27 | tr_x, va_x = train_x[is_tr], train_x[is_va] 28 | tr_y, va_y = train_y[is_tr], train_y[is_va] 29 | 30 | # ----------------------------------- 31 | # Cross validation for time-series data (use method that follows time) 32 | # ----------------------------------- 33 | # Partition using the period variable as the basis (0 to 3 are the training data, 4 is the test data) 34 | # Periods 1, 2 and 3 are each used for cross-validation, and the preceding periods are used for training 35 | 36 | va_period_list = [1, 2, 3] 37 | for va_period in va_period_list: 38 | is_tr = train_x['period'] < va_period 39 | is_va = train_x['period'] == va_period 40 | tr_x, va_x = train_x[is_tr], train_x[is_va] 41 | tr_y, va_y = train_y[is_tr], train_y[is_va] 42 | 43 | # (For reference) Using TimeSeriesSplit() function is difficult as only the order of the data can be used 44 | from sklearn.model_selection import TimeSeriesSplit 45 | 46 | tss = TimeSeriesSplit(n_splits=4) 47 | for tr_idx, va_idx in tss.split(train_x): 48 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 49 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 50 | 51 | # ----------------------------------- 52 | # Cross validation for time-series data (method to simply partition by time) 53 | # ----------------------------------- 54 | # Partition using the period variable as the basis (0 to 3 are the training data, 4 is the test data) 55 | # Periods 1, 2 and 3 are each used for cross-validation, and the preceding periods are used for training 56 | 57 | va_period_list = [0, 1, 2, 3] 58 | for va_period in va_period_list: 59 | is_tr = train_x['period'] != va_period 60 | is_va = train_x['period'] == va_period 61 | tr_x, va_x = train_x[is_tr], train_x[is_va] 62 | tr_y, va_y = train_y[is_tr], train_y[is_va] 63 | -------------------------------------------------------------------------------- /ch03/ch03-06-reduction-mnist.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | # Visualization of MNIST data 9 | 10 | # Import MNIST data from keras.datasets 11 | from keras.datasets import mnist 12 | (train_x, train_y), (test_x, test_y) = mnist.load_data() 13 | 14 | # Convert to 2D data 15 | train_x = train_x.reshape(train_x.shape[0], -1) 16 | 17 | # Decrease size by taking only first 1000 data 18 | train_x = pd.DataFrame(train_x[:1000, :]) 19 | train_y = train_y[:1000] 20 | 21 | # ----------------------------------- 22 | # PCA 23 | # ----------------------------------- 24 | from sklearn.decomposition import PCA 25 | 26 | # Fit the PCA transformation by using the training data 27 | pca = PCA() 28 | x_pca = pca.fit_transform(train_x) 29 | 30 | # Plot in 2D, differentiating each class by color 31 | f, ax = plt.subplots(1) 32 | for i in range(10): 33 | mask = train_y == i 34 | plt.scatter(x_pca[mask, 0], x_pca[mask, 1], label=i, s=10, alpha=0.5) 35 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 36 | 37 | plt.show() 38 | 39 | # ----------------------------------- 40 | # LDA (Linear Discriminant Analysis) 41 | # ----------------------------------- 42 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 43 | 44 | # Derive the 2 axes that best split the classes using linear discriminant analysis 45 | lda = LDA(n_components=2) 46 | x_lda = lda.fit_transform(train_x, train_y) 47 | 48 | # Plot in 2D, differentiating each class by color 49 | # Note that the division is good, but this method is using the target values which gives it an advantage over other methods 50 | f, ax = plt.subplots(1) 51 | for i in range(10): 52 | mask = train_y == i 53 | plt.scatter(x_lda[mask, 0], x_lda[mask, 1], label=i, s=10, alpha=0.5) 54 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 55 | 56 | plt.show() 57 | 58 | # ----------------------------------- 59 | # t-sne 60 | # ----------------------------------- 61 | from sklearn.manifold import TSNE 62 | 63 | # Transform using t-sne 64 | tsne = TSNE(n_components=2) 65 | x_tsne = tsne.fit_transform(train_x) 66 | 67 | # Plot in 2D, differentiating each class by color 68 | f, ax = plt.subplots(1) 69 | for i in range(10): 70 | mask = train_y == i 71 | plt.scatter(x_tsne[mask, 0], x_tsne[mask, 1], label=i, s=10, alpha=0.5) 72 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 73 | 74 | plt.show() 75 | 76 | # ----------------------------------- 77 | # UMAP 78 | # ----------------------------------- 79 | import umap 80 | 81 | # Transform using UMAP 82 | um = umap.UMAP() 83 | x_umap = um.fit_transform(train_x) 84 | 85 | # Plot in 2D, differentiating each class by color 86 | f, ax = plt.subplots(1) 87 | for i in range(10): 88 | mask = train_y == i 89 | plt.scatter(x_umap[mask, 0], x_umap[mask, 1], label=i, s=10, alpha=0.5) 90 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 91 | 92 | plt.show() 93 | -------------------------------------------------------------------------------- /ch06/ch06-04-filter.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 14 | 15 | # --------------------------------- 16 | # Use argsort() to do an index sort 17 | # --------------------------------- 18 | # argsort() returns the indices that sort an array, which can be used for ascending and descending order 19 | ary = np.array([10, 20, 30, 0]) 20 | idx = ary.argsort() 21 | print(idx) # Ascending order - [3 0 1 2] 22 | print(idx[::-1]) # Descending order - [2 1 0 3] 23 | 24 | print(ary[idx[::-1][:3]]) # Output best three - [30, 20, 10] 25 | 26 | # --------------------------------- 27 | # Correlation coefficient 28 | # --------------------------------- 29 | import scipy.stats as st 30 | 31 | # Correlation coefficient 32 | corrs = [] 33 | for c in train_x.columns: 34 | corr = np.corrcoef(train_x[c], train_y)[0, 1] 35 | corrs.append(corr) 36 | corrs = np.array(corrs) 37 | 38 | # Spearman's rank correlation coefficient 39 | corrs_sp = [] 40 | for c in train_x.columns: 41 | corr_sp = st.spearmanr(train_x[c], train_y).correlation 42 | corrs_sp.append(corr_sp) 43 | corrs_sp = np.array(corrs_sp) 44 | 45 | # Output the most important features (top 5 at most) 46 | # Using np.argsort(), you can get the indices of the ordered values 47 | idx = np.argsort(np.abs(corrs))[::-1] 48 | top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5] 49 | print(top_cols, top_importances) 50 | 51 | idx2 = np.argsort(np.abs(corrs_sp))[::-1] 52 | top_cols2, top_importances2 = train_x.columns.values[idx2][:5], corrs_sp[idx2][:5] 53 | print(top_cols2, top_importances2) 54 | 55 | # --------------------------------- 56 | # Chi-square statistic 57 | # --------------------------------- 58 | from sklearn.feature_selection import chi2 59 | from sklearn.preprocessing import MinMaxScaler 60 | 61 | # Chi-square statistic 62 | x = MinMaxScaler().fit_transform(train_x) 63 | c2, _ = chi2(x, train_y) 64 | 65 | # Output the most important features (top 5 at most) 66 | idx = np.argsort(c2)[::-1] 67 | top_cols, top_importances = train_x.columns.values[idx][:5], c2[idx][:5] 68 | print(top_cols, top_importances) 69 | 70 | # --------------------------------- 71 | # Mutual information 72 | # --------------------------------- 73 | from sklearn.feature_selection import mutual_info_classif 74 | 75 | # Mutual information 76 | mi = mutual_info_classif(train_x, train_y) 77 | 78 | # Output the most important features (top 5 at most) 79 | idx = np.argsort(mi)[::-1] 80 | top_cols, top_importances = train_x.columns.values[idx][:5], mi[idx][:5] 81 | print(top_cols, top_importances) 82 | -------------------------------------------------------------------------------- /input/sample-data/input_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.preprocessing import LabelEncoder 4 | 5 | # Read data, concatenate test and train once 6 | df_train = pd.read_csv('train.csv') 7 | df_train['is_train'] = True 8 | df_test = pd.read_csv('test.csv') 9 | df_test['target'] = 0 10 | df_test['is_train'] = False 11 | 12 | df =
pd.concat([df_train, df_test], axis=0) 13 | 14 | # Preprocessing of dates 15 | df['date'] = pd.to_datetime(df['date']) 16 | df['year'] = df['date'].dt.year 17 | df['month'] = df['date'].dt.month 18 | df['day'] = df['date'].dt.day 19 | df['yearmonth'] = df['year'] * 12 + df['month'] 20 | df = df.drop(['date'], axis=1) 21 | 22 | # Different feature types 23 | numerical_features = ['age', 'height', 'weight', 'amount', 'year', 'month', 'yearmonth', 24 | 'medical_info_a1', 25 | 'medical_info_a2', 'medical_info_a3', 'medical_info_b1'] 26 | binary_features = [f'medical_keyword_{i}' for i in range(10)] 27 | categorical_features = ['sex', 'product', 'medical_info_b2', 'medical_info_b3'] 28 | 29 | # Perform Label Encoding for categorical features 30 | for c in categorical_features: 31 | le = LabelEncoder() 32 | df[c] = le.fit_transform(df[c]) 33 | print(f'{c} - {le.classes_}') 34 | 35 | # Move target to last column (for readability) 36 | df = df.reindex(columns=[c for c in df.columns if c != 'target'] + ['target']) 37 | 38 | # Split into train/test and output 39 | train = df[df['is_train']].drop(['is_train'], axis=1).reset_index(drop=True) 40 | test = df[~df['is_train']].drop(['is_train', 'target'], axis=1).reset_index(drop=True) 41 | train.to_csv('train_preprocessed.csv', index=False) 42 | test.to_csv('test_preprocessed.csv', index=False) 43 | 44 | # ---------------------- 45 | # Preprocessing for neural network and linear models 46 | 47 | # Impute missing values 48 | has_nan_features = ['medical_info_c1', 'medical_info_c2'] 49 | for c in has_nan_features: 50 | df[f'{c}_nan'] = df[c].isnull() 51 | df[c].fillna(df[c].mean(), inplace=True) 52 | 53 | # Perform One-hot Encoding 54 | df_onehot = pd.DataFrame(None, index=df.index) 55 | for c in df.columns: 56 | if c in categorical_features and df[c].nunique() > 2: 57 | dummies = pd.get_dummies(df[c], prefix=c) 58 | df_onehot = pd.concat([df_onehot, dummies], axis=1) 59 | print(f'one-hot encoded - {c}') 60 | else: 61 | df_onehot[c] = df[c] 62 | 63 | 64 | # Move target to last column (for readability) 65 | df_onehot = df_onehot.reindex(columns=[c for c in df_onehot.columns if c != 'target'] + ['target']) 66 | 67 | # Split into train/test and output 68 | train_onehot = df_onehot[df_onehot['is_train']].drop(['is_train'], axis=1).reset_index(drop=True) 69 | test_onehot = df_onehot[~df_onehot['is_train']].drop(['is_train', 'target'], axis=1).reset_index(drop=True) 70 | train_onehot.to_csv('train_preprocessed_onehot.csv', index=False) 71 | test_onehot.to_csv('test_preprocessed_onehot.csv', index=False) 72 | -------------------------------------------------------------------------------- /ch04/ch04-04-run_nn.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc.
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | # Load one-hot encoded data 10 | 11 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 12 | train_x = train.drop(['target'], axis=1) 13 | train_y = train['target'] 14 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 15 | 16 | # Split the training data into training and validation data 17 | from sklearn.model_selection import KFold 18 | 19 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 20 | tr_idx, va_idx = list(kf.split(train_x))[0] 21 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 22 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 23 | 24 | # Suppress tensorflow warnings 25 | import os 26 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 27 | import tensorflow as tf 28 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 29 | 30 | # ----------------------------------- 31 | # Neural network implementation 32 | # ----------------------------------- 33 | from keras.layers import Dense, Dropout 34 | from keras.models import Sequential 35 | from sklearn.metrics import log_loss 36 | from sklearn.preprocessing import StandardScaler 37 | 38 | # Data scaling 39 | scaler = StandardScaler() 40 | tr_x = scaler.fit_transform(tr_x) 41 | va_x = scaler.transform(va_x) 42 | test_x = scaler.transform(test_x) 43 | 44 | # Construct the neural network 45 | model = Sequential() 46 | model.add(Dense(256, activation='relu', input_shape=(train_x.shape[1],))) 47 | model.add(Dropout(0.2)) 48 | model.add(Dense(256, activation='relu')) 49 | model.add(Dropout(0.2)) 50 | model.add(Dense(1, activation='sigmoid')) 51 | 52 | model.compile(loss='binary_crossentropy', 53 | optimizer='adam', metrics=['accuracy']) 54 | 55 | # Perform training 56 | # Pass the validation data to the model, and monitor how the score changes during training 57 | batch_size = 128 58 | epochs = 10 59 | history = model.fit(tr_x, tr_y, 60 | batch_size=batch_size, epochs=epochs, 61 | verbose=1, validation_data=(va_x, va_y)) 62 | 63 | # Check score for validation data 64 | va_pred = model.predict(va_x) 65 | score = log_loss(va_y, va_pred, eps=1e-7) 66 | print(f'logloss: {score:.4f}') 67 | 68 | # Predictions 69 | pred = model.predict(test_x) 70 | 71 | # ----------------------------------- 72 | # Early stopping 73 | # ----------------------------------- 74 | from keras.callbacks import EarlyStopping 75 | 76 | # Set number of early stopping rounds to 20 77 | # By setting restore_best_weights, we use the model from the best epoch 78 | epochs = 50 79 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True) 80 | 81 | history = model.fit(tr_x, tr_y, 82 | batch_size=batch_size, epochs=epochs, 83 | verbose=1, validation_data=(va_x, va_y), callbacks=[early_stopping]) 84 | pred = model.predict(test_x) 85 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This is an English translation of the sample code that accompanies the bestselling Japanese Kaggle book "Data Analysis Techniques to Win Kaggle" ([Amazon Japan](https://www.amazon.co.jp/dp/4297108437)). 
4 | PDF copies of the book can be purchased from the [publisher's website](https://gihyo.jp/dp/ebook/2019/978-4-297-10844-1) from anywhere in the world using PayPal. 5 | The book's authors are Daisuke Kadowaki ([threecourse](https://www.kaggle.com/threecourse)), Ryuji Sakata ([Jack](https://www.kaggle.com/rsakata)), Keisuke Hosaka ([hskksk](https://www.kaggle.com/hskksk)) and Yuji Hiramatsu ([maxwell](https://www.kaggle.com/maxwell110)). 6 | It was first published on 9 October 2019 by Gijutsu-Hyohron Co., Ltd (ISBN-13: 978-4297108434). 7 | 8 | Book cover: 9 | 10 | 11 | 12 | ### Contents of each folder 13 | 14 | | Folder | Contents | 15 | |:----|:-------| 16 | | input | Input files | 17 | | ch01 | Sample code for chapter 1 | 18 | | ch02 | Sample code for chapter 2 | 19 | | ch03 | Sample code for chapter 3 | 20 | | ch04 | Sample code for chapter 4 | 21 | | ch05 | Sample code for chapter 5 | 22 | | ch06 | Sample code for chapter 6 | 23 | | ch07 | Sample code for chapter 7 | 24 | | ch04-model-interface | Code for "class and directory structures for competitions" section of chapter 4 | 25 | 26 | * Execute the code with each chapter's folder as the current directory. 27 | * For chapter 1, download the titanic data first as described in [input/readme.md](input/readme.md). 28 | * For the chapter 4 model interface code, refer to [ch04-model-interface/readme.md](ch04-model-interface). 29 | 30 | 31 | ### Requirements 32 | 33 | The sample code has been checked for operability on Google Cloud Platform (GCP) using the following environment. 34 | 35 | * Ubuntu 18.04 LTS 36 | * Anaconda 2019.03 Python 3.7 37 | * Necessary Python packages (check script below) 38 | 39 | Use the following script to set up the GCP environment. 40 | ``` 41 | # utils ----- 42 | 43 | # Install required tools for development 44 | cd ~/ 45 | sudo apt-get update 46 | sudo apt-get install -y git build-essential libatlas-base-dev 47 | sudo apt-get install -y python3-dev 48 | 49 | # anaconda ----- 50 | 51 | # Download and install Anaconda 52 | mkdir lib 53 | wget --quiet https://repo.continuum.io/archive/Anaconda3-2019.03-Linux-x86_64.sh -O lib/anaconda.sh 54 | /bin/bash lib/anaconda.sh -b 55 | 56 | # Add to PATH 57 | echo export PATH=~/anaconda3/bin:$PATH >> ~/.bashrc 58 | source ~/.bashrc 59 | 60 | # python packages ----- 61 | 62 | # Install Python packages 63 | # Use Anaconda 2019.03 default versions for numpy, scipy and pandas 64 | # pip install numpy==1.16.2 65 | # pip install scipy==1.2.1 66 | # pip install pandas==0.24.2 67 | pip install scikit-learn==0.21.2 68 | 69 | pip install xgboost==0.81 70 | pip install lightgbm==2.2.2 71 | pip install tensorflow==1.14.0 72 | pip install keras==2.2.4 73 | pip install hyperopt==0.1.1 74 | pip install bhtsne==0.1.9 75 | pip install rgf_python==3.4.0 76 | pip install umap-learn==0.3.9 77 | 78 | # set backend for matplotlib to Agg ----- 79 | 80 | # To execute on GCP, set the matplotlib backend to Agg 81 | matplotlibrc_path=$(python -c "import site, os, fileinput; packages_dir = site.getsitepackages()[0]; print(os.path.join(packages_dir, 'matplotlib', 'mpl-data', 'matplotlibrc'))") && \ 82 | sed -i 's/^backend : qt5agg/backend : agg/' $matplotlibrc_path 83 | ``` 84 | -------------------------------------------------------------------------------- /ch04-model-interface/code/model_nn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras.callbacks import EarlyStopping 6 | from 
keras.layers.advanced_activations import PReLU 7 | from keras.layers.core import Activation, Dense, Dropout 8 | from keras.layers.normalization import BatchNormalization 9 | from keras.models import Sequential, load_model 10 | from keras.utils import np_utils 11 | from sklearn.preprocessing import StandardScaler 12 | 13 | from model import Model 14 | from util import Util 15 | 16 | # Suppress tensorflow warnings 17 | import os 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 19 | import tensorflow as tf 20 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 21 | 22 | 23 | class ModelNN(Model): 24 | 25 | def train(self, tr_x, tr_y, va_x=None, va_y=None): 26 | 27 | # Set and scale data 28 | validation = va_x is not None 29 | scaler = StandardScaler() 30 | scaler.fit(tr_x) 31 | tr_x = scaler.transform(tr_x) 32 | tr_y = np_utils.to_categorical(tr_y, num_classes=9) 33 | 34 | if validation: 35 | va_x = scaler.transform(va_x) 36 | va_y = np_utils.to_categorical(va_y, num_classes=9) 37 | 38 | # Parameters 39 | nb_classes = 9 40 | layers = self.params['layers'] 41 | dropout = self.params['dropout'] 42 | units = self.params['units'] 43 | nb_epoch = self.params['nb_epoch'] 44 | patience = self.params['patience'] 45 | 46 | # Construct model 47 | model = Sequential() 48 | model.add(Dense(units, input_shape=(tr_x.shape[1],))) 49 | model.add(PReLU()) 50 | model.add(BatchNormalization()) 51 | model.add(Dropout(dropout)) 52 | 53 | for l in range(layers - 1): 54 | model.add(Dense(units)) 55 | model.add(PReLU()) 56 | model.add(BatchNormalization()) 57 | model.add(Dropout(dropout)) 58 | 59 | model.add(Dense(nb_classes)) 60 | model.add(Activation('softmax')) 61 | model.compile(loss='categorical_crossentropy', optimizer='adam') 62 | 63 | if validation: 64 | early_stopping = EarlyStopping(monitor='val_loss', patience=patience, 65 | verbose=1, restore_best_weights=True) 66 | model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2, 67 | validation_data=(va_x, va_y), callbacks=[early_stopping]) 68 | else: 69 | model.fit(tr_x, tr_y, nb_epoch=nb_epoch, batch_size=128, verbose=2) 70 | 71 | # Retain model and scaler 72 | self.model = model 73 | self.scaler = scaler 74 | 75 | def predict(self, te_x): 76 | te_x = self.scaler.transform(te_x) 77 | pred = self.model.predict_proba(te_x) 78 | return pred 79 | 80 | def save_model(self): 81 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.h5') 82 | scaler_path = os.path.join('../model/model', f'{self.run_fold_name}-scaler.pkl') 83 | os.makedirs(os.path.dirname(model_path), exist_ok=True) 84 | self.model.save(model_path) 85 | Util.dump(self.scaler, scaler_path) 86 | 87 | def load_model(self): 88 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.h5') 89 | scaler_path = os.path.join('../model/model', f'{self.run_fold_name}-scaler.pkl') 90 | self.model = load_model(model_path) 91 | self.scaler = Util.load(scaler_path) 92 | -------------------------------------------------------------------------------- /ch07/ch07-01-stacking.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Data for neural network 16 | train_nn = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 17 | train_x_nn = train_nn.drop(['target'], axis=1) 18 | train_y_nn = train_nn['target'] 19 | test_x_nn = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 20 | 21 | # --------------------------------- 22 | # Stacking 23 | # ---------------------------------- 24 | from sklearn.metrics import log_loss 25 | from sklearn.model_selection import KFold 26 | 27 | # Assume Model1Xgb, Model1NN and Model2Linear are defined in models.py 28 | # For each class train using fit and output prediction probabilities using predict 29 | 30 | from models import Model1Xgb, Model1NN, Model2Linear 31 | 32 | 33 | # Function that returns predictions for training data without knowing the target values, and predictions for the test data 34 | def predict_cv(model, train_x, train_y, test_x): 35 | preds = [] 36 | preds_test = [] 37 | va_idxes = [] 38 | 39 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 40 | 41 | # Train and make predictions using cross validation, save indices of predictions 42 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)): 43 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 44 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 45 | model.fit(tr_x, tr_y, va_x, va_y) 46 | pred = model.predict(va_x) 47 | preds.append(pred) 48 | pred_test = model.predict(test_x) 49 | preds_test.append(pred_test) 50 | va_idxes.append(va_idx) 51 | 52 | # Link using predictions from validation data, then return to original order 53 | va_idxes = np.concatenate(va_idxes) 54 | preds = np.concatenate(preds, axis=0) 55 | order = np.argsort(va_idxes) 56 | pred_train = preds[order] 57 | 58 | # Take average of predictions from test data 59 | preds_test = np.mean(preds_test, axis=0) 60 | 61 | return pred_train, preds_test 62 | 63 | 64 | # First level model 65 | # pred_train_1a, pred_train_1b are predictions from training data using cross validation 66 | # pred_test_1a, pred_test_1b are predictions from test data 67 | model_1a = Model1Xgb() 68 | pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x) 69 | 70 | model_1b = Model1NN() 71 | pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x_nn, train_y, test_x_nn) 72 | 73 | # Score for first level model 74 | print(f'logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}') 75 | print(f'logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}') 76 | 77 | # Make predictions a feature and create a data frame 78 | train_x_2 = pd.DataFrame({'pred_1a': pred_train_1a, 'pred_1b': pred_train_1b}) 79 | test_x_2 = pd.DataFrame({'pred_1a': pred_test_1a, 'pred_1b': pred_test_1b}) 80 | 81 | # Second level model 82 | # pred_train_2 are predictions from training data using cross validation via second level model 83 | # pred_test_2 are predictions from test data via second level model 84 | model_2 = Model2Linear() 85 | pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2) 86 | print(f'logloss: {log_loss(train_y, 
pred_train_2, eps=1e-7):.4f}') 87 | -------------------------------------------------------------------------------- /ch06/ch06-06-wrapper.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # Define an evaluation function that measures the accuracy of a list of features 24 | import xgboost as xgb 25 | from sklearn.metrics import log_loss 26 | 27 | 28 | def evaluate(features): 29 | dtrain = xgb.DMatrix(tr_x[features], label=tr_y) 30 | dvalid = xgb.DMatrix(va_x[features], label=va_y) 31 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 32 | num_round = 10 # In reality more rounds are necessary 33 | early_stopping_rounds = 3 34 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 35 | model = xgb.train(params, dtrain, num_round, 36 | evals=watchlist, early_stopping_rounds=early_stopping_rounds, 37 | verbose_eval=0) 38 | va_pred = model.predict(dvalid) 39 | score = log_loss(va_y, va_pred) 40 | 41 | return score 42 | 43 | 44 | # --------------------------------- 45 | # Greedy Forward Selection 46 | # ---------------------------------- 47 | 48 | best_score = 9999.0 49 | selected = set([]) 50 | 51 | print('start greedy forward selection') 52 | 53 | while True: 54 | 55 | if len(selected) == len(train_x.columns): 56 | # Stop once all features have been selected 57 | break 58 | 59 | scores = [] 60 | for feature in train_x.columns: 61 | if feature not in selected: 62 | # Assume the evaluation function that measures the accuracy of a list of features has been defined 63 | fs = list(selected) + [feature] 64 | score = evaluate(fs) 65 | scores.append((feature, score)) 66 | 67 | # Assume a lower score is better 68 | b_feature, b_score = sorted(scores, key=lambda tpl: tpl[1])[0] 69 | if b_score < best_score: 70 | selected.add(b_feature) 71 | best_score = b_score 72 | print(f'selected:{b_feature}') 73 | print(f'score:{b_score}') 74 | else: 75 | # The score does not improve no matter which feature is added, so finish 76 | break 77 | 78 | print(f'selected features: {selected}') 79 | 80 | # --------------------------------- 81 | # Simplified method for Greedy Forward Selection 82 | # ---------------------------------- 83 | 84 | best_score = 9999.0 85 | candidates = np.random.RandomState(71).permutation(train_x.columns) 86 | selected = set([]) 87 | 88 | print('start simple selection') 89 | for feature in candidates: 90 | # Assume the evaluation function that measures the accuracy of a list of features has been defined 91 | fs = list(selected) + [feature] 92 | score = evaluate(fs) 93 | 94 | # Assume a lower score is better 95 | if score < best_score: 96 | selected.add(feature) 97 | best_score = score 98 | 
print(f'selected:{feature}') 99 | print(f'score:{score}') 100 | 101 | print(f'selected features: {selected}') 102 | -------------------------------------------------------------------------------- /ch03/ch03-04-time_series.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Wide format, long format 6 | # ----------------------------------- 7 | 8 | # Load wide format data 9 | df_wide = pd.read_csv('../input/ch03/time_series_wide.csv', index_col=0) 10 | # Convert the index column to datetime dtype 11 | df_wide.index = pd.to_datetime(df_wide.index) 12 | 13 | print(df_wide.iloc[:5, :3]) 14 | ''' 15 | A B C 16 | date 17 | 2016-07-01 532 3314 1136 18 | 2016-07-02 798 2461 1188 19 | 2016-07-03 823 3522 1711 20 | 2016-07-04 937 5451 1977 21 | 2016-07-05 881 4729 1975 22 | ''' 23 | 24 | # Convert to long format 25 | df_long = df_wide.stack().reset_index(1) 26 | df_long.columns = ['id', 'value'] 27 | 28 | print(df_long.head(10)) 29 | ''' 30 | id value 31 | date 32 | 2016-07-01 A 532 33 | 2016-07-01 B 3314 34 | 2016-07-01 C 1136 35 | 2016-07-02 A 798 36 | 2016-07-02 B 2461 37 | 2016-07-02 C 1188 38 | 2016-07-03 A 823 39 | 2016-07-03 B 3522 40 | 2016-07-03 C 1711 41 | 2016-07-04 A 937 42 | ... 43 | ''' 44 | 45 | # Restore wide format 46 | df_wide = df_long.pivot(index=None, columns='id', values='value') 47 | 48 | # ----------------------------------- 49 | # Lag variables 50 | # ----------------------------------- 51 | # Set data to wide format 52 | x = df_wide 53 | # ----------------------------------- 54 | # x is the wide format data frame 55 | # The index is the date or timestamp, assume the columns store data of interest such as sales etc. 
for users or stores 56 | 57 | # Create lag data for one period ago 58 | x_lag1 = x.shift(1) 59 | 60 | # Create lag data for seven periods ago 61 | x_lag7 = x.shift(7) 62 | 63 | # ----------------------------------- 64 | # Calculate moving averages for three periods from one period before 65 | x_avg3 = x.shift(1).rolling(window=3).mean() 66 | 67 | # ----------------------------------- 68 | # Calculate max values over seven periods from one period before 69 | x_max7 = x.shift(1).rolling(window=7).max() 70 | 71 | # ----------------------------------- 72 | # Calculate average of data from 7, 14, 21 and 28 periods before 73 | x_e7_avg = (x.shift(7) + x.shift(14) + x.shift(21) + x.shift(28)) / 4.0 74 | 75 | # ----------------------------------- 76 | # Create values for one period ahead 77 | x_lead1 = x.shift(-1) 78 | 79 | # ----------------------------------- 80 | # Lag variables 81 | # ----------------------------------- 82 | # Load the data 83 | train_x = pd.read_csv('../input/ch03/time_series_train.csv') 84 | event_history = pd.read_csv('../input/ch03/time_series_events.csv') 85 | train_x['date'] = pd.to_datetime(train_x['date']) 86 | event_history['date'] = pd.to_datetime(event_history['date']) 87 | # ----------------------------------- 88 | 89 | # train_x is training data in a data frame with columns for user id and date 90 | # event_history contains data from past events in a data frame with date and event columns 91 | 92 | # occurrences is a data frame with columns for date and whether a sale was made or not 93 | dates = np.sort(train_x['date'].unique()) 94 | occurrences = pd.DataFrame(dates, columns=['date']) 95 | sale_history = event_history[event_history['event'] == 'sale'] 96 | occurrences['sale'] = occurrences['date'].isin(sale_history['date']) 97 | 98 | # Take cumulative sums to calculate the cumulative number of occurrences up to each date 99 | # occurrences is now a data frame with columns for date and cumulative number of sales on that date 100 | occurrences['sale'] = occurrences['sale'].cumsum() 101 | 102 | # Using the timestamp as a key, combine with the training dataset 103 | train_x = train_x.merge(occurrences, on='date', how='left') 104 | -------------------------------------------------------------------------------- /ch04/ch04-01-introduction.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (also possible to use numpy arrays) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | import xgboost as xgb 16 | 17 | 18 | # The Model class to operate the code 19 | class Model: 20 | 21 | def __init__(self, params=None): 22 | self.model = None 23 | if params is None: 24 | self.params = {} 25 | else: 26 | self.params = params 27 | 28 | def fit(self, tr_x, tr_y): 29 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 30 | params.update(self.params) 31 | num_round = 10 32 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 33 | self.model = xgb.train(params, dtrain, num_round) 34 | 35 | def predict(self, x): 36 | data = xgb.DMatrix(x) 37 | pred = self.model.predict(data) 38 | return pred 39 | 40 | 41 | # ----------------------------------- 42 | # Model training and prediction 43 | # ----------------------------------- 44 | # Specify the model hyperparameters 45 | params = {'param1': 10, 'param2': 100} 46 | 47 | # Define the Model class 48 | # The Model class has functions fit() for training and predict() for outputting predicted probabilities 49 | 50 | # Define the Model class 51 | model = Model(params) 52 | 53 | # Use the training data to train the model 54 | model.fit(train_x, train_y) 55 | 56 | # Output predictions for the test data 57 | pred = model.predict(test_x) 58 | 59 | # ----------------------------------- 60 | # Validation 61 | # ----------------------------------- 62 | from sklearn.metrics import log_loss 63 | from sklearn.model_selection import KFold 64 | 65 | # Create an index in order to split the training and validation data 66 | # Split the training data into 4, and keep aside 1 quarter for validation 67 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 68 | tr_idx, va_idx = list(kf.split(train_x))[0] 69 | 70 | # Split the training data into training and validation data 71 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 72 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 73 | 74 | # Define the model 75 | model = Model(params) 76 | 77 | # Use the training data to train the model 78 | # Depending on the model, validation data can be supplied at the same time in order to monitor the score 79 | model.fit(tr_x, tr_y) 80 | 81 | # Make predictions with the validation data, and calculate the score 82 | va_pred = model.predict(va_x) 83 | score = log_loss(va_y, va_pred) 84 | print(f'logloss: {score:.4f}') 85 | 86 | # ----------------------------------- 87 | # Cross validation 88 | # ----------------------------------- 89 | from sklearn.metrics import log_loss 90 | from sklearn.model_selection import KFold 91 | 92 | # Split the training data into 4, and keep aside 1 quarter for validation 93 | # Change the quarter used for validation and evaluate the score 4 times 94 | scores = [] 95 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 96 | for tr_idx, va_idx in kf.split(train_x): 97 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 98 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 99 | model = Model(params) 100 | model.fit(tr_x, tr_y) 101 | va_pred = model.predict(va_x) 102 | score = log_loss(va_y, va_pred) 103 | 
scores.append(score) 104 | 105 | # Output the mean cross validation score 106 | print(f'logloss: {np.mean(scores):.4f}') 107 | -------------------------------------------------------------------------------- /ch06/ch06-01-hopt.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # Class for training and making predictions with xgboost 24 | import xgboost as xgb 25 | 26 | 27 | class Model: 28 | 29 | def __init__(self, params=None): 30 | self.model = None 31 | if params is None: 32 | self.params = {} 33 | else: 34 | self.params = params 35 | 36 | def fit(self, tr_x, tr_y, va_x, va_y): 37 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 38 | params.update(self.params) 39 | num_round = 10 40 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 41 | dvalid = xgb.DMatrix(va_x, label=va_y) 42 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 43 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 44 | 45 | def predict(self, x): 46 | data = xgb.DMatrix(x) 47 | pred = self.model.predict(data) 48 | return pred 49 | 50 | 51 | # ----------------------------------- 52 | # Specify the parameter space to search 53 | # ----------------------------------- 54 | # hp.choice: select from multiple options 55 | # hp.uniform: select uniformly from distribution between minimum and maximum bounds. Arguments are minimum and maximum bounds. 56 | # hp.quniform: select uniformly at points separated by fixed intervals within minimum and maximum bounds. Arguments are minimum and maximum bounds and interval width. 57 | # hp.loguniform: select from distribution so logarithm of returned values is uniformly distributed. Arguments are logarithm of minimum and maximum bounds. 
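# Supplementary sketch (not part of the original script, kept as comments so the script's behaviour is unchanged):
# a search space built from the functions above can be sanity-checked by drawing random samples from it.
# Assuming the hyperopt version pinned in the readme is installed, each call below would print one randomly
# drawn parameter set. Note that hp.quniform returns floats, which is why values such as max_depth are cast
# to int before being passed to the model further down in this script. The names 'demo_units', 'demo_lr' and
# demo_space are illustrative only.
#   from hyperopt import hp
#   from hyperopt.pyll import stochastic
#   demo_space = {'units': hp.quniform('demo_units', 32, 256, 32),
#                 'lr': hp.loguniform('demo_lr', np.log(0.00001), np.log(0.01))}
#   print(stochastic.sample(demo_space))  # e.g. {'lr': 0.0004..., 'units': 96.0}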
58 | 59 | from hyperopt import hp 60 | 61 | space = { 62 | 'activation': hp.choice('activation', ['prelu', 'relu']), 63 | 'dropout': hp.uniform('dropout', 0, 0.2), 64 | 'units': hp.quniform('units', 32, 256, 32), 65 | 'learning_rate': hp.loguniform('learning_rate', np.log(0.00001), np.log(0.01)), 66 | } 67 | 68 | # ----------------------------------- 69 | # Parameter search using hyperopt 70 | # ----------------------------------- 71 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 72 | from sklearn.metrics import log_loss 73 | 74 | 75 | def score(params): 76 | # When specifying the parameters also specify a metric to minimize 77 | # To be more specific, specify the parameters, then return score for predictions from trained model 78 | 79 | # Convert max_depth to integer 80 | params['max_depth'] = int(params['max_depth']) 81 | 82 | # Assume Model has already been defined 83 | # The Model class function fit() performs training, and predict() outputs predicted probabilities 84 | model = Model(params) 85 | model.fit(tr_x, tr_y, va_x, va_y) 86 | va_pred = model.predict(va_x) 87 | score = log_loss(va_y, va_pred) 88 | print(f'params: {params}, logloss: {score:.4f}') 89 | 90 | # Save the information 91 | history.append((params, score)) 92 | 93 | return {'loss': score, 'status': STATUS_OK} 94 | 95 | 96 | # Specify parameter space to search 97 | space = { 98 | 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 99 | 'max_depth': hp.quniform('max_depth', 3, 9, 1), 100 | 'gamma': hp.quniform('gamma', 0, 0.4, 0.1), 101 | } 102 | 103 | # Use hyperopt for parameter search 104 | max_evals = 10 105 | trials = Trials() 106 | history = [] 107 | fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals) 108 | 109 | # Use recorded information to output parameter and score 110 | # (trials provides some information, but using it to obtain parameters is difficult in practice) 111 | history = sorted(history, key=lambda tpl: tpl[1]) 112 | best = history[0] 113 | print(f'best params:{best[0]}, score:{best[1]:.4f}') 114 | -------------------------------------------------------------------------------- /input/ch03/time_series_wide.csv: -------------------------------------------------------------------------------- 1 | ,A,B,C 2 | 2016-07-01,532,3314,1136 3 | 2016-07-02,798,2461,1188 4 | 2016-07-03,823,3522,1711 5 | 2016-07-04,937,5451,1977 6 | 2016-07-05,881,4729,1975 7 | 2016-07-06,931,4694,1937 8 | 2016-07-07,989,4077,1943 9 | 2016-07-08,905,4555,2112 10 | 2016-07-09,823,4467,1964 11 | 2016-07-10,786,5170,2132 12 | 2016-07-11,984,4241,1983 13 | 2016-07-12,939,5534,1624 14 | 2016-07-13,850,5485,2184 15 | 2016-07-14,980,4805,2211 16 | 2016-07-15,887,4357,2120 17 | 2016-07-16,761,4490,1896 18 | 2016-07-17,922,4705,1813 19 | 2016-07-18,855,4806,2169 20 | 2016-07-19,916,5174,1764 21 | 2016-07-20,835,5103,1875 22 | 2016-07-21,884,4813,1811 23 | 2016-07-22,978,4604,2230 24 | 2016-07-23,919,4418,2093 25 | 2016-07-24,931,4524,2053 26 | 2016-07-25,907,4907,2021 27 | 2016-07-26,1029,5268,2141 28 | 2016-07-27,1069,4929,2027 29 | 2016-07-28,841,4639,1983 30 | 2016-07-29,942,5311,2027 31 | 2016-07-30,876,4458,1805 32 | 2016-07-31,991,4677,1898 33 | 2016-08-01,851,5131,2100 34 | 2016-08-02,994,4471,1980 35 | 2016-08-03,744,5394,2047 36 | 2016-08-04,810,3781,2097 37 | 2016-08-05,822,4411,1980 38 | 2016-08-06,724,4731,1928 39 | 2016-08-07,895,5337,2069 40 | 2016-08-08,720,4418,1696 41 | 2016-08-09,831,4586,2302 42 | 2016-08-10,910,4230,1872 43 | 2016-08-11,879,4729,1993 44 | 2016-08-12,1009,4619,2243 
45 | 2016-08-13,755,5032,1970 46 | 2016-08-14,886,4557,2011 47 | 2016-08-15,982,4311,2223 48 | 2016-08-16,1139,3716,1770 49 | 2016-08-17,801,5133,2188 50 | 2016-08-18,875,5437,1652 51 | 2016-08-19,1050,4444,2225 52 | 2016-08-20,1047,4848,1901 53 | 2016-08-21,701,4667,2226 54 | 2016-08-22,970,5180,1882 55 | 2016-08-23,841,4660,1866 56 | 2016-08-24,901,4502,1825 57 | 2016-08-25,795,4911,1878 58 | 2016-08-26,882,4685,1702 59 | 2016-08-27,783,4379,2187 60 | 2016-08-28,798,4583,1949 61 | 2016-08-29,868,4768,2023 62 | 2016-08-30,800,5131,2088 63 | 2016-08-31,977,5065,2371 64 | 2016-09-01,991,5012,1821 65 | 2016-09-02,801,4696,1559 66 | 2016-09-03,898,4467,2306 67 | 2016-09-04,766,4925,1865 68 | 2016-09-05,920,4783,2522 69 | 2016-09-06,796,4091,1972 70 | 2016-09-07,1046,6043,1652 71 | 2016-09-08,842,4967,2114 72 | 2016-09-09,802,4414,2230 73 | 2016-09-10,767,4688,1824 74 | 2016-09-11,1065,5378,1944 75 | 2016-09-12,976,4492,2391 76 | 2016-09-13,885,4569,2014 77 | 2016-09-14,861,5533,2077 78 | 2016-09-15,732,4870,1799 79 | 2016-09-16,942,4380,1836 80 | 2016-09-17,793,4246,1866 81 | 2016-09-18,980,4324,2152 82 | 2016-09-19,866,4071,1760 83 | 2016-09-20,997,5980,2274 84 | 2016-09-21,937,5045,2296 85 | 2016-09-22,787,5017,2010 86 | 2016-09-23,969,5198,2087 87 | 2016-09-24,779,4500,1906 88 | 2016-09-25,915,5219,1932 89 | 2016-09-26,925,4815,2252 90 | 2016-09-27,858,5338,2257 91 | 2016-09-28,911,5173,2059 92 | 2016-09-29,914,4602,1844 93 | 2016-09-30,803,4860,2025 94 | 2016-10-01,1145,5120,1905 95 | 2016-10-02,837,4436,1867 96 | 2016-10-03,780,5155,1899 97 | 2016-10-04,920,4406,2095 98 | 2016-10-05,810,4238,2053 99 | 2016-10-06,929,6004,2002 100 | 2016-10-07,1092,4742,2264 101 | 2016-10-08,809,5159,1771 102 | 2016-10-09,980,4765,1853 103 | 2016-10-10,884,4761,1777 104 | 2016-10-11,828,4039,2088 105 | 2016-10-12,931,5125,1966 106 | 2016-10-13,862,4981,2250 107 | 2016-10-14,886,4600,2277 108 | 2016-10-15,991,5283,2149 109 | 2016-10-16,1166,4292,2178 110 | 2016-10-17,1023,4822,1864 111 | 2016-10-18,981,3740,1645 112 | 2016-10-19,890,4192,2407 113 | 2016-10-20,870,4564,2108 114 | 2016-10-21,977,5349,1831 115 | 2016-10-22,997,5652,1826 116 | 2016-10-23,787,4443,2020 117 | 2016-10-24,975,4380,2108 118 | 2016-10-25,786,4275,2121 119 | 2016-10-26,902,4861,2463 120 | 2016-10-27,830,4317,2402 121 | 2016-10-28,837,4727,1749 122 | 2016-10-29,971,4097,1988 123 | 2016-10-30,794,4331,2326 124 | 2016-10-31,702,5094,1940 125 | 2016-11-01,884,4632,1952 126 | 2016-11-02,856,4972,1836 127 | 2016-11-03,1001,4663,1936 128 | 2016-11-04,911,5228,1949 129 | 2016-11-05,846,4980,1999 130 | 2016-11-06,1091,5191,1792 131 | 2016-11-07,978,4185,1620 132 | 2016-11-08,853,5440,1704 133 | 2016-11-09,753,4414,1852 134 | 2016-11-10,774,4814,1739 135 | 2016-11-11,975,5982,1890 136 | 2016-11-12,822,5464,1796 137 | 2016-11-13,768,5583,1615 138 | 2016-11-14,900,4456,2040 139 | 2016-11-15,873,4958,1904 140 | 2016-11-16,1102,5302,1771 141 | 2016-11-17,906,5559,1947 142 | 2016-11-18,903,4484,2039 143 | 2016-11-19,1081,4729,1731 144 | 2016-11-20,931,4010,1891 145 | 2016-11-21,782,4549,2001 146 | 2016-11-22,827,4642,1929 147 | 2016-11-23,873,3989,1965 148 | 2016-11-24,869,4906,2038 149 | 2016-11-25,938,4060,1991 150 | 2016-11-26,1077,4496,2382 151 | 2016-11-27,785,4723,2190 152 | 2016-11-28,830,4573,1838 153 | 2016-11-29,979,5131,1906 154 | 2016-11-30,806,5175,1958 155 | 2016-12-01,966,4565,2020 156 | 2016-12-02,844,3930,2190 157 | 2016-12-03,1026,5353,2535 158 | 2016-12-04,1014,4330,1921 159 | 2016-12-05,927,4130,2136 160 | 
2016-12-06,745,4651,1882 161 | 2016-12-07,871,4339,2033 162 | 2016-12-08,839,3908,2062 163 | 2016-12-09,865,5423,1769 164 | 2016-12-10,923,3763,1884 165 | 2016-12-11,812,5022,1989 166 | 2016-12-12,1004,3949,1691 167 | 2016-12-13,845,5112,2208 168 | 2016-12-14,984,4661,1881 169 | 2016-12-15,842,4788,1962 170 | 2016-12-16,940,5799,1750 171 | 2016-12-17,900,4817,2048 172 | 2016-12-18,1003,4967,2025 173 | 2016-12-19,977,5274,1898 174 | 2016-12-20,890,3935,2085 175 | 2016-12-21,754,4846,2226 176 | 2016-12-22,992,4949,2181 177 | 2016-12-23,854,4619,2035 178 | 2016-12-24,900,5263,2144 179 | 2016-12-25,712,5029,1832 180 | 2016-12-26,840,4576,1954 181 | 2016-12-27,840,4573,1850 182 | 2016-12-28,943,4511,1764 183 | 2016-12-29,978,4599,1787 184 | 2016-12-30,907,4243,2069 185 | 2016-12-31,869,4703,2233 186 | -------------------------------------------------------------------------------- /ch03/ch03-05-reduction.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 14 | 15 | # For explanations save the original forms of the training and test data 16 | train_x_saved = train_x.copy() 17 | test_x_saved = test_x.copy() 18 | 19 | from sklearn.preprocessing import StandardScaler, MinMaxScaler 20 | 21 | 22 | # Function to return standardized versions of the original training and test data 23 | def load_standarized_data(): 24 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 25 | 26 | scaler = StandardScaler() 27 | scaler.fit(train_x) 28 | train_x = scaler.transform(train_x) 29 | test_x = scaler.transform(test_x) 30 | return pd.DataFrame(train_x), pd.DataFrame(test_x) 31 | 32 | 33 | # Function to return MinMax scaled versions of the original training and test data 34 | def load_minmax_scaled_data(): 35 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 36 | 37 | # Apply Min-Max Scaling 38 | scaler = MinMaxScaler() 39 | scaler.fit(pd.concat([train_x, test_x], axis=0)) 40 | train_x = scaler.transform(train_x) 41 | test_x = scaler.transform(test_x) 42 | 43 | return pd.DataFrame(train_x), pd.DataFrame(test_x) 44 | 45 | 46 | # ----------------------------------- 47 | # PCA 48 | # ----------------------------------- 49 | # Use the standardized data 50 | train_x, test_x = load_standarized_data() 51 | # ----------------------------------- 52 | # PCA 53 | from sklearn.decomposition import PCA 54 | 55 | # Assume that the data has been preprocessed e.g. 
by standardization to make the scale uniform 56 | 57 | # Fit the PCA transformation using the training data 58 | pca = PCA(n_components=5) 59 | pca.fit(train_x) 60 | 61 | # Apply the transformation 62 | train_x = pca.transform(train_x) 63 | test_x = pca.transform(test_x) 64 | 65 | # ----------------------------------- 66 | # Use the standardized data 67 | train_x, test_x = load_standarized_data() 68 | # ----------------------------------- 69 | # TruncatedSVD 70 | from sklearn.decomposition import TruncatedSVD 71 | 72 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 73 | 74 | # Fit the SVD transformation using the training data 75 | svd = TruncatedSVD(n_components=5, random_state=71) 76 | svd.fit(train_x) 77 | 78 | # Apply the transformation 79 | train_x = svd.transform(train_x) 80 | test_x = svd.transform(test_x) 81 | 82 | # ----------------------------------- 83 | # NMF 84 | # ----------------------------------- 85 | # So that the data are non-negative, use the MinMax scaled data 86 | train_x, test_x = load_minmax_scaled_data() 87 | # ----------------------------------- 88 | from sklearn.decomposition import NMF 89 | 90 | # Assume the data only contains non-negative values 91 | 92 | # Fit the NMF transformation using the training data 93 | model = NMF(n_components=5, init='random', random_state=71) 94 | model.fit(train_x) 95 | 96 | # Apply the transformation 97 | train_x = model.transform(train_x) 98 | test_x = model.transform(test_x) 99 | 100 | # ----------------------------------- 101 | # LatentDirichletAllocation 102 | # ----------------------------------- 103 | # Use the MinMax scaled data 104 | # Although this is not a matrix of counts, as the values are all non-negative it is still possible to calculate 105 | train_x, test_x = load_minmax_scaled_data() 106 | # ----------------------------------- 107 | from sklearn.decomposition import LatentDirichletAllocation 108 | 109 | # Assume the data is a matrix of counts of words in a document 110 | 111 | # Fit the Latent Dirichlet Allocation transformation using the training data 112 | model = LatentDirichletAllocation(n_components=5, random_state=71) 113 | model.fit(train_x) 114 | 115 | # Apply the transformation 116 | train_x = model.transform(train_x) 117 | test_x = model.transform(test_x) 118 | 119 | # ----------------------------------- 120 | # LinearDiscriminantAnalysis 121 | # ----------------------------------- 122 | # Use the standardized data 123 | train_x, test_x = load_standarized_data() 124 | # ----------------------------------- 125 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 126 | 127 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 128 | 129 | # Fit the Linear Discriminant Analysis transformation using the training data 130 | lda = LDA(n_components=1) 131 | lda.fit(train_x, train_y) 132 | 133 | # Apply the transformation 134 | train_x = lda.transform(train_x) 135 | test_x = lda.transform(test_x) 136 | 137 | # ----------------------------------- 138 | # t-sne 139 | # ----------------------------------- 140 | # Use the standardized data 141 | train_x, test_x = load_standarized_data() 142 | # ----------------------------------- 143 | import bhtsne 144 | 145 | # Assume that the data has been preprocessed e.g. 
by standardization to make the scale uniform 146 | 147 | # Transform using t-sne 148 | data = pd.concat([train_x, test_x]) 149 | embedded = bhtsne.tsne(data.astype(np.float64), dimensions=2, rand_seed=71) 150 | 151 | # ----------------------------------- 152 | # UMAP 153 | # ----------------------------------- 154 | # Use the standardized data 155 | train_x, test_x = load_standarized_data() 156 | # ----------------------------------- 157 | import umap 158 | 159 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 160 | 161 | # Fit the UMAP transformation using the training data 162 | um = umap.UMAP() 163 | um.fit(train_x) 164 | 165 | # Apply the transformation 166 | train_x = um.transform(train_x) 167 | test_x = um.transform(test_x) 168 | 169 | # ----------------------------------- 170 | # Clustering 171 | # ----------------------------------- 172 | # Use the standardized data 173 | train_x, test_x = load_standarized_data() 174 | # ----------------------------------- 175 | from sklearn.cluster import MiniBatchKMeans 176 | 177 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 178 | 179 | # Fit the Mini-Batch K-Means using the training data 180 | kmeans = MiniBatchKMeans(n_clusters=10, random_state=71) 181 | kmeans.fit(train_x) 182 | 183 | # Output the cluster to which each record belongs 184 | train_clusters = kmeans.predict(train_x) 185 | test_clusters = kmeans.predict(test_x) 186 | 187 | # Output the distance to the center of each cluster 188 | train_distances = kmeans.transform(train_x) 189 | test_distances = kmeans.transform(test_x) 190 | -------------------------------------------------------------------------------- /ch05/ch05-01-validation.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Class for training and making predictions using xgboost 16 | import xgboost as xgb 17 | 18 | 19 | class Model: 20 | 21 | def __init__(self, params=None): 22 | self.model = None 23 | if params is None: 24 | self.params = {} 25 | else: 26 | self.params = params 27 | 28 | def fit(self, tr_x, tr_y, va_x, va_y): 29 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 30 | params.update(self.params) 31 | num_round = 10 32 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 33 | dvalid = xgb.DMatrix(va_x, label=va_y) 34 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 35 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 36 | 37 | def predict(self, x): 38 | data = xgb.DMatrix(x) 39 | pred = self.model.predict(data) 40 | return pred 41 | 42 | 43 | # ----------------------------------- 44 | # hold-out method 45 | # ----------------------------------- 46 | # Partition validation data for hold-out method 47 | 48 | from sklearn.model_selection import train_test_split 49 | 50 | # Use train_test_split function for partitioning 51 | tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y, 52 | test_size=0.25, random_state=71, shuffle=True) 53 | 54 | # ----------------------------------- 55 | # Perform validation with hold-out method 56 | 57 | from sklearn.metrics import log_loss 58 | from sklearn.model_selection import train_test_split 59 | 60 | # Assume Model class has been predefined 61 | # Model class performs fitting and returns predicted probabilities for each outcome 62 | 63 | # Use train_test_split() function for partitioning 64 | tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y, 65 | test_size=0.25, random_state=71, shuffle=True) 66 | 67 | # Train the model, output predictions and calculate score 68 | model = Model() 69 | model.fit(tr_x, tr_y, va_x, va_y) 70 | va_pred = model.predict(va_x) 71 | score = log_loss(va_y, va_pred) 72 | print(score) 73 | 74 | # ----------------------------------- 75 | # Use the KFold class to partition validation data for hold-out method 76 | 77 | from sklearn.model_selection import KFold 78 | 79 | # Use KFold class to partition for hold-out method 80 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 81 | tr_idx, va_idx = list(kf.split(train_x))[0] 82 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 83 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 84 | 85 | # ----------------------------------- 86 | # Cross validation 87 | # ----------------------------------- 88 | # Partition data for cross validation 89 | 90 | from sklearn.model_selection import KFold 91 | 92 | # Use KFold class for partitioning for cross validation 93 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 94 | for tr_idx, va_idx in kf.split(train_x): 95 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 96 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 97 | 98 | # ----------------------------------- 99 | # Perform cross validation 100 | 101 | from sklearn.metrics import log_loss 102 | from sklearn.model_selection import 
KFold 103 | 104 | # It is assumed that the Model class has been predefined 105 | # Model class performs fitting and returns predicted probabilities for each outcome 106 | 107 | scores = [] 108 | 109 | # Use KFold class for partitioning for cross validation 110 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 111 | for tr_idx, va_idx in kf.split(train_x): 112 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 113 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 114 | 115 | # Train the model, output predictions and calculate score 116 | model = Model() 117 | model.fit(tr_x, tr_y, va_x, va_y) 118 | va_pred = model.predict(va_x) 119 | score = log_loss(va_y, va_pred) 120 | scores.append(score) 121 | 122 | # Take average of scores for each fold 123 | print(np.mean(scores)) 124 | 125 | # ----------------------------------- 126 | # Stratified K-Fold 127 | # ----------------------------------- 128 | from sklearn.model_selection import StratifiedKFold 129 | 130 | # Use the StratifiedKFold class to perform partitioning into stratified folds 131 | kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71) 132 | for tr_idx, va_idx in kf.split(train_x, train_y): 133 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 134 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 135 | 136 | # ----------------------------------- 137 | # GroupKFold 138 | # ----------------------------------- 139 | # It is assumed that the data has the same users repeated 4 times 140 | train_x['user_id'] = np.arange(0, len(train_x)) // 4 141 | # ----------------------------------- 142 | 143 | from sklearn.model_selection import KFold, GroupKFold 144 | 145 | # Partition taking the user_id column to be the customer ID 146 | user_id = train_x['user_id'] 147 | unique_user_ids = user_id.unique() 148 | 149 | # Use the KFold class and partition using the customer ID 150 | scores = [] 151 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 152 | for tr_group_idx, va_group_idx in kf.split(unique_user_ids): 153 | # Partition using the customer ID (into data for training and validation) 154 | tr_groups, va_groups = unique_user_ids[tr_group_idx], unique_user_ids[va_group_idx] 155 | 156 | # Partition records based on whether the customer ID is in train/valid 157 | is_tr = user_id.isin(tr_groups) 158 | is_va = user_id.isin(va_groups) 159 | tr_x, va_x = train_x[is_tr], train_x[is_va] 160 | tr_y, va_y = train_y[is_tr], train_y[is_va] 161 | 162 | # (For reference)GroupKFold is difficult to use as you cannot shuffle or specify the random number seed 163 | kf = GroupKFold(n_splits=4) 164 | for tr_idx, va_idx in kf.split(train_x, train_y, user_id): 165 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 166 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 167 | 168 | # ----------------------------------- 169 | # leave-one-out 170 | # ----------------------------------- 171 | # Assume that there are only 100 data 172 | train_x = train_x.iloc[:100, :].copy() 173 | # ----------------------------------- 174 | from sklearn.model_selection import LeaveOneOut 175 | 176 | loo = LeaveOneOut() 177 | for tr_idx, va_idx in loo.split(train_x): 178 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 179 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 180 | -------------------------------------------------------------------------------- /ch06/ch06-03-hopt_nn.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # 
Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 14 | 15 | # Split training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # Suppress tensorflow warnings 24 | import os 25 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 26 | import tensorflow as tf 27 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 28 | 29 | # ----------------------------------- 30 | # Example of parameter tuning for a neural network 31 | # ----------------------------------- 32 | from hyperopt import hp 33 | from keras.callbacks import EarlyStopping 34 | from keras.layers.advanced_activations import ReLU, PReLU 35 | from keras.layers.core import Dense, Dropout 36 | from keras.layers.normalization import BatchNormalization 37 | from keras.models import Sequential 38 | from keras.optimizers import SGD, Adam 39 | from sklearn.preprocessing import StandardScaler 40 | 41 | # Fundamental parameters 42 | base_param = { 43 | 'input_dropout': 0.0, 44 | 'hidden_layers': 3, 45 | 'hidden_units': 96, 46 | 'hidden_activation': 'relu', 47 | 'hidden_dropout': 0.2, 48 | 'batch_norm': 'before_act', 49 | 'optimizer': {'type': 'adam', 'lr': 0.001}, 50 | 'batch_size': 64, 51 | } 52 | 53 | # Specify parameter space to search 54 | param_space = { 55 | 'input_dropout': hp.quniform('input_dropout', 0, 0.2, 0.05), 56 | 'hidden_layers': hp.quniform('hidden_layers', 2, 4, 1), 57 | 'hidden_units': hp.quniform('hidden_units', 32, 256, 32), 58 | 'hidden_activation': hp.choice('hidden_activation', ['prelu', 'relu']), 59 | 'hidden_dropout': hp.quniform('hidden_dropout', 0, 0.3, 0.05), 60 | 'batch_norm': hp.choice('batch_norm', ['before_act', 'no']), 61 | 'optimizer': hp.choice('optimizer', 62 | [{'type': 'adam', 63 | 'lr': hp.loguniform('adam_lr', np.log(0.00001), np.log(0.01))}, 64 | {'type': 'sgd', 65 | 'lr': hp.loguniform('sgd_lr', np.log(0.00001), np.log(0.01))}]), 66 | 'batch_size': hp.quniform('batch_size', 32, 128, 32), 67 | } 68 | 69 | 70 | class MLP: 71 | 72 | def __init__(self, params): 73 | self.params = params 74 | self.scaler = None 75 | self.model = None 76 | 77 | def fit(self, tr_x, tr_y, va_x, va_y): 78 | 79 | # Parameters 80 | input_dropout = self.params['input_dropout'] 81 | hidden_layers = int(self.params['hidden_layers']) 82 | hidden_units = int(self.params['hidden_units']) 83 | hidden_activation = self.params['hidden_activation'] 84 | hidden_dropout = self.params['hidden_dropout'] 85 | batch_norm = self.params['batch_norm'] 86 | optimizer_type = self.params['optimizer']['type'] 87 | optimizer_lr = self.params['optimizer']['lr'] 88 | batch_size = int(self.params['batch_size']) 89 | 90 | # Standardization 91 | self.scaler = StandardScaler() 92 | tr_x = self.scaler.fit_transform(tr_x) 93 | va_x = self.scaler.transform(va_x) 94 | 95 | self.model = Sequential() 96 | 97 | # Input layer 98 
| self.model.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],))) 99 | 100 | # Hidden layers 101 | for i in range(hidden_layers): 102 | self.model.add(Dense(hidden_units)) 103 | if batch_norm == 'before_act': 104 | self.model.add(BatchNormalization()) 105 | if hidden_activation == 'prelu': 106 | self.model.add(PReLU()) 107 | elif hidden_activation == 'relu': 108 | self.model.add(ReLU()) 109 | else: 110 | raise NotImplementedError 111 | self.model.add(Dropout(hidden_dropout)) 112 | 113 | # Output layer 114 | self.model.add(Dense(1, activation='sigmoid')) 115 | 116 | # Optimizer 117 | if optimizer_type == 'sgd': 118 | optimizer = SGD(lr=optimizer_lr, decay=1e-6, momentum=0.9, nesterov=True) 119 | elif optimizer_type == 'adam': 120 | optimizer = Adam(lr=optimizer_lr, beta_1=0.9, beta_2=0.999, decay=0.) 121 | else: 122 | raise NotImplementedError 123 | 124 | # Set objective function, metric etc. 125 | self.model.compile(loss='binary_crossentropy', 126 | optimizer=optimizer, metrics=['accuracy']) 127 | 128 | # Number of epochs, early stopping 129 | # Beware that with a small learning rate, training may not converge within the set number of epochs 130 | nb_epoch = 200 131 | patience = 20 132 | early_stopping = EarlyStopping(patience=patience, restore_best_weights=True) 133 | 134 | # Execute training 135 | history = self.model.fit(tr_x, tr_y, 136 | epochs=nb_epoch, 137 | batch_size=batch_size, verbose=1, 138 | validation_data=(va_x, va_y), 139 | callbacks=[early_stopping]) 140 | 141 | def predict(self, x): 142 | # Predictions 143 | x = self.scaler.transform(x) 144 | y_pred = self.model.predict(x) 145 | y_pred = y_pred.flatten() 146 | return y_pred 147 | 148 | 149 | # ----------------------------------- 150 | # Hyperparameter tuning 151 | 152 | from hyperopt import fmin, tpe, STATUS_OK, Trials 153 | from sklearn.metrics import log_loss 154 | 155 | 156 | def score(params): 157 | # Define the function to minimize for a given set of parameters 158 | # Here, train a model with the given parameters and return the score of its predictions on the validation data 159 | model = MLP(params) 160 | model.fit(tr_x, tr_y, va_x, va_y) 161 | va_pred = model.predict(va_x) 162 | score = log_loss(va_y, va_pred) 163 | print(f'params: {params}, logloss: {score:.4f}') 164 | 165 | # Save the information 166 | history.append((params, score)) 167 | 168 | return {'loss': score, 'status': STATUS_OK} 169 | 170 | 171 | # Use hyperopt for parameter search 172 | max_evals = 10 173 | trials = Trials() 174 | history = [] 175 | fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals) 176 | 177 | # Output parameters and scores from saved information 178 | # Trials provides some information, but using it to obtain the parameters is difficult in practice 179 | history = sorted(history, key=lambda tpl: tpl[1]) 180 | best = history[0] 181 | print(f'best params:{best[0]}, score:{best[1]:.4f}') 182 | -------------------------------------------------------------------------------- /ch02/ch02-01-metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Regression 6 | # ----------------------------------- 7 | # rmse 8 | 9 | from sklearn.metrics import mean_squared_error 10 | 11 | # y_true are the true values, y_pred are the predictions 12 | y_true = [1.0, 1.5, 2.0, 1.2, 1.8] 13 | y_pred = [0.8, 1.5, 1.8, 1.3, 3.0] 14 | 15 | rmse = np.sqrt(mean_squared_error(y_true, y_pred)) 16 | 
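# Supplementary note (not part of the original script): with the scikit-learn version pinned in the readme
# (0.21.2), taking the square root manually as above is required; in newer scikit-learn releases (>= 0.22)
# the same value could be obtained directly with
#   rmse = mean_squared_error(y_true, y_pred, squared=False)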
print(rmse) 17 | # 0.5532 18 | 19 | # ----------------------------------- 20 | # Binary classification 21 | # ----------------------------------- 22 | # Confusion matrix 23 | 24 | from sklearn.metrics import confusion_matrix 25 | 26 | # True values and predicted values are binary, i.e. either 0 or 1 27 | y_true = [1, 0, 1, 1, 0, 1, 1, 0] 28 | y_pred = [0, 0, 1, 1, 0, 0, 1, 1] 29 | 30 | tp = np.sum((np.array(y_true) == 1) & (np.array(y_pred) == 1)) 31 | tn = np.sum((np.array(y_true) == 0) & (np.array(y_pred) == 0)) 32 | fp = np.sum((np.array(y_true) == 0) & (np.array(y_pred) == 1)) 33 | fn = np.sum((np.array(y_true) == 1) & (np.array(y_pred) == 0)) 34 | 35 | confusion_matrix1 = np.array([[tp, fp], 36 | [fn, tn]]) 37 | print(confusion_matrix1) 38 | # array([[3, 1], 39 | # [2, 2]]) 40 | 41 | # Can also be created using the confusion_matrix() function from scikit-learn's metrics, but 42 | # be aware that the arrangement of the confusion matrix elements may be different 43 | confusion_matrix2 = confusion_matrix(y_true, y_pred) 44 | print(confusion_matrix2) 45 | # array([[2, 1], 46 | # [2, 3]]) 47 | 48 | # ----------------------------------- 49 | # accuracy 50 | 51 | from sklearn.metrics import accuracy_score 52 | 53 | # True values and predicted values are binary, i.e. either 0 or 1 54 | y_true = [1, 0, 1, 1, 0, 1, 1, 0] 55 | y_pred = [0, 0, 1, 1, 0, 0, 1, 1] 56 | accuracy = accuracy_score(y_true, y_pred) 57 | print(accuracy) 58 | # 0.625 59 | 60 | # ----------------------------------- 61 | # logloss 62 | 63 | from sklearn.metrics import log_loss 64 | 65 | # True values are binary (0 or 1), predicted values are probabilities 66 | y_true = [1, 0, 1, 1, 0, 1] 67 | y_prob = [0.1, 0.2, 0.8, 0.8, 0.1, 0.3] 68 | 69 | logloss = log_loss(y_true, y_prob) 70 | print(logloss) 71 | # 0.7136 72 | 73 | # ----------------------------------- 74 | # Multi-class classification 75 | # ----------------------------------- 76 | # multi-class logloss 77 | 78 | from sklearn.metrics import log_loss 79 | 80 | # True values are 3-class classifiers, predicted values are probabilities for each class 81 | y_true = np.array([0, 2, 1, 2, 2]) 82 | y_pred = np.array([[0.68, 0.32, 0.00], 83 | [0.00, 0.00, 1.00], 84 | [0.60, 0.40, 0.00], 85 | [0.00, 0.00, 1.00], 86 | [0.28, 0.12, 0.60]]) 87 | logloss = log_loss(y_true, y_pred) 88 | print(logloss) 89 | # 0.3626 90 | 91 | # ----------------------------------- 92 | # Multi-label classification 93 | # ----------------------------------- 94 | # mean_f1, macro_f1, micro_f1 95 | 96 | from sklearn.metrics import f1_score 97 | 98 | # For calculating performance metric of multi-label classification, it is easier to handle the true / predicted values as binary matrices of record x class 99 | # True values - [[1,2], [1], [1,2,3], [2,3], [3]] 100 | y_true = np.array([[1, 1, 0], 101 | [1, 0, 0], 102 | [1, 1, 1], 103 | [0, 1, 1], 104 | [0, 0, 1]]) 105 | 106 | # Predicted values - [[1,3], [2], [1,3], [3], [3]] 107 | y_pred = np.array([[1, 0, 1], 108 | [0, 1, 0], 109 | [1, 0, 1], 110 | [0, 0, 1], 111 | [0, 0, 1]]) 112 | 113 | # mean_f1 is the mean of the F1-scores for each record 114 | mean_f1 = np.mean([f1_score(y_true[i, :], y_pred[i, :]) for i in range(len(y_true))]) 115 | 116 | # macro_f1 is the mean of the F1-scores for each class 117 | n_class = 3 118 | macro_f1 = np.mean([f1_score(y_true[:, c], y_pred[:, c]) for c in range(n_class)]) 119 | 120 | # micro-f1 is the F1-score calculate using the true/predicted values for each record-class pair 121 | micro_f1 = f1_score(y_true.reshape(-1), 
y_pred.reshape(-1)) 122 | 123 | print(mean_f1, macro_f1, micro_f1) 124 | # 0.5933, 0.5524, 0.6250 125 | 126 | # Can also be calculated using a scikit-learn function 127 | mean_f1 = f1_score(y_true, y_pred, average='samples') 128 | macro_f1 = f1_score(y_true, y_pred, average='macro') 129 | micro_f1 = f1_score(y_true, y_pred, average='micro') 130 | 131 | # ----------------------------------- 132 | # Multi-class classification with ordered classes 133 | # ----------------------------------- 134 | # quadratic weighted kappa 135 | 136 | from sklearn.metrics import confusion_matrix, cohen_kappa_score 137 | 138 | 139 | # Function for calculating quadratic weighted kappa 140 | def quadratic_weighted_kappa(c_matrix): 141 | numer = 0.0 142 | denom = 0.0 143 | 144 | for i in range(c_matrix.shape[0]): 145 | for j in range(c_matrix.shape[1]): 146 | n = c_matrix.shape[0] 147 | wij = ((i - j) ** 2.0) 148 | oij = c_matrix[i, j] 149 | eij = c_matrix[i, :].sum() * c_matrix[:, j].sum() / c_matrix.sum() 150 | numer += wij * oij 151 | denom += wij * eij 152 | 153 | return 1.0 - numer / denom 154 | 155 | 156 | # y_true is the true class list, y_pred is the predicted class list 157 | y_true = [1, 2, 3, 4, 3] 158 | y_pred = [2, 2, 4, 4, 5] 159 | 160 | # Calculate the confusion matrix 161 | c_matrix = confusion_matrix(y_true, y_pred, labels=[1, 2, 3, 4, 5]) 162 | 163 | # Calculate quadratic weighted kappa 164 | kappa = quadratic_weighted_kappa(c_matrix) 165 | print(kappa) 166 | # 0.6153 167 | 168 | # Can also be calculated using a scikit-learn function 169 | kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic') 170 | 171 | # ----------------------------------- 172 | # Recommendation 173 | # ----------------------------------- 174 | # MAP@K 175 | 176 | # K=3, with 5 records and 4 class types 177 | K = 3 178 | 179 | # True values for each record 180 | y_true = [[1, 2], [1, 2], [4], [1, 2, 3, 4], [3, 4]] 181 | 182 | # Predicted values for each record - as K=3, usually predict order of 3 records for each class 183 | y_pred = [[1, 2, 4], [4, 1, 2], [1, 4, 3], [1, 2, 3], [1, 2, 4]] 184 | 185 | 186 | # Function to calculate the average precision for each record 187 | def apk(y_i_true, y_i_pred): 188 | # Length of y_pred must be less than or equal to K, and all elements must be unique 189 | assert (len(y_i_pred) <= K) 190 | assert (len(np.unique(y_i_pred)) == len(y_i_pred)) 191 | 192 | sum_precision = 0.0 193 | num_hits = 0.0 194 | 195 | for i, p in enumerate(y_i_pred): 196 | if p in y_i_true: 197 | num_hits += 1 198 | precision = num_hits / (i + 1) 199 | sum_precision += precision 200 | 201 | return sum_precision / min(len(y_i_true), K) 202 | 203 | 204 | # Function for calculating MAP@K 205 | def mapk(y_true, y_pred): 206 | return np.mean([apk(y_i_true, y_i_pred) for y_i_true, y_i_pred in zip(y_true, y_pred)]) 207 | 208 | 209 | # Calculate MAP@K 210 | print(mapk(y_true, y_pred)) 211 | # 0.65 212 | 213 | # Even if the number of true values is the same, if the order is different then the score will be different 214 | print(apk(y_true[0], y_pred[0])) 215 | print(apk(y_true[1], y_pred[1])) 216 | # 1.0, 0.5833 217 | -------------------------------------------------------------------------------- /ch03/ch03-01-numerical.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y contains the target values, test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Save training and test datasets in their original form for explanations 16 | train_x_saved = train_x.copy() 17 | test_x_saved = test_x.copy() 18 | 19 | 20 | # Function to recover original training and test datasets 21 | def load_data(): 22 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 23 | return train_x, test_x 24 | 25 | 26 | # Store names of numerical variables to be converted in list 27 | num_cols = ['age', 'height', 'weight', 'amount', 28 | 'medical_info_a1', 'medical_info_a2', 'medical_info_a3', 'medical_info_b1'] 29 | 30 | # ----------------------------------- 31 | # Standardization 32 | # ----------------------------------- 33 | # Load the data 34 | train_x, test_x = load_data() 35 | # ----------------------------------- 36 | from sklearn.preprocessing import StandardScaler 37 | 38 | # Compute standardization parameters for multiple columns of the training data 39 | scaler = StandardScaler() 40 | scaler.fit(train_x[num_cols]) 41 | 42 | # Replace columns with standardized values 43 | train_x[num_cols] = scaler.transform(train_x[num_cols]) 44 | test_x[num_cols] = scaler.transform(test_x[num_cols]) 45 | 46 | # ----------------------------------- 47 | # Load the data 48 | train_x, test_x = load_data() 49 | # ----------------------------------- 50 | from sklearn.preprocessing import StandardScaler 51 | 52 | # Compute standardization parameters for multiple columns from combined training and test data 53 | scaler = StandardScaler() 54 | scaler.fit(pd.concat([train_x[num_cols], test_x[num_cols]])) 55 | 56 | # Replace columns with standardized values 57 | train_x[num_cols] = scaler.transform(train_x[num_cols]) 58 | test_x[num_cols] = scaler.transform(test_x[num_cols]) 59 | 60 | # ----------------------------------- 61 | # Load the data 62 | train_x, test_x = load_data() 63 | # ----------------------------------- 64 | from sklearn.preprocessing import StandardScaler 65 | 66 | # Standardize training and test data separately (bad example) 67 | scaler_train = StandardScaler() 68 | scaler_train.fit(train_x[num_cols]) 69 | train_x[num_cols] = scaler_train.transform(train_x[num_cols]) 70 | scaler_test = StandardScaler() 71 | scaler_test.fit(test_x[num_cols]) 72 | test_x[num_cols] = scaler_test.transform(test_x[num_cols]) 73 | 74 | # ----------------------------------- 75 | # Min-Max scaling 76 | # ----------------------------------- 77 | # Load the data 78 | train_x, test_x = load_data() 79 | # ----------------------------------- 80 | from sklearn.preprocessing import MinMaxScaler 81 | 82 | # Compute parameters for min-max scaling for multiple columns of the training data 83 | scaler = MinMaxScaler() 84 | scaler.fit(train_x[num_cols]) 85 | 86 | # Replace columns with min-max scaled values 87 | train_x[num_cols] = scaler.transform(train_x[num_cols]) 88 | test_x[num_cols] = scaler.transform(test_x[num_cols]) 89 | 90 | # ----------------------------------- 91 | # Logarithmic transformation 92 | # ----------------------------------- 93 | x = np.array([1.0, 10.0, 100.0, 1000.0, 10000.0]) 94 | 95 | # 
Take simple logarithm 96 | x1 = np.log(x) 97 | 98 | # Take logarithm of x+1 99 | x2 = np.log1p(x) 100 | 101 | # Apply original sign to logarithm taken of absolute value 102 | x3 = np.sign(x) * np.log(np.abs(x)) 103 | 104 | # ----------------------------------- 105 | # Box-Cox transformation 106 | # ----------------------------------- 107 | # Load the data 108 | train_x, test_x = load_data() 109 | # ----------------------------------- 110 | 111 | # Store only columns that take positive values in a list for transformation 112 | # Note: when including missing values it is necessary to use (~(train_x[c] <= 0.0)).all() etc. 113 | pos_cols = [c for c in num_cols if (train_x[c] > 0.0).all() and (test_x[c] > 0.0).all()] 114 | 115 | from sklearn.preprocessing import PowerTransformer 116 | 117 | # Fit Box-Cox transformation to the columns with positive values in the training data 118 | pt = PowerTransformer(method='box-cox') 119 | pt.fit(train_x[pos_cols]) 120 | 121 | # Replace columns with transformed data 122 | train_x[pos_cols] = pt.transform(train_x[pos_cols]) 123 | test_x[pos_cols] = pt.transform(test_x[pos_cols]) 124 | 125 | # ----------------------------------- 126 | # Yeo-Johnson transformation 127 | # ----------------------------------- 128 | # Load the data 129 | train_x, test_x = load_data() 130 | # ----------------------------------- 131 | 132 | from sklearn.preprocessing import PowerTransformer 133 | 134 | # Compute parameters for Yeo-Johnnson transformation for multiple columns of the training data 135 | pt = PowerTransformer(method='yeo-johnson') 136 | pt.fit(train_x[num_cols]) 137 | 138 | # Replace columns with transformed data 139 | train_x[num_cols] = pt.transform(train_x[num_cols]) 140 | test_x[num_cols] = pt.transform(test_x[num_cols]) 141 | 142 | # ----------------------------------- 143 | # Clipping 144 | # ----------------------------------- 145 | # Load the data 146 | train_x, test_x = load_data() 147 | # ----------------------------------- 148 | # Calculate 1% and 99% limits of each column of the training data 149 | p01 = train_x[num_cols].quantile(0.01) 150 | p99 = train_x[num_cols].quantile(0.99) 151 | 152 | # Clip out values in the 1st and 99th percentiles 153 | train_x[num_cols] = train_x[num_cols].clip(p01, p99, axis=1) 154 | test_x[num_cols] = test_x[num_cols].clip(p01, p99, axis=1) 155 | 156 | # ----------------------------------- 157 | # Binning 158 | # ----------------------------------- 159 | x = [1, 7, 5, 4, 6, 3] 160 | 161 | # Use cut() function in pandas for binning 162 | 163 | # Case where you specify the number of bins 164 | binned = pd.cut(x, 3, labels=False) 165 | print(binned) 166 | # [0 2 1 1 2 0] - shows which of the three bins the converted values are in 167 | 168 | # Case where you specify the bin ranges (<3.0, 3.0->5.0, >5.0) 169 | bin_edges = [-float('inf'), 3.0, 5.0, float('inf')] 170 | binned = pd.cut(x, bin_edges, labels=False) 171 | print(binned) 172 | # [0 2 1 1 2 0] - shows which of the three bins the converted values are in 173 | 174 | # ----------------------------------- 175 | # Rank transformation 176 | # ----------------------------------- 177 | x = [10, 20, 30, 0, 40, 40] 178 | 179 | # Use rank() function in pandas for rank transformation 180 | rank = pd.Series(x).rank() 181 | print(rank.values) 182 | # First value is 1, mean rank is given for values in equal position 183 | # [2. 3. 4. 1. 
5.5 5.5] 184 | 185 | # Also possible to to apply argsort() function in numpy twice to make rank transformation 186 | order = np.argsort(x) 187 | rank = np.argsort(order) 188 | print(rank) 189 | # First value is zero, equal position values are ordered by whichever is first 190 | # [1 2 3 0 4 5] 191 | 192 | # ----------------------------------- 193 | # RankGauss 194 | # ----------------------------------- 195 | # Load the data 196 | train_x, test_x = load_data() 197 | # ----------------------------------- 198 | from sklearn.preprocessing import QuantileTransformer 199 | 200 | # Compute parameters for Rank-Gauss transformation for multiple columns of the training data 201 | transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal') 202 | transformer.fit(train_x[num_cols]) 203 | 204 | # Replace columns with transformed data 205 | train_x[num_cols] = transformer.transform(train_x[num_cols]) 206 | test_x[num_cols] = transformer.transform(test_x[num_cols]) 207 | -------------------------------------------------------------------------------- /ch04-model-interface/code/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from model import Model 4 | from sklearn.metrics import log_loss 5 | from sklearn.model_selection import StratifiedKFold 6 | from typing import Callable, List, Optional, Tuple, Union 7 | 8 | from util import Logger, Util 9 | 10 | logger = Logger() 11 | 12 | 13 | class Runner: 14 | 15 | def __init__(self, run_name: str, model_cls: Callable[[str, dict], Model], features: List[str], params: dict): 16 | """Constructor 17 | 18 | :param run_name: Run name 19 | :param model_cls: Model class 20 | :param features: List of features 21 | :param params: Hyperparameters 22 | """ 23 | self.run_name = run_name 24 | self.model_cls = model_cls 25 | self.features = features 26 | self.params = params 27 | self.n_fold = 4 28 | 29 | def train_fold(self, i_fold: Union[int, str]) -> Tuple[ 30 | Model, Optional[np.array], Optional[np.array], Optional[float]]: 31 | """Specify cross validation, train then calculate score 32 | 33 | In addition to calling from other methods, this is also used itself for checks and to adjust parameters 34 | 35 | :param i_fold: fold number (when everything use 'all') 36 | :return: Tuple containing (model instance, record index, predictions, validation score) 37 | """ 38 | # Load training data 39 | validation = i_fold != 'all' 40 | train_x = self.load_x_train() 41 | train_y = self.load_y_train() 42 | 43 | if validation: 44 | # Set training and validation data 45 | tr_idx, va_idx = self.load_index_fold(i_fold) 46 | tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx] 47 | va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx] 48 | 49 | # Train model 50 | model = self.build_model(i_fold) 51 | model.train(tr_x, tr_y, va_x, va_y) 52 | 53 | # Make predictions using validation data and calculate score 54 | va_pred = model.predict(va_x) 55 | score = log_loss(va_y, va_pred, eps=1e-15, normalize=True) 56 | 57 | # Return model, index, predictions and score 58 | return model, va_idx, va_pred, score 59 | else: 60 | # Train using all training data 61 | model = self.build_model(i_fold) 62 | model.train(train_x, train_y) 63 | 64 | # Return model 65 | return model, None, None, None 66 | 67 | def run_train_cv(self) -> None: 68 | """Training and evaluation using cross validation 69 | 70 | Train, score, save each fold model, output score to log 71 | """ 72 | 
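# Illustrative usage sketch (ModelXGB, features and params are placeholders for a
# Model subclass, a feature-name list and a hyperparameter dict defined elsewhere):
#     runner = Runner('xgb1', ModelXGB, features, params)
#     runner.run_train_cv()    # trains each fold, saves the fold models and the
#                              # out-of-fold predictions (../model/pred/xgb1-train.pkl)
#     runner.run_predict_cv()  # averages the fold models' predictions for the
#                              # test data (../model/pred/xgb1-test.pkl)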
logger.info(f'{self.run_name} - start training cv') 73 | 74 | scores = [] 75 | va_idxes = [] 76 | preds = [] 77 | 78 | # Train on each fold 79 | for i_fold in range(self.n_fold): 80 | # Train 81 | logger.info(f'{self.run_name} fold {i_fold} - start training') 82 | model, va_idx, va_pred, score = self.train_fold(i_fold) 83 | logger.info(f'{self.run_name} fold {i_fold} - end training - score {score}') 84 | 85 | # Save model 86 | model.save_model() 87 | 88 | # Retain results 89 | va_idxes.append(va_idx) 90 | scores.append(score) 91 | preds.append(va_pred) 92 | 93 | # Gather results for all folds 94 | va_idxes = np.concatenate(va_idxes) 95 | order = np.argsort(va_idxes) 96 | preds = np.concatenate(preds, axis=0) 97 | preds = preds[order] 98 | 99 | logger.info(f'{self.run_name} - end training cv - score {np.mean(scores)}') 100 | 101 | # Save predictions 102 | Util.dump(preds, f'../model/pred/{self.run_name}-train.pkl') 103 | 104 | # Save scores 105 | logger.result_scores(self.run_name, scores) 106 | 107 | def run_predict_cv(self) -> None: 108 | """Take average of results from models trained on each fold and make predictions for test data 109 | 110 | Necessary to run_train_cv beforehand 111 | """ 112 | logger.info(f'{self.run_name} - start prediction cv') 113 | 114 | test_x = self.load_x_test() 115 | 116 | preds = [] 117 | 118 | # Train on each fold 119 | for i_fold in range(self.n_fold): 120 | logger.info(f'{self.run_name} - start prediction fold:{i_fold}') 121 | model = self.build_model(i_fold) 122 | model.load_model() 123 | pred = model.predict(test_x) 124 | preds.append(pred) 125 | logger.info(f'{self.run_name} - end prediction fold:{i_fold}') 126 | 127 | # Output mean value of predictions 128 | pred_avg = np.mean(preds, axis=0) 129 | 130 | # Save predictions 131 | Util.dump(pred_avg, f'../model/pred/{self.run_name}-test.pkl') 132 | 133 | logger.info(f'{self.run_name} - end prediction cv') 134 | 135 | def run_train_all(self) -> None: 136 | """Train using all training data and save model""" 137 | logger.info(f'{self.run_name} - start training all') 138 | 139 | # Train on all training data 140 | i_fold = 'all' 141 | model, _, _, _ = self.train_fold(i_fold) 142 | model.save_model() 143 | 144 | logger.info(f'{self.run_name} - end training all') 145 | 146 | def run_predict_all(self) -> None: 147 | """Make predictions using model trained with all training data 148 | 149 | Necessary to run_train_all beforehand 150 | """ 151 | logger.info(f'{self.run_name} - start prediction all') 152 | 153 | test_x = self.load_x_test() 154 | 155 | # Make predictions using model trained on all training data 156 | i_fold = 'all' 157 | model = self.build_model(i_fold) 158 | model.load_model() 159 | pred = model.predict(test_x) 160 | 161 | # Save predictions 162 | Util.dump(pred, f'../model/pred/{self.run_name}-test.pkl') 163 | 164 | logger.info(f'{self.run_name} - end prediction all') 165 | 166 | def build_model(self, i_fold: Union[int, str]) -> Model: 167 | """Specify cross validation fold and create model 168 | 169 | :param i_fold: fold number 170 | :return: model instance 171 | """ 172 | # Create model from run name, fold and model class 173 | run_fold_name = f'{self.run_name}-{i_fold}' 174 | return self.model_cls(run_fold_name, self.params) 175 | 176 | def load_x_train(self) -> pd.DataFrame: 177 | """Load features of training data 178 | 179 | :return: Training data features 180 | """ 181 | # Load training data 182 | # Note you must modify this method if you want to do anything more than just extraction by column 
name 183 | # As it is inefficient to load train.csv every time, use this method appropriately for the data (same applies for other methods also) 184 | return pd.read_csv('../input/train.csv')[self.features] 185 | 186 | def load_y_train(self) -> pd.Series: 187 | """Load target values of training data 188 | 189 | :return: Training data target values 190 | """ 191 | # Load target values 192 | train_y = pd.read_csv('../input/train.csv')['target'] 193 | train_y = np.array([int(st[-1]) for st in train_y]) - 1 194 | train_y = pd.Series(train_y) 195 | return train_y 196 | 197 | def load_x_test(self) -> pd.DataFrame: 198 | """Load features of test data 199 | 200 | :return: Test data features 201 | """ 202 | return pd.read_csv('../input/test.csv')[self.features] 203 | 204 | def load_index_fold(self, i_fold: int) -> np.array: 205 | """Specify cross validation fold and return corresponding record index 206 | 207 | :param i_fold: Fold number 208 | :return: Record index of corresponding fold 209 | """ 210 | # Return index that separates training and validation data 211 | # Here a random number is created every time, so there is also a method to save it to a file 212 | train_y = self.load_y_train() 213 | dummy_x = np.zeros(len(train_y)) 214 | skf = StratifiedKFold(n_splits=self.n_fold, shuffle=True, random_state=71) 215 | return list(skf.split(dummy_x, train_y))[i_fold] 216 | -------------------------------------------------------------------------------- /ch01/ch01-01-titanic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Load the training and test data 6 | # ----------------------------------- 7 | # Load the training and test data 8 | train = pd.read_csv('../input/ch01-titanic/train.csv') 9 | test = pd.read_csv('../input/ch01-titanic/test.csv') 10 | 11 | # Split the training data into features and target values 12 | train_x = train.drop(['Survived'], axis=1) 13 | train_y = train['Survived'] 14 | 15 | # The test data only contains features, so is ok as is 16 | test_x = test.copy() 17 | 18 | # ----------------------------------- 19 | # Feature engineering 20 | # ----------------------------------- 21 | from sklearn.preprocessing import LabelEncoder 22 | 23 | # Drop the PassengerID variables 24 | train_x = train_x.drop(['PassengerId'], axis=1) 25 | test_x = test_x.drop(['PassengerId'], axis=1) 26 | 27 | # Drop the Name, Ticket & Cabin variables 28 | train_x = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1) 29 | test_x = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1) 30 | 31 | # Apply label encoding to categorical variables 32 | for c in ['Sex', 'Embarked']: 33 | # Fit the labels using the training data 34 | le = LabelEncoder() 35 | le.fit(train_x[c].fillna('NA')) 36 | 37 | # Return the encoded labels for the training and test data 38 | train_x[c] = le.transform(train_x[c].fillna('NA')) 39 | test_x[c] = le.transform(test_x[c].fillna('NA')) 40 | 41 | # ----------------------------------- 42 | # Model creation 43 | # ----------------------------------- 44 | from xgboost import XGBClassifier 45 | 46 | # Create the model and fit it using the training data 47 | model = XGBClassifier(n_estimators=20, random_state=71) 48 | model.fit(train_x, train_y) 49 | 50 | # Output predicted probabilities for the test data 51 | pred = model.predict_proba(test_x)[:, 1] 52 | 53 | # Convert into binary predictions 54 | pred_label = np.where(pred > 0.5, 1, 0) 55 | 56 | # Create a submission 
file 57 | submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_label}) 58 | submission.to_csv('submission_first.csv', index=False) 59 | # Score: 0.7799 (it is possible that this value differs from the one in the book) 60 | 61 | # ----------------------------------- 62 | # Validation 63 | # ----------------------------------- 64 | from sklearn.metrics import log_loss, accuracy_score 65 | from sklearn.model_selection import KFold 66 | 67 | # Create lists to store the scores for each fold 68 | scores_accuracy = [] 69 | scores_logloss = [] 70 | 71 | # Perform cross validation 72 | # Split the training data into 4, use 1 part for validation, then use the next part for validation, and so on... 73 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 74 | for tr_idx, va_idx in kf.split(train_x): 75 | # Split the training data into training and validation data 76 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 77 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 78 | 79 | # Train the model 80 | model = XGBClassifier(n_estimators=20, random_state=71) 81 | model.fit(tr_x, tr_y) 82 | 83 | # Output predicted probabilities for the validation data 84 | va_pred = model.predict_proba(va_x)[:, 1] 85 | 86 | # Calculate scores for the validation data 87 | logloss = log_loss(va_y, va_pred) 88 | accuracy = accuracy_score(va_y, va_pred > 0.5) 89 | 90 | # Store the scores for this fold 91 | scores_logloss.append(logloss) 92 | scores_accuracy.append(accuracy) 93 | 94 | # Calculate the mean scores using all folds 95 | logloss = np.mean(scores_logloss) 96 | accuracy = np.mean(scores_accuracy) 97 | print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}') 98 | # logloss: 0.4270, accuracy: 0.8148 (it is possible that these values differ from the book) 99 | 100 | # ----------------------------------- 101 | # Model tuning 102 | # ----------------------------------- 103 | import itertools 104 | 105 | # Prepare candidate tuning parameters 106 | param_space = { 107 | 'max_depth': [3, 5, 7], 108 | 'min_child_weight': [1.0, 2.0, 4.0] 109 | } 110 | 111 | # Hyperparamter combinations to try 112 | param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight']) 113 | 114 | # Create lists to store scores for the different hyperparameter combinations 115 | params = [] 116 | scores = [] 117 | 118 | # Perform cross validation for each hyperparameter combination 119 | for max_depth, min_child_weight in param_combinations: 120 | 121 | score_folds = [] 122 | # Perform cross validation 123 | # Split the training data into 4, use 1 part for validation, then use the next part for validation, and so on... 
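# With the candidate values above, this grid search evaluates 3 x 3 = 9
# (max_depth, min_child_weight) combinations, each scored with 4-fold cross
# validation, i.e. 36 model fits in total; the combination with the lowest
# mean logloss is selected further below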
124 | kf = KFold(n_splits=4, shuffle=True, random_state=123456) 125 | for tr_idx, va_idx in kf.split(train_x): 126 | # Split the training data into training and validation data 127 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 128 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 129 | 130 | # Train the model 131 | model = XGBClassifier(n_estimators=20, random_state=71, 132 | max_depth=max_depth, min_child_weight=min_child_weight) 133 | model.fit(tr_x, tr_y) 134 | 135 | # Output predicted probabilities for the validation data 136 | va_pred = model.predict_proba(va_x)[:, 1] 137 | logloss = log_loss(va_y, va_pred) 138 | score_folds.append(logloss) 139 | 140 | # Calculate the mean score using all folds 141 | score_mean = np.mean(score_folds) 142 | 143 | # Store the scores for this hyperparameter combination 144 | params.append((max_depth, min_child_weight)) 145 | scores.append(score_mean) 146 | 147 | # Set the parameters to the best values giving the highest score 148 | best_idx = np.argsort(scores)[0] 149 | best_param = params[best_idx] 150 | print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}') 151 | # Best score is with max_depth=7, min_child_weight=2.0 152 | 153 | 154 | # ----------------------------------- 155 | # Create features for logistic regression 156 | # ----------------------------------- 157 | from sklearn.preprocessing import OneHotEncoder 158 | 159 | # Copy the original datasets 160 | train_x2 = train.drop(['Survived'], axis=1) 161 | test_x2 = test.copy() 162 | 163 | # Drop the PassengerID variables 164 | train_x2 = train_x2.drop(['PassengerId'], axis=1) 165 | test_x2 = test_x2.drop(['PassengerId'], axis=1) 166 | 167 | # Drop the Name, Ticket & Cabin variables 168 | train_x2 = train_x2.drop(['Name', 'Ticket', 'Cabin'], axis=1) 169 | test_x2 = test_x2.drop(['Name', 'Ticket', 'Cabin'], axis=1) 170 | 171 | # Perform one-hot encoding 172 | cat_cols = ['Sex', 'Embarked', 'Pclass'] 173 | ohe = OneHotEncoder(categories='auto', sparse=False) 174 | ohe.fit(train_x2[cat_cols].fillna('NA')) 175 | 176 | # Create column names for dummy one-hot encoding variables 177 | ohe_columns = [] 178 | for i, c in enumerate(cat_cols): 179 | ohe_columns += [f'{c}_{v}' for v in ohe.categories_[i]] 180 | 181 | # Create DataFrames for one-hot encoding 182 | ohe_train_x2 = pd.DataFrame(ohe.transform(train_x2[cat_cols].fillna('NA')), columns=ohe_columns) 183 | ohe_test_x2 = pd.DataFrame(ohe.transform(test_x2[cat_cols].fillna('NA')), columns=ohe_columns) 184 | 185 | # Drop the original columns that were used for one-hot encoding 186 | train_x2 = train_x2.drop(cat_cols, axis=1) 187 | test_x2 = test_x2.drop(cat_cols, axis=1) 188 | 189 | # Append the one-hot encoded columns 190 | train_x2 = pd.concat([train_x2, ohe_train_x2], axis=1) 191 | test_x2 = pd.concat([test_x2, ohe_test_x2], axis=1) 192 | 193 | # Replace missing values in these columns with the means of the values that exist 194 | num_cols = ['Age', 'SibSp', 'Parch', 'Fare'] 195 | for col in num_cols: 196 | train_x2[col].fillna(train_x2[col].mean(), inplace=True) 197 | test_x2[col].fillna(train_x2[col].mean(), inplace=True) 198 | 199 | # Make a logarithmic transformation of the Fare variables 200 | train_x2['Fare'] = np.log1p(train_x2['Fare']) 201 | test_x2['Fare'] = np.log1p(test_x2['Fare']) 202 | 203 | # ----------------------------------- 204 | # Ensembling 205 | # ----------------------------------- 206 | from sklearn.linear_model import LogisticRegression 207 | 208 | # xgboost model 209 | model_xgb = 
XGBClassifier(n_estimators=20, random_state=71) 210 | model_xgb.fit(train_x, train_y) 211 | pred_xgb = model_xgb.predict_proba(test_x)[:, 1] 212 | 213 | # Logistic regression model 214 | # As the xgboost model uses differently engineered features, train_x2, test_x2 were created separately 215 | model_lr = LogisticRegression(solver='lbfgs', max_iter=300) 216 | model_lr.fit(train_x2, train_y) 217 | pred_lr = model_lr.predict_proba(test_x2)[:, 1] 218 | 219 | # Take a weighted average of the predictions 220 | pred = pred_xgb * 0.8 + pred_lr * 0.2 221 | pred_label = np.where(pred > 0.5, 1, 0) 222 | -------------------------------------------------------------------------------- /input/sample-data/input_create.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.tseries.offsets as offsets 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--test', action='store_true') 8 | args = parser.parse_args() 9 | 10 | 11 | class Util: 12 | 13 | @classmethod 14 | def iif(cls, cond, iftrue, ifelse): 15 | if cond: 16 | return iftrue 17 | else: 18 | return ifelse 19 | 20 | 21 | class Generator: 22 | 23 | def __init__(self): 24 | pass 25 | 26 | def choice_prob(self, prob, iftrue, ifelse): 27 | if self.r.rand() < prob: 28 | return iftrue 29 | else: 30 | return ifelse 31 | 32 | def run_all(self, seed=71, n=100): 33 | self.r = np.random.RandomState(seed) 34 | ds = [] 35 | for i in range(n): 36 | print(i) 37 | data, scores, target = self.run() 38 | ds.append(data + scores + target) 39 | 40 | columns = self.column_names() 41 | df = pd.DataFrame(ds, columns=columns) 42 | return df 43 | 44 | def run(self): 45 | 46 | # Information on insured person 47 | # Set score according to age, weight and BMI 48 | insured_age = self.r.choice(range(5, 80)) 49 | insured_sex = self.r.choice(['Male', 'Female'], p=[0.6, 0.4]) 50 | height = 160.0 + Util.iif(insured_sex == 'Male', 10.0, 0.0) + self.r.randn() * 8 51 | bmi = 22.0 + self.r.randn() * 3.0 52 | weight = height * height * bmi / 10000.0 53 | 54 | if insured_age <= 15: 55 | score_01 = 0.1 56 | elif 60 <= insured_age < 70: 57 | score_01 = 0.2 58 | elif 70 <= insured_age: 59 | score_01 = 0.3 60 | else: 61 | score_01 = 0.0 62 | 63 | if weight >= 85.0: 64 | score_02 = 0.2 65 | else: 66 | score_02 = 0.0 67 | 68 | if bmi <= 19.0: 69 | score_03 = (19.0 - bmi) * 0.15 70 | elif bmi >= 25.0: 71 | score_03 = (bmi - 25.0) * 0.15 72 | else: 73 | score_03 = 0.0 74 | 75 | # Product information 76 | # A1-A3, B1-B3, C1-C3, D1, E1 77 | # Set scores for types D, E and 2 78 | 79 | product = self.r.choice(list('ABCDE'), p=[0.5, 0.1, 0.25, 0.1, 0.05]) 80 | is_prodtype_1 = product in list('ABD') 81 | is_prodtype_2 = product in list('CE') 82 | is_prodtype_a = product in list('ABC') 83 | is_prodtype_b = product in list('DE') 84 | 85 | if is_prodtype_a: 86 | product_sub = self.r.choice([1, 2, 3], p=[0.4, 0.2, 0.4]) 87 | else: 88 | product_sub = 1 89 | product = '{}{}'.format(product, product_sub) 90 | 91 | if is_prodtype_b: 92 | score_21 = 0.2 93 | else: 94 | score_21 = 0.0 95 | if product_sub == 2: 96 | score_22 = 0.1 97 | else: 98 | score_22 = 0.0 99 | 100 | # Insurance premiums - premiums vary according to product type 101 | # Set score for large premiums 102 | amount_raw = self.r.choice(range(1, 11)) 103 | if is_prodtype_1: 104 | amount = amount_raw * 1000 * 1000 105 | else: 106 | amount = amount_raw * 1000 107 | 108 | if amount_raw > 5: 109 | score_23 = 0.1 110 | else: 
111 | score_23 = 0.0 112 | 113 | # No score for application date 114 | date_start = pd.to_datetime('2015/1/1') 115 | date_end = pd.to_datetime('2016/12/31') 116 | days = (date_end - date_start).days 117 | 118 | app_date = date_start + offsets.Day(self.r.choice(range(days))) 119 | app_year, app_month, app_day = app_date.year, app_date.month, app_date.day 120 | application_date = '{}/{}/{}'.format(app_year, app_month, app_day) 121 | 122 | # Medical information 123 | # Set scores based on a1, difference between a1 and a2, and when a3 is greater than or equal to 5 124 | # No score for b 125 | medical_info_a1 = int(250.0 + 100.0 * self.r.rand() + 100.0 * self.r.randn()) 126 | medical_info_a2 = int(200.0 + 100.0 * self.r.rand() + 100.0 * self.r.randn()) 127 | medical_info_a3 = self.r.poisson(lam=2) 128 | 129 | medical_info_b1 = int(10.0 + 10.0 * self.r.rand()) 130 | medical_info_b2 = self.r.choice([1,2,3,9], p=[0.5, 0.25, 0.2, 0.05]) 131 | medical_info_b3 = self.r.choice(list('ABCDEFGH') + list('abcde') + ['1', '2', '3', '4']) 132 | 133 | medical_info_c1 = self.r.choice([np.nan, self.r.poisson(lam=1.5)], p=[0.3, 0.7]) 134 | medical_info_c2 = self.r.choice([np.nan, self.r.uniform(8.0, 22.0)], p=[0.8, 0.2]) 135 | 136 | if medical_info_a1 > 350: 137 | score_41 = 0.2 138 | else: 139 | score_41 = 0.0 140 | 141 | medical_info_a_diff = medical_info_a1 - medical_info_a2 142 | score_42 = max(np.abs(medical_info_a_diff) - 100.0, 0.0) / 800.0 143 | 144 | if medical_info_a3 >= 5: 145 | score_43 = 0.2 146 | else: 147 | score_43 = 0.0 148 | 149 | if np.isnan(medical_info_c1): 150 | score_44 = -0.1 151 | elif medical_info_c1 >= 3: 152 | score_44 = 0.1 153 | else: 154 | score_44 = 0.0 155 | 156 | # Medical information binary variable 157 | # 1-5 are related with a score 158 | # 6-7 are related with a score for women only 159 | # 8-10 have no relation with a score 160 | medical_keyword_probs = np.array([ 161 | 0.8, 0.5, 0.2, 0.05, 0.02, 0.4, 0.1, 0.8, 0.3, 0.05, 162 | ]) 163 | medical_keywords = [] 164 | for prob in medical_keyword_probs: 165 | medical_keywords.append(self.r.choice([0, 1], p=[1 - prob, prob])) 166 | 167 | mkeys = medical_keywords[:5] 168 | mprobs = medical_keyword_probs[:5] 169 | mkeys_sum = np.array(mkeys).sum() 170 | mkeys_score = 1.0 / mprobs * 0.01 171 | 172 | score_51 = np.sum(np.array(mkeys) * mkeys_score) 173 | if mkeys_sum >= 4: 174 | score_52 = 0.5 175 | elif mkeys_sum >= 3: 176 | score_52 = 0.3 177 | elif mkeys_sum >= 2: 178 | score_52 = 0.1 179 | else: 180 | score_52 = 0.0 181 | 182 | score_53 = 0.0 183 | if insured_sex == 'Female': 184 | if medical_keywords[5] == 1 or medical_keywords[6] == 1: 185 | score_53 += 0.1 186 | 187 | # noise 188 | score_noise = self.r.uniform(-0.2, 0.2) 189 | 190 | data = [insured_age, insured_sex, height, weight, 191 | product, amount, application_date, 192 | medical_info_a1, medical_info_a2, medical_info_a3, 193 | medical_info_b1, medical_info_b2, medical_info_b3, 194 | medical_info_c1, medical_info_c2] + medical_keywords 195 | 196 | score_elements = [score_01, score_02, score_03, score_21, score_22, score_23, 197 | score_41, score_42, score_43, score_44, score_51, score_52, score_53, score_noise] 198 | score = np.array(score_elements).sum() 199 | scores = [score] + score_elements 200 | 201 | target = [Util.iif(score >= 0.8, 1, 0)] 202 | 203 | return data, scores, target 204 | 205 | def column_names(self): 206 | return self.column_names_data() + self.column_names_scores() + self.column_names_target() 207 | 208 | def column_names_data(self): 209 | data = 
(['age', 'sex', 'height', 'weight', 210 | 'product', 'amount', 'date', 211 | 'medical_info_a1', 'medical_info_a2', 'medical_info_a3', 212 | 'medical_info_b1', 'medical_info_b2', 'medical_info_b3', 213 | 'medical_info_c1', 'medical_info_c2'] 214 | + ['medical_keyword_{}'.format(i) for i in range(1, 11)]) 215 | return data 216 | 217 | def column_names_scores(self): 218 | score_elements = ['score_01', 'score_02', 'score_03', 'score_21', 'score_22', 'score_23', 219 | 'score_41', 'score_42', 'score_43', 'score_44', 220 | 'score_51', 'score_52', 'score_53', 'score_noise'] 221 | scores = ['score'] + score_elements 222 | return scores 223 | 224 | def column_names_target(self): 225 | target = ['target'] 226 | return target 227 | 228 | 229 | if __name__ == '__main__': 230 | gen = Generator() 231 | if args.test: 232 | n_tr = 100 233 | else: 234 | n_tr = 10000 235 | 236 | df = gen.run_all(n=n_tr * 2) 237 | cols_data = gen.column_names_data() 238 | cols_target = gen.column_names_target() 239 | 240 | # df[:n_tr].to_csv('train_debug.csv', index=False, sep='\t') 241 | # df[n_tr:].to_csv('test_debug.csv', index=False, sep='\t') 242 | df[:n_tr][cols_data + cols_target].to_csv('train.csv', index=False, sep=',') 243 | df[n_tr:][cols_data].to_csv('test.csv', index=False, sep=',') 244 | -------------------------------------------------------------------------------- /ch03/ch03-02-categorical.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y contains the target values, test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test.csv') 14 | 15 | # Save training and test datasets in their original form for explanations 16 | train_x_saved = train_x.copy() 17 | test_x_saved = test_x.copy() 18 | 19 | 20 | # Function to recover original training and test datasets 21 | def load_data(): 22 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 23 | return train_x, test_x 24 | 25 | 26 | # Store names of categorical variables to be converted in list 27 | cat_cols = ['sex', 'product', 'medical_info_b2', 'medical_info_b3'] 28 | 29 | # ----------------------------------- 30 | # One-hot encoding 31 | # ----------------------------------- 32 | # Load the data 33 | train_x, test_x = load_data() 34 | # ----------------------------------- 35 | 36 | # Concatenate the training and test datasets, and apply one-hot encoding via get_dummies() 37 | all_x = pd.concat([train_x, test_x]) 38 | all_x = pd.get_dummies(all_x, columns=cat_cols) 39 | 40 | # Resplit into training and test data 41 | train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True) 42 | test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True) 43 | 44 | # ----------------------------------- 45 | # Load the data 46 | train_x, test_x = load_data() 47 | # ----------------------------------- 48 | from sklearn.preprocessing import OneHotEncoder 49 | 50 | # Encoding with the OneHotEncoder() function 51 | ohe = OneHotEncoder(sparse=False, categories='auto') 52 | ohe.fit(train_x[cat_cols]) 53 | 54 | # Create column names for dummy variables 55 | columns = [] 56 | for i, c in enumerate(cat_cols): 57 | 
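# ohe.categories_[i] holds the category values learned from the training data for
# cat_cols[i], so the dummy column names built here follow the training data's
# categories; with the default handle_unknown='error', a category appearing only
# in the test data would make ohe.transform() raise an error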
columns += [f'{c}_{v}' for v in ohe.categories_[i]] 58 | 59 | # Put created dummy variables into data frames 60 | dummy_vals_train = pd.DataFrame(ohe.transform(train_x[cat_cols]), columns=columns) 61 | dummy_vals_test = pd.DataFrame(ohe.transform(test_x[cat_cols]), columns=columns) 62 | 63 | # Join the remaining variables 64 | train_x = pd.concat([train_x.drop(cat_cols, axis=1), dummy_vals_train], axis=1) 65 | test_x = pd.concat([test_x.drop(cat_cols, axis=1), dummy_vals_test], axis=1) 66 | 67 | # ----------------------------------- 68 | # Label encoding 69 | # ----------------------------------- 70 | # Load the data 71 | train_x, test_x = load_data() 72 | # ----------------------------------- 73 | from sklearn.preprocessing import LabelEncoder 74 | 75 | # Loop over the categorical variables and apply label encoding 76 | for c in cat_cols: 77 | # Define labels based on the training data 78 | le = LabelEncoder() 79 | le.fit(train_x[c]) 80 | train_x[c] = le.transform(train_x[c]) 81 | test_x[c] = le.transform(test_x[c]) 82 | 83 | # ----------------------------------- 84 | # Feature hashing 85 | # ----------------------------------- 86 | # Load the data 87 | train_x, test_x = load_data() 88 | # ----------------------------------- 89 | from sklearn.feature_extraction import FeatureHasher 90 | 91 | # Loop over the categorical variables and apply feature hashing 92 | for c in cat_cols: 93 | # Using the FeatureHasher() function is slightly different from other encoders 94 | 95 | fh = FeatureHasher(n_features=5, input_type='string') 96 | # Convert the variable to a string and apply the FeatureHasher() function 97 | hash_train = fh.transform(train_x[[c]].astype(str).values) 98 | hash_test = fh.transform(test_x[[c]].astype(str).values) 99 | # Add to a data frame 100 | hash_train = pd.DataFrame(hash_train.todense(), columns=[f'{c}_{i}' for i in range(5)]) 101 | hash_test = pd.DataFrame(hash_test.todense(), columns=[f'{c}_{i}' for i in range(5)]) 102 | # Join with the original data frame 103 | train_x = pd.concat([train_x, hash_train], axis=1) 104 | test_x = pd.concat([test_x, hash_test], axis=1) 105 | 106 | # Drop the original categorical variable columns 107 | train_x.drop(cat_cols, axis=1, inplace=True) 108 | test_x.drop(cat_cols, axis=1, inplace=True) 109 | 110 | # ----------------------------------- 111 | # Frequency encoding 112 | # ----------------------------------- 113 | # Load the data 114 | train_x, test_x = load_data() 115 | # ----------------------------------- 116 | # Loop over the categorical variables and apply frequency encoding 117 | for c in cat_cols: 118 | freq = train_x[c].value_counts() 119 | # Replace each categorical variable with its frequency of occurrence 120 | train_x[c] = train_x[c].map(freq) 121 | test_x[c] = test_x[c].map(freq) 122 | 123 | # ----------------------------------- 124 | # Target encoding 125 | # ----------------------------------- 126 | # Load the data 127 | train_x, test_x = load_data() 128 | # ----------------------------------- 129 | from sklearn.model_selection import KFold 130 | 131 | # Loop over the categorical variables and apply target encoding 132 | for c in cat_cols: 133 | # Calculate the average of the target for each categorical value in the training data 134 | data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y}) 135 | target_mean = data_tmp.groupby(c)['target'].mean() 136 | # Replace the categorical variables in the test data 137 | test_x[c] = test_x[c].map(target_mean) 138 | 139 | # Prepare an array to store the converted training data 
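# The training data is deliberately not encoded with the target_mean computed above,
# because that mean includes each row's own target value and would leak the label
# into the feature; instead each row receives a mean computed from the other three
# folds in the loop below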
140 | tmp = np.repeat(np.nan, train_x.shape[0]) 141 | 142 | # Split the training data 143 | kf = KFold(n_splits=4, shuffle=True, random_state=72) 144 | for idx_1, idx_2 in kf.split(train_x): 145 | # Calculate the average of the target values for the out-of-fold categorical variables 146 | target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean() 147 | # Store the converted values temporarily in an array 148 | tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean) 149 | 150 | # Replace the original data with the converted values 151 | train_x[c] = tmp 152 | 153 | # ----------------------------------- 154 | # Target encoding - for each fold of cross validation 155 | # ----------------------------------- 156 | # Load the data 157 | train_x, test_x = load_data() 158 | # ----------------------------------- 159 | from sklearn.model_selection import KFold 160 | 161 | # Apply target encoding for each cross validation fold 162 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 163 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)): 164 | 165 | # Split the validation data off from the training data 166 | tr_x, va_x = train_x.iloc[tr_idx].copy(), train_x.iloc[va_idx].copy() 167 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 168 | 169 | # Loop over the categorical variables and apply target encoding 170 | for c in cat_cols: 171 | # Calculate the average of the target for each categorical value in the training data 172 | data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y}) 173 | target_mean = data_tmp.groupby(c)['target'].mean() 174 | # Replace the categorical variables in the validation data 175 | va_x.loc[:, c] = va_x[c].map(target_mean) 176 | 177 | # Prepare an array to store the converted training data 178 | tmp = np.repeat(np.nan, tr_x.shape[0]) 179 | kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72) 180 | for idx_1, idx_2 in kf_encoding.split(tr_x): 181 | # Calculate the average of the target values for the out-of-fold categorical variables 182 | target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean() 183 | # Store the converted values temporarily in an array 184 | tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean) 185 | 186 | tr_x.loc[:, c] = tmp 187 | 188 | # Remember to save the encoded features so you can come back and read the data later if necessary 189 | 190 | # ----------------------------------- 191 | # Target encoding - when the cross validation and target encoded folds need to be partitioned 192 | # ----------------------------------- 193 | # Load the data 194 | train_x, test_x = load_data() 195 | # ----------------------------------- 196 | from sklearn.model_selection import KFold 197 | 198 | # Define the cross validation folds 199 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 200 | 201 | # Loop over the categorical variables and apply target encoding 202 | for c in cat_cols: 203 | 204 | # Add the target values 205 | data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y}) 206 | # Store the converted values temporarily in an array 207 | tmp = np.repeat(np.nan, train_x.shape[0]) 208 | 209 | # Split off the cross validation 210 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)): 211 | # Calculate the average of the target values for each category for the training data 212 | target_mean = data_tmp.iloc[tr_idx].groupby(c)['target'].mean() 213 | # For the validation data, store the converted values temporarily in an array 214 | tmp[va_idx] = train_x[c].iloc[va_idx].map(target_mean) 215 | 216 | # Replace the original data 
with the converted values 217 | train_x[c] = tmp 218 | -------------------------------------------------------------------------------- /input/ch03/multi_table_log.csv: -------------------------------------------------------------------------------- 1 | user_id,date,event,product_id 2 | 40,2018-01-01,login, 3 | 40,2018-01-01,view,P5 4 | 9,2018-01-03,login, 5 | 9,2018-01-03,view,P3 6 | 9,2018-01-03,view,P5 7 | 9,2018-01-03,view,P11 8 | 9,2018-01-03,view,P20 9 | 11,2018-01-03,login, 10 | 11,2018-01-03,view,P2 11 | 11,2018-01-03,view,P4 12 | 11,2018-01-03,view,P9 13 | 11,2018-01-03,view,P10 14 | 11,2018-01-03,view,P15 15 | 11,2018-01-03,view,P19 16 | 33,2018-01-04,login, 17 | 33,2018-01-04,view,P5 18 | 33,2018-01-04,view,P6 19 | 33,2018-01-04,view,P8 20 | 49,2018-01-04,login, 21 | 49,2018-01-04,view,P3 22 | 49,2018-01-04,view,P5 23 | 49,2018-01-04,view,P19 24 | 25,2018-01-06,login, 25 | 25,2018-01-06,view,P1 26 | 25,2018-01-06,view,P4 27 | 25,2018-01-06,view,P9 28 | 25,2018-01-06,view,P14 29 | 25,2018-01-06,view,P15 30 | 25,2018-01-06,view,P16 31 | 55,2018-01-06,login, 32 | 55,2018-01-06,view,P11 33 | 55,2018-01-06,view,P12 34 | 55,2018-01-06,view,P13 35 | 55,2018-01-06,view,P20 36 | 70,2018-01-07,login, 37 | 70,2018-01-07,view,P7 38 | 70,2018-01-07,view,P20 39 | 57,2018-01-08,login, 40 | 57,2018-01-08,view,P8 41 | 57,2018-01-08,view,P16 42 | 27,2018-01-09,login, 43 | 27,2018-01-09,view,P14 44 | 56,2018-01-09,login, 45 | 56,2018-01-09,view,P3 46 | 70,2018-01-10,login, 47 | 70,2018-01-10,view,P3 48 | 33,2018-01-13,login, 49 | 33,2018-01-13,view,P8 50 | 79,2018-01-13,login, 51 | 79,2018-01-13,view,P9 52 | 1,2018-01-14,login, 53 | 1,2018-01-14,view,P17 54 | 1,2018-01-14,view,P18 55 | 31,2018-01-14,login, 56 | 31,2018-01-14,view,P2 57 | 31,2018-01-14,view,P4 58 | 31,2018-01-14,view,P6 59 | 31,2018-01-14,view,P7 60 | 31,2018-01-14,view,P12 61 | 31,2018-01-14,view,P14 62 | 31,2018-01-14,view,P16 63 | 61,2018-01-14,login, 64 | 61,2018-01-14,view,P11 65 | 61,2018-01-14,view,P14 66 | 89,2018-01-16,login, 67 | 89,2018-01-16,view,P3 68 | 89,2018-01-16,view,P8 69 | 89,2018-01-16,view,P11 70 | 89,2018-01-16,view,P14 71 | 23,2018-01-17,login, 72 | 23,2018-01-17,view,P3 73 | 23,2018-01-17,view,P8 74 | 23,2018-01-17,view,P11 75 | 23,2018-01-17,view,P13 76 | 44,2018-01-17,login, 77 | 44,2018-01-17,view,P2 78 | 44,2018-01-17,view,P6 79 | 44,2018-01-17,view,P11 80 | 44,2018-01-17,view,P19 81 | 99,2018-01-17,login, 82 | 99,2018-01-17,view,P5 83 | 99,2018-01-17,view,P9 84 | 99,2018-01-17,view,P17 85 | 24,2018-01-19,login, 86 | 24,2018-01-19,view,P2 87 | 24,2018-01-19,view,P3 88 | 24,2018-01-19,view,P4 89 | 24,2018-01-19,view,P9 90 | 24,2018-01-19,view,P12 91 | 24,2018-01-19,view,P14 92 | 60,2018-01-20,login, 93 | 60,2018-01-20,view,P9 94 | 60,2018-01-20,view,P14 95 | 60,2018-01-20,view,P17 96 | 60,2018-01-20,view,P19 97 | 4,2018-01-21,login, 98 | 4,2018-01-21,view,P4 99 | 4,2018-01-21,view,P19 100 | 40,2018-01-21,login, 101 | 40,2018-01-21,view,P2 102 | 40,2018-01-21,view,P10 103 | 40,2018-01-21,view,P13 104 | 40,2018-01-21,view,P15 105 | 40,2018-01-21,view,P18 106 | 34,2018-01-22,login, 107 | 34,2018-01-22,view,P7 108 | 34,2018-01-22,view,P17 109 | 34,2018-01-22,view,P20 110 | 42,2018-01-22,login, 111 | 42,2018-01-22,view,P2 112 | 42,2018-01-22,view,P4 113 | 42,2018-01-22,view,P5 114 | 42,2018-01-22,view,P7 115 | 23,2018-01-23,login, 116 | 23,2018-01-23,view,P1 117 | 23,2018-01-23,view,P11 118 | 23,2018-01-23,view,P12 119 | 23,2018-01-23,view,P13 120 | 17,2018-01-26,login, 121 | 17,2018-01-26,view,P2 122 | 
17,2018-01-26,view,P3 123 | 17,2018-01-26,view,P11 124 | 64,2018-01-26,login, 125 | 64,2018-01-26,view,P17 126 | 64,2018-01-26,view,P18 127 | 5,2018-01-27,login, 128 | 5,2018-01-27,view,P9 129 | 5,2018-01-27,view,P11 130 | 5,2018-01-27,view,P13 131 | 5,2018-01-27,view,P19 132 | 60,2018-01-28,login, 133 | 60,2018-01-28,view,P2 134 | 60,2018-01-28,view,P8 135 | 60,2018-01-28,view,P14 136 | 60,2018-01-28,view,P16 137 | 60,2018-01-28,view,P17 138 | 63,2018-02-01,login, 139 | 63,2018-02-01,view,P2 140 | 63,2018-02-01,view,P7 141 | 63,2018-02-01,view,P8 142 | 63,2018-02-01,view,P11 143 | 63,2018-02-01,view,P14 144 | 100,2018-02-01,login, 145 | 100,2018-02-01,view,P1 146 | 100,2018-02-01,view,P5 147 | 100,2018-02-01,view,P7 148 | 100,2018-02-01,view,P14 149 | 100,2018-02-01,view,P16 150 | 29,2018-02-03,login, 151 | 29,2018-02-03,view,P3 152 | 29,2018-02-03,view,P10 153 | 29,2018-02-03,view,P16 154 | 29,2018-02-03,view,P18 155 | 82,2018-02-03,login, 156 | 82,2018-02-03,view,P6 157 | 82,2018-02-03,view,P10 158 | 82,2018-02-03,view,P13 159 | 82,2018-02-03,view,P14 160 | 82,2018-02-03,view,P17 161 | 96,2018-02-03,login, 162 | 96,2018-02-03,view,P4 163 | 96,2018-02-03,view,P13 164 | 96,2018-02-03,view,P17 165 | 76,2018-02-04,login, 166 | 76,2018-02-04,view,P9 167 | 76,2018-02-04,view,P11 168 | 61,2018-02-05,login, 169 | 61,2018-02-05,view,P6 170 | 61,2018-02-05,view,P19 171 | 10,2018-02-06,login, 172 | 10,2018-02-06,view,P2 173 | 10,2018-02-06,view,P4 174 | 10,2018-02-06,view,P8 175 | 10,2018-02-06,view,P11 176 | 10,2018-02-06,view,P13 177 | 28,2018-02-06,login, 178 | 28,2018-02-06,view,P2 179 | 37,2018-02-06,login, 180 | 37,2018-02-06,view,P2 181 | 37,2018-02-06,view,P4 182 | 37,2018-02-06,view,P17 183 | 37,2018-02-06,view,P19 184 | 37,2018-02-06,view,P20 185 | 28,2018-02-07,login, 186 | 28,2018-02-07,view,P2 187 | 28,2018-02-07,view,P7 188 | 28,2018-02-07,view,P17 189 | 52,2018-02-07,login, 190 | 52,2018-02-07,view,P5 191 | 52,2018-02-07,view,P8 192 | 52,2018-02-07,view,P17 193 | 63,2018-02-08,login, 194 | 63,2018-02-08,view,P1 195 | 63,2018-02-08,view,P2 196 | 63,2018-02-08,view,P7 197 | 63,2018-02-08,view,P8 198 | 63,2018-02-08,view,P14 199 | 63,2018-02-08,view,P15 200 | 63,2018-02-08,view,P18 201 | 90,2018-02-11,login, 202 | 90,2018-02-11,view,P5 203 | 90,2018-02-11,view,P12 204 | 90,2018-02-11,view,P13 205 | 90,2018-02-11,view,P14 206 | 90,2018-02-11,view,P18 207 | 22,2018-02-12,login, 208 | 22,2018-02-12,view,P1 209 | 22,2018-02-12,view,P4 210 | 22,2018-02-12,view,P5 211 | 74,2018-02-16,login, 212 | 74,2018-02-16,view,P1 213 | 74,2018-02-16,view,P5 214 | 56,2018-02-18,login, 215 | 56,2018-02-18,view,P1 216 | 56,2018-02-18,view,P10 217 | 56,2018-02-18,view,P12 218 | 56,2018-02-18,view,P15 219 | 56,2018-02-18,view,P20 220 | 61,2018-02-18,login, 221 | 61,2018-02-18,view,P6 222 | 61,2018-02-18,view,P12 223 | 61,2018-02-18,view,P14 224 | 61,2018-02-18,view,P20 225 | 69,2018-02-18,login, 226 | 69,2018-02-18,view,P2 227 | 69,2018-02-18,view,P3 228 | 69,2018-02-18,view,P14 229 | 69,2018-02-18,view,P20 230 | 12,2018-02-19,login, 231 | 12,2018-02-19,view,P8 232 | 12,2018-02-19,view,P11 233 | 12,2018-02-19,view,P13 234 | 12,2018-02-19,view,P14 235 | 12,2018-02-19,view,P17 236 | 72,2018-02-19,login, 237 | 72,2018-02-19,view,P7 238 | 72,2018-02-19,view,P11 239 | 72,2018-02-19,view,P14 240 | 86,2018-02-19,login, 241 | 86,2018-02-19,view,P4 242 | 86,2018-02-19,view,P7 243 | 86,2018-02-19,view,P13 244 | 42,2018-02-21,login, 245 | 42,2018-02-21,view,P2 246 | 42,2018-02-21,view,P4 247 | 42,2018-02-21,view,P5 248 
| 42,2018-02-21,view,P8 249 | 42,2018-02-21,view,P20 250 | 61,2018-02-21,login, 251 | 61,2018-02-21,view,P7 252 | 61,2018-02-21,view,P8 253 | 61,2018-02-21,view,P11 254 | 61,2018-02-21,view,P16 255 | 14,2018-02-22,login, 256 | 14,2018-02-22,view,P1 257 | 14,2018-02-22,view,P4 258 | 14,2018-02-22,view,P13 259 | 37,2018-02-22,login, 260 | 37,2018-02-22,view,P1 261 | 37,2018-02-22,view,P4 262 | 37,2018-02-22,view,P5 263 | 76,2018-02-23,login, 264 | 76,2018-02-23,view,P2 265 | 76,2018-02-23,view,P16 266 | 61,2018-02-25,login, 267 | 61,2018-02-25,view,P9 268 | 61,2018-02-25,view,P15 269 | 34,2018-02-27,login, 270 | 34,2018-02-27,view,P9 271 | 34,2018-02-27,view,P11 272 | 34,2018-02-27,view,P13 273 | 34,2018-02-27,view,P18 274 | 34,2018-02-27,view,P19 275 | 93,2018-02-27,login, 276 | 93,2018-02-27,view,P1 277 | 93,2018-02-27,view,P2 278 | 93,2018-02-27,view,P17 279 | 97,2018-02-27,login, 280 | 97,2018-02-27,view,P1 281 | 97,2018-02-27,view,P9 282 | 97,2018-02-27,view,P11 283 | 97,2018-02-27,view,P20 284 | 100,2018-02-27,login, 285 | 100,2018-02-27,view,P1 286 | 100,2018-02-27,view,P12 287 | 100,2018-02-27,view,P14 288 | 27,2018-02-28,login, 289 | 27,2018-02-28,view,P12 290 | 27,2018-02-28,view,P14 291 | 27,2018-02-28,view,P15 292 | 27,2018-02-28,view,P20 293 | 20,2018-03-01,login, 294 | 20,2018-03-01,view,P7 295 | 20,2018-03-01,view,P19 296 | 33,2018-03-02,login, 297 | 33,2018-03-02,view,P5 298 | 33,2018-03-02,view,P8 299 | 75,2018-03-02,login, 300 | 75,2018-03-02,view,P3 301 | 75,2018-03-02,view,P5 302 | 75,2018-03-02,view,P8 303 | 75,2018-03-02,view,P10 304 | 75,2018-03-02,view,P14 305 | 75,2018-03-02,view,P18 306 | 83,2018-03-02,login, 307 | 83,2018-03-02,view,P1 308 | 83,2018-03-02,view,P2 309 | 83,2018-03-02,view,P5 310 | 83,2018-03-02,view,P6 311 | 83,2018-03-02,view,P9 312 | 83,2018-03-02,view,P13 313 | 66,2018-03-03,login, 314 | 66,2018-03-03,view,P2 315 | 66,2018-03-03,view,P6 316 | 66,2018-03-03,view,P17 317 | 17,2018-03-05,login, 318 | 17,2018-03-05,view,P2 319 | 17,2018-03-05,view,P5 320 | 17,2018-03-05,view,P8 321 | 17,2018-03-05,view,P13 322 | 37,2018-03-05,login, 323 | 37,2018-03-05,view,P4 324 | 37,2018-03-05,view,P6 325 | 37,2018-03-05,view,P13 326 | 37,2018-03-05,view,P19 327 | 58,2018-03-05,login, 328 | 58,2018-03-05,view,P5 329 | 58,2018-03-05,view,P8 330 | 58,2018-03-05,view,P15 331 | 12,2018-03-06,login, 332 | 12,2018-03-06,view,P9 333 | 12,2018-03-06,view,P14 334 | 28,2018-03-09,login, 335 | 28,2018-03-09,view,P4 336 | 28,2018-03-09,view,P8 337 | 28,2018-03-09,view,P16 338 | 28,2018-03-09,view,P20 339 | 35,2018-03-09,login, 340 | 35,2018-03-09,view,P2 341 | 35,2018-03-09,view,P11 342 | 48,2018-03-11,login, 343 | 48,2018-03-11,view,P3 344 | 48,2018-03-11,view,P17 345 | 22,2018-03-12,login, 346 | 22,2018-03-12,view,P4 347 | 22,2018-03-12,view,P5 348 | 22,2018-03-12,view,P8 349 | 22,2018-03-12,view,P14 350 | 72,2018-03-14,login, 351 | 72,2018-03-14,view,P5 352 | 72,2018-03-14,view,P9 353 | 72,2018-03-14,view,P11 354 | 43,2018-03-15,login, 355 | 43,2018-03-15,view,P2 356 | 43,2018-03-15,view,P4 357 | 43,2018-03-15,view,P9 358 | 43,2018-03-15,view,P10 359 | 43,2018-03-15,view,P11 360 | 43,2018-03-15,view,P13 361 | 43,2018-03-15,view,P14 362 | 61,2018-03-15,login, 363 | 61,2018-03-15,view,P9 364 | 61,2018-03-15,view,P12 365 | 61,2018-03-15,view,P18 366 | 7,2018-03-17,login, 367 | 7,2018-03-17,view,P1 368 | 7,2018-03-17,view,P2 369 | 7,2018-03-17,view,P11 370 | 7,2018-03-17,view,P17 371 | 36,2018-03-18,login, 372 | 36,2018-03-18,view,P2 373 | 36,2018-03-18,view,P4 374 | 
36,2018-03-18,view,P10 375 | 36,2018-03-18,view,P14 376 | 5,2018-03-19,login, 377 | 5,2018-03-19,view,P4 378 | 82,2018-03-19,login, 379 | 82,2018-03-19,view,P5 380 | 82,2018-03-19,view,P12 381 | 82,2018-03-19,view,P13 382 | 82,2018-03-19,view,P15 383 | 72,2018-03-20,login, 384 | 72,2018-03-20,view,P3 385 | 72,2018-03-20,view,P8 386 | 72,2018-03-20,view,P11 387 | 48,2018-03-21,login, 388 | 48,2018-03-21,view,P1 389 | 54,2018-03-22,login, 390 | 54,2018-03-22,view,P1 391 | 54,2018-03-22,view,P10 392 | 59,2018-03-23,login, 393 | 59,2018-03-23,view,P4 394 | 59,2018-03-23,view,P5 395 | 59,2018-03-23,view,P11 396 | 59,2018-03-23,view,P15 397 | 59,2018-03-23,view,P16 398 | 43,2018-03-26,login, 399 | 43,2018-03-26,view,P2 400 | 43,2018-03-26,view,P9 401 | 43,2018-03-26,view,P15 402 | 59,2018-03-26,login, 403 | 59,2018-03-26,view,P4 404 | 59,2018-03-26,view,P6 405 | 59,2018-03-26,view,P8 406 | 59,2018-03-26,view,P11 407 | 59,2018-03-26,view,P12 408 | 84,2018-03-27,login, 409 | 84,2018-03-27,view,P2 410 | 84,2018-03-27,view,P11 411 | 2,2018-03-28,login, 412 | 2,2018-03-28,view,P2 413 | 2,2018-03-28,view,P4 414 | 2,2018-03-28,view,P7 415 | 2,2018-03-28,view,P11 416 | 2,2018-03-28,view,P19 417 | 97,2018-03-28,login, 418 | 97,2018-03-28,view,P8 419 | 97,2018-03-28,view,P11 420 | 26,2018-03-29,login, 421 | 26,2018-03-29,view,P7 422 | 26,2018-03-29,view,P13 423 | 26,2018-03-29,view,P14 424 | 40,2018-03-29,login, 425 | 40,2018-03-29,view,P5 426 | 40,2018-03-29,view,P18 427 | 40,2018-03-29,view,P19 428 | 56,2018-03-29,login, 429 | 56,2018-03-29,view,P7 430 | 52,2018-03-30,login, 431 | 52,2018-03-30,view,P3 432 | 52,2018-03-30,view,P10 433 | 52,2018-03-30,view,P11 434 | 11,2018-03-31,login, 435 | 11,2018-03-31,view,P2 436 | 11,2018-03-31,view,P4 437 | 11,2018-03-31,view,P20 438 | 84,2018-03-31,login, 439 | 84,2018-03-31,view,P4 440 | 84,2018-03-31,view,P13 441 | 84,2018-03-31,view,P17 442 | 92,2018-03-31,login, 443 | 92,2018-03-31,view,P1 444 | 92,2018-03-31,view,P14 445 | 92,2018-03-31,view,P19 446 | 92,2018-03-31,view,P20 447 | 76,2018-04-01,login, 448 | 76,2018-04-01,view,P3 449 | 76,2018-04-01,view,P11 450 | 82,2018-04-01,login, 451 | 82,2018-04-01,view,P2 452 | 82,2018-04-01,view,P5 453 | 82,2018-04-01,view,P6 454 | 82,2018-04-01,view,P8 455 | 82,2018-04-01,view,P17 456 | 47,2018-04-02,login, 457 | 47,2018-04-02,view,P4 458 | 47,2018-04-02,view,P9 459 | 47,2018-04-02,view,P11 460 | 47,2018-04-02,view,P19 461 | 10,2018-04-04,login, 462 | 10,2018-04-04,view,P2 463 | 10,2018-04-04,view,P4 464 | 10,2018-04-04,view,P9 465 | 10,2018-04-04,view,P13 466 | 86,2018-04-04,login, 467 | 86,2018-04-04,view,P3 468 | 86,2018-04-04,view,P4 469 | 86,2018-04-04,view,P6 470 | 86,2018-04-04,view,P7 471 | 86,2018-04-04,view,P11 472 | 86,2018-04-04,view,P14 473 | 86,2018-04-04,view,P17 474 | 39,2018-04-05,login, 475 | 39,2018-04-05,view,P4 476 | 39,2018-04-05,view,P9 477 | 39,2018-04-05,view,P12 478 | 39,2018-04-05,view,P18 479 | 13,2018-04-06,login, 480 | 13,2018-04-06,view,P2 481 | 13,2018-04-06,view,P4 482 | 13,2018-04-06,view,P8 483 | 13,2018-04-06,view,P16 484 | 13,2018-04-06,view,P20 485 | 70,2018-04-06,login, 486 | 96,2018-04-06,login, 487 | 96,2018-04-06,view,P1 488 | 96,2018-04-06,view,P3 489 | 96,2018-04-06,view,P5 490 | 96,2018-04-06,view,P9 491 | 96,2018-04-06,view,P10 492 | 96,2018-04-06,view,P11 493 | 96,2018-04-06,view,P12 494 | 11,2018-04-07,login, 495 | 11,2018-04-07,view,P2 496 | 11,2018-04-07,view,P3 497 | 11,2018-04-07,view,P4 498 | 11,2018-04-07,view,P14 499 | 11,2018-04-07,view,P16 500 | 
34,2018-04-09,login, 501 | 34,2018-04-09,view,P1 502 | 34,2018-04-09,view,P16 503 | 34,2018-04-09,view,P18 504 | 34,2018-04-09,view,P20 505 | 72,2018-04-14,login, 506 | 72,2018-04-14,view,P3 507 | 72,2018-04-14,view,P6 508 | 72,2018-04-14,view,P7 509 | 17,2018-04-16,login, 510 | 17,2018-04-16,view,P1 511 | 17,2018-04-16,view,P2 512 | 17,2018-04-16,view,P5 513 | 17,2018-04-16,view,P8 514 | 17,2018-04-16,view,P11 515 | 17,2018-04-16,view,P12 516 | 17,2018-04-16,view,P15 517 | 17,2018-04-16,view,P16 518 | 29,2018-04-16,login, 519 | 29,2018-04-16,view,P12 520 | 100,2018-04-16,login, 521 | 100,2018-04-16,view,P7 522 | 100,2018-04-16,view,P10 523 | 100,2018-04-16,view,P14 524 | 5,2018-04-19,login, 525 | 69,2018-04-22,login, 526 | 69,2018-04-22,view,P1 527 | 69,2018-04-22,view,P2 528 | 69,2018-04-22,view,P10 529 | 69,2018-04-22,view,P11 530 | 69,2018-04-22,view,P17 531 | 69,2018-04-22,view,P19 532 | 93,2018-04-22,login, 533 | 93,2018-04-22,view,P11 534 | 93,2018-04-22,view,P19 535 | 93,2018-04-22,view,P20 536 | 18,2018-04-23,login, 537 | 18,2018-04-23,view,P5 538 | 18,2018-04-23,view,P7 539 | 18,2018-04-23,view,P15 540 | 24,2018-04-23,login, 541 | 24,2018-04-23,view,P2 542 | 24,2018-04-23,view,P4 543 | 24,2018-04-23,view,P9 544 | 24,2018-04-23,view,P13 545 | 24,2018-04-23,view,P14 546 | 24,2018-04-23,view,P15 547 | 57,2018-04-23,login, 548 | 57,2018-04-23,view,P4 549 | 57,2018-04-23,view,P5 550 | 57,2018-04-23,view,P11 551 | 57,2018-04-23,view,P18 552 | 60,2018-04-24,login, 553 | 60,2018-04-24,view,P2 554 | 60,2018-04-24,view,P8 555 | 60,2018-04-24,view,P20 556 | 42,2018-04-26,login, 557 | 42,2018-04-26,view,P2 558 | 42,2018-04-26,view,P4 559 | 42,2018-04-26,view,P5 560 | 42,2018-04-26,view,P7 561 | 22,2018-04-27,login, 562 | 22,2018-04-27,view,P1 563 | 22,2018-04-27,view,P4 564 | 22,2018-04-27,view,P9 565 | 22,2018-04-27,view,P14 566 | 1,2018-04-28,login, 567 | 1,2018-04-28,view,P1 568 | 1,2018-04-28,view,P6 569 | 1,2018-04-28,view,P18 570 | 69,2018-04-28,login, 571 | 69,2018-04-28,view,P2 572 | 69,2018-04-28,view,P3 573 | 69,2018-04-28,view,P4 574 | 69,2018-04-28,view,P13 575 | 69,2018-04-28,view,P14 576 | 69,2018-04-28,view,P17 577 | 81,2018-04-28,login, 578 | 81,2018-04-28,view,P2 579 | 81,2018-04-28,view,P8 580 | 81,2018-04-28,view,P9 581 | 81,2018-04-28,view,P11 582 | 81,2018-04-28,view,P17 583 | 52,2018-04-29,login, 584 | 52,2018-04-29,view,P4 585 | 52,2018-04-29,view,P8 586 | 52,2018-04-29,view,P12 587 | 56,2018-04-29,login, 588 | 56,2018-04-29,view,P6 589 | 56,2018-04-29,view,P8 590 | 56,2018-04-29,view,P12 591 | 56,2018-04-29,view,P14 592 | 69,2018-04-29,login, 593 | 69,2018-04-29,view,P2 594 | 69,2018-04-29,view,P7 595 | 69,2018-04-29,view,P10 596 | 69,2018-04-29,view,P11 597 | 69,2018-04-29,view,P12 598 | 69,2018-04-29,view,P14 599 | 69,2018-04-29,view,P17 600 | 69,2018-04-29,view,P20 601 | 79,2018-04-29,login, 602 | 79,2018-04-29,view,P9 603 | 79,2018-04-29,view,P10 604 | 79,2018-04-29,view,P13 605 | 79,2018-04-29,view,P19 606 | 4,2018-04-30,login, 607 | 4,2018-04-30,view,P4 608 | 4,2018-04-30,view,P15 609 | 97,2018-05-01,login, 610 | 97,2018-05-01,view,P5 611 | 97,2018-05-01,view,P8 612 | 97,2018-05-01,view,P9 613 | 97,2018-05-01,view,P18 614 | 61,2018-05-02,login, 615 | 61,2018-05-02,view,P3 616 | 61,2018-05-02,view,P4 617 | 61,2018-05-02,view,P10 618 | 61,2018-05-02,view,P13 619 | 61,2018-05-02,view,P14 620 | 61,2018-05-02,view,P16 621 | 61,2018-05-02,view,P18 622 | 79,2018-05-02,login, 623 | 99,2018-05-02,login, 624 | 99,2018-05-02,view,P4 625 | 99,2018-05-02,view,P5 626 | 
99,2018-05-02,view,P6 627 | 27,2018-05-03,login, 628 | 27,2018-05-03,view,P7 629 | 27,2018-05-03,view,P14 630 | 27,2018-05-03,view,P16 631 | 52,2018-05-04,login, 632 | 52,2018-05-04,view,P4 633 | 52,2018-05-04,view,P5 634 | 52,2018-05-04,view,P8 635 | 90,2018-05-04,login, 636 | 90,2018-05-04,view,P3 637 | 90,2018-05-04,view,P10 638 | 90,2018-05-04,view,P14 639 | 90,2018-05-04,view,P15 640 | 90,2018-05-04,view,P16 641 | 90,2018-05-04,view,P18 642 | 97,2018-05-04,login, 643 | 97,2018-05-04,view,P11 644 | 97,2018-05-04,view,P20 645 | 54,2018-05-05,login, 646 | 54,2018-05-05,view,P9 647 | 54,2018-05-05,view,P10 648 | 54,2018-05-05,view,P16 649 | 14,2018-05-06,login, 650 | 14,2018-05-06,view,P1 651 | 14,2018-05-06,view,P4 652 | 14,2018-05-06,view,P5 653 | 14,2018-05-06,view,P12 654 | 14,2018-05-06,view,P14 655 | 64,2018-05-06,login, 656 | 64,2018-05-06,view,P16 657 | 90,2018-05-09,login, 658 | 90,2018-05-09,view,P5 659 | 90,2018-05-09,view,P8 660 | 90,2018-05-09,view,P13 661 | 90,2018-05-09,view,P14 662 | 90,2018-05-09,view,P16 663 | 90,2018-05-09,view,P18 664 | 90,2018-05-09,view,P20 665 | 71,2018-05-11,login, 666 | 71,2018-05-11,view,P8 667 | 71,2018-05-11,view,P10 668 | 71,2018-05-11,view,P16 669 | 82,2018-05-11,login, 670 | 82,2018-05-11,view,P2 671 | 82,2018-05-11,view,P5 672 | 82,2018-05-11,view,P9 673 | 82,2018-05-11,view,P12 674 | 82,2018-05-11,view,P13 675 | 82,2018-05-11,view,P18 676 | 98,2018-05-11,login, 677 | 98,2018-05-11,view,P4 678 | 98,2018-05-11,view,P5 679 | 98,2018-05-11,view,P11 680 | 98,2018-05-11,view,P14 681 | 64,2018-05-13,login, 682 | 64,2018-05-13,view,P13 683 | 64,2018-05-13,view,P14 684 | 53,2018-05-14,login, 685 | 53,2018-05-14,view,P8 686 | 79,2018-05-14,login, 687 | 79,2018-05-14,view,P9 688 | 79,2018-05-14,view,P16 689 | 97,2018-05-14,login, 690 | 97,2018-05-14,view,P1 691 | 97,2018-05-14,view,P8 692 | 97,2018-05-14,view,P11 693 | 97,2018-05-14,view,P13 694 | 97,2018-05-14,view,P18 695 | 60,2018-05-16,login, 696 | 60,2018-05-16,view,P9 697 | 60,2018-05-16,view,P14 698 | 60,2018-05-16,view,P17 699 | 60,2018-05-16,view,P20 700 | 37,2018-05-17,login, 701 | 37,2018-05-17,view,P3 702 | 37,2018-05-17,view,P4 703 | 37,2018-05-17,view,P10 704 | 37,2018-05-17,view,P12 705 | 37,2018-05-17,view,P14 706 | 81,2018-05-17,login, 707 | 81,2018-05-17,view,P2 708 | 81,2018-05-17,view,P3 709 | 81,2018-05-17,view,P8 710 | 81,2018-05-17,view,P13 711 | 81,2018-05-17,view,P14 712 | 81,2018-05-17,view,P20 713 | 12,2018-05-19,login, 714 | 12,2018-05-19,view,P1 715 | 12,2018-05-19,view,P8 716 | 12,2018-05-19,view,P15 717 | 12,2018-05-19,view,P18 718 | 48,2018-05-19,login, 719 | 48,2018-05-19,view,P3 720 | 48,2018-05-19,view,P16 721 | 48,2018-05-19,view,P17 722 | 87,2018-05-20,login, 723 | 87,2018-05-20,view,P4 724 | 87,2018-05-20,view,P6 725 | 87,2018-05-20,view,P7 726 | 87,2018-05-20,view,P18 727 | 56,2018-05-21,login, 728 | 56,2018-05-21,view,P3 729 | 56,2018-05-21,view,P10 730 | 56,2018-05-21,view,P12 731 | 56,2018-05-21,view,P18 732 | 28,2018-05-22,login, 733 | 28,2018-05-22,view,P3 734 | 28,2018-05-22,view,P16 735 | 50,2018-05-22,login, 736 | 50,2018-05-22,view,P14 737 | 50,2018-05-22,view,P15 738 | 52,2018-05-23,login, 739 | 52,2018-05-23,view,P1 740 | 52,2018-05-23,view,P3 741 | 52,2018-05-23,view,P9 742 | 52,2018-05-23,view,P11 743 | 52,2018-05-23,view,P12 744 | 8,2018-05-25,login, 745 | 8,2018-05-25,view,P4 746 | 8,2018-05-25,view,P12 747 | 8,2018-05-25,view,P19 748 | 79,2018-05-26,login, 749 | 79,2018-05-26,view,P10 750 | 2,2018-05-27,login, 751 | 2,2018-05-27,view,P2 752 | 
2,2018-05-27,view,P7 753 | 2,2018-05-27,view,P11 754 | 22,2018-05-28,login, 755 | 22,2018-05-28,view,P5 756 | 28,2018-05-29,login, 757 | 28,2018-05-29,view,P2 758 | 28,2018-05-29,view,P11 759 | 39,2018-05-29,login, 760 | 39,2018-05-29,view,P4 761 | 39,2018-05-29,view,P9 762 | 39,2018-05-29,view,P10 763 | 39,2018-05-29,view,P13 764 | 39,2018-05-29,view,P19 765 | 14,2018-05-30,login, 766 | 14,2018-05-30,view,P1 767 | 14,2018-05-30,view,P4 768 | 14,2018-05-30,view,P11 769 | 14,2018-05-30,view,P13 770 | 14,2018-05-30,view,P16 771 | 14,2018-05-30,view,P18 772 | 67,2018-05-30,login, 773 | 67,2018-05-30,view,P18 774 | 93,2018-05-31,login, 775 | 93,2018-05-31,view,P9 776 | 93,2018-05-31,view,P11 777 | 67,2018-06-01,login, 778 | 67,2018-06-01,view,P7 779 | 67,2018-06-01,view,P8 780 | 67,2018-06-01,view,P14 781 | 28,2018-06-04,login, 782 | 28,2018-06-04,view,P2 783 | 33,2018-06-07,login, 784 | 33,2018-06-07,view,P5 785 | 33,2018-06-07,view,P6 786 | 33,2018-06-07,view,P8 787 | 33,2018-06-07,view,P17 788 | 28,2018-06-08,login, 789 | 28,2018-06-08,view,P2 790 | 28,2018-06-08,view,P4 791 | 28,2018-06-08,view,P8 792 | 28,2018-06-08,view,P14 793 | 35,2018-06-08,login, 794 | 35,2018-06-08,view,P2 795 | 35,2018-06-08,view,P13 796 | 44,2018-06-08,login, 797 | 44,2018-06-08,view,P2 798 | 44,2018-06-08,view,P3 799 | 44,2018-06-08,view,P6 800 | 44,2018-06-08,view,P11 801 | 44,2018-06-08,view,P13 802 | 44,2018-06-08,view,P19 803 | 59,2018-06-09,login, 804 | 59,2018-06-09,view,P4 805 | 59,2018-06-09,view,P5 806 | 59,2018-06-09,view,P7 807 | 59,2018-06-09,view,P8 808 | 59,2018-06-09,view,P11 809 | 59,2018-06-09,view,P15 810 | 59,2018-06-09,view,P20 811 | 80,2018-06-10,login, 812 | 80,2018-06-10,view,P1 813 | 80,2018-06-10,view,P2 814 | 80,2018-06-10,view,P4 815 | 80,2018-06-10,view,P11 816 | 80,2018-06-10,view,P19 817 | 6,2018-06-13,login, 818 | 6,2018-06-13,view,P4 819 | 6,2018-06-13,view,P17 820 | 35,2018-06-14,login, 821 | 35,2018-06-14,view,P1 822 | 35,2018-06-14,view,P2 823 | 35,2018-06-14,view,P3 824 | 35,2018-06-14,view,P7 825 | 35,2018-06-14,view,P16 826 | 29,2018-06-17,login, 827 | 44,2018-06-17,login, 828 | 44,2018-06-17,view,P2 829 | 44,2018-06-17,view,P12 830 | 93,2018-06-17,login, 831 | 93,2018-06-17,view,P11 832 | 93,2018-06-17,view,P15 833 | 97,2018-06-17,login, 834 | 97,2018-06-17,view,P13 835 | 97,2018-06-17,view,P15 836 | 66,2018-06-19,login, 837 | 66,2018-06-19,view,P2 838 | 66,2018-06-19,view,P13 839 | 66,2018-06-19,view,P18 840 | 69,2018-06-19,login, 841 | 69,2018-06-19,view,P2 842 | 69,2018-06-19,view,P11 843 | 69,2018-06-19,view,P14 844 | 69,2018-06-19,view,P17 845 | 97,2018-06-20,login, 846 | 97,2018-06-20,view,P7 847 | 97,2018-06-20,view,P8 848 | 28,2018-06-21,login, 849 | 28,2018-06-21,view,P2 850 | 28,2018-06-21,view,P5 851 | 28,2018-06-21,view,P16 852 | 55,2018-06-21,login, 853 | 55,2018-06-21,view,P8 854 | 55,2018-06-21,view,P12 855 | 55,2018-06-21,view,P13 856 | 55,2018-06-21,view,P18 857 | 67,2018-06-21,login, 858 | 67,2018-06-21,view,P14 859 | 22,2018-06-22,login, 860 | 22,2018-06-22,view,P1 861 | 22,2018-06-22,view,P4 862 | 22,2018-06-22,view,P11 863 | 22,2018-06-22,view,P17 864 | 37,2018-06-22,login, 865 | 37,2018-06-22,view,P6 866 | 37,2018-06-22,view,P8 867 | 37,2018-06-22,view,P12 868 | 37,2018-06-22,view,P13 869 | 37,2018-06-22,view,P14 870 | 40,2018-06-22,login, 871 | 40,2018-06-22,view,P5 872 | 40,2018-06-22,view,P6 873 | 40,2018-06-22,view,P13 874 | 72,2018-06-23,login, 875 | 72,2018-06-23,view,P8 876 | 72,2018-06-23,view,P11 877 | 72,2018-06-23,view,P18 878 | 
72,2018-06-23,view,P19 879 | 29,2018-06-24,login, 880 | 29,2018-06-24,view,P7 881 | 29,2018-06-24,view,P8 882 | 29,2018-06-24,view,P13 883 | 29,2018-06-24,view,P18 884 | 59,2018-06-24,login, 885 | 59,2018-06-24,view,P1 886 | 59,2018-06-24,view,P4 887 | 59,2018-06-24,view,P11 888 | 59,2018-06-24,view,P15 889 | 61,2018-06-24,login, 890 | 61,2018-06-24,view,P4 891 | 61,2018-06-24,view,P18 892 | 80,2018-06-24,login, 893 | 80,2018-06-24,view,P1 894 | 80,2018-06-24,view,P2 895 | 80,2018-06-24,view,P4 896 | 80,2018-06-24,view,P7 897 | 80,2018-06-24,view,P12 898 | 27,2018-06-25,login, 899 | 27,2018-06-25,view,P10 900 | 27,2018-06-25,view,P14 901 | 84,2018-06-25,login, 902 | 84,2018-06-25,view,P2 903 | 84,2018-06-25,view,P3 904 | 84,2018-06-25,view,P13 905 | 88,2018-06-25,login, 906 | 88,2018-06-25,view,P6 907 | 88,2018-06-25,view,P17 908 | 36,2018-06-28,login, 909 | 36,2018-06-28,view,P4 910 | 36,2018-06-28,view,P5 911 | 36,2018-06-28,view,P8 912 | 36,2018-06-28,view,P14 913 | 29,2018-06-30,login, 914 | 29,2018-06-30,view,P8 915 | 29,2018-06-30,view,P19 916 | 30,2018-06-30,login, 917 | 30,2018-06-30,view,P1 918 | 30,2018-06-30,view,P2 919 | 30,2018-06-30,view,P5 920 | 30,2018-06-30,view,P17 921 | 30,2018-06-30,view,P19 922 | 90,2018-06-30,login, 923 | 90,2018-06-30,view,P3 924 | 90,2018-06-30,view,P8 925 | 90,2018-06-30,view,P10 926 | 90,2018-06-30,view,P14 927 | -------------------------------------------------------------------------------- /input/ch03/multi_table_train.csv: -------------------------------------------------------------------------------- 1 | user_id,product_id,target 2 | 1,P1,0 3 | 1,P2,0 4 | 1,P3,1 5 | 1,P4,0 6 | 1,P5,0 7 | 1,P6,0 8 | 1,P7,0 9 | 1,P8,0 10 | 1,P9,0 11 | 1,P10,0 12 | 1,P11,0 13 | 1,P12,0 14 | 1,P13,0 15 | 1,P14,0 16 | 1,P15,0 17 | 1,P16,0 18 | 1,P17,1 19 | 1,P18,0 20 | 1,P19,0 21 | 1,P20,0 22 | 2,P1,0 23 | 2,P2,1 24 | 2,P3,0 25 | 2,P4,0 26 | 2,P5,0 27 | 2,P6,0 28 | 2,P7,1 29 | 2,P8,0 30 | 2,P9,1 31 | 2,P10,1 32 | 2,P11,1 33 | 2,P12,1 34 | 2,P13,1 35 | 2,P14,0 36 | 2,P15,0 37 | 2,P16,0 38 | 2,P17,0 39 | 2,P18,0 40 | 2,P19,0 41 | 2,P20,0 42 | 3,P1,1 43 | 3,P2,1 44 | 3,P3,0 45 | 3,P4,0 46 | 3,P5,0 47 | 3,P6,0 48 | 3,P7,1 49 | 3,P8,0 50 | 3,P9,1 51 | 3,P10,0 52 | 3,P11,1 53 | 3,P12,0 54 | 3,P13,0 55 | 3,P14,1 56 | 3,P15,0 57 | 3,P16,0 58 | 3,P17,1 59 | 3,P18,0 60 | 3,P19,0 61 | 3,P20,0 62 | 4,P1,0 63 | 4,P2,0 64 | 4,P3,1 65 | 4,P4,1 66 | 4,P5,0 67 | 4,P6,0 68 | 4,P7,0 69 | 4,P8,0 70 | 4,P9,0 71 | 4,P10,1 72 | 4,P11,0 73 | 4,P12,0 74 | 4,P13,1 75 | 4,P14,0 76 | 4,P15,1 77 | 4,P16,0 78 | 4,P17,0 79 | 4,P18,1 80 | 4,P19,1 81 | 4,P20,1 82 | 5,P1,0 83 | 5,P2,0 84 | 5,P3,1 85 | 5,P4,1 86 | 5,P5,0 87 | 5,P6,0 88 | 5,P7,0 89 | 5,P8,0 90 | 5,P9,0 91 | 5,P10,0 92 | 5,P11,1 93 | 5,P12,1 94 | 5,P13,0 95 | 5,P14,1 96 | 5,P15,1 97 | 5,P16,0 98 | 5,P17,0 99 | 5,P18,1 100 | 5,P19,0 101 | 5,P20,0 102 | 6,P1,0 103 | 6,P2,1 104 | 6,P3,0 105 | 6,P4,1 106 | 6,P5,0 107 | 6,P6,0 108 | 6,P7,0 109 | 6,P8,1 110 | 6,P9,0 111 | 6,P10,0 112 | 6,P11,1 113 | 6,P12,0 114 | 6,P13,1 115 | 6,P14,0 116 | 6,P15,0 117 | 6,P16,0 118 | 6,P17,1 119 | 6,P18,0 120 | 6,P19,0 121 | 6,P20,0 122 | 7,P1,1 123 | 7,P2,1 124 | 7,P3,0 125 | 7,P4,0 126 | 7,P5,1 127 | 7,P6,0 128 | 7,P7,0 129 | 7,P8,0 130 | 7,P9,1 131 | 7,P10,0 132 | 7,P11,1 133 | 7,P12,1 134 | 7,P13,1 135 | 7,P14,1 136 | 7,P15,0 137 | 7,P16,0 138 | 7,P17,1 139 | 7,P18,0 140 | 7,P19,0 141 | 7,P20,0 142 | 8,P1,0 143 | 8,P2,0 144 | 8,P3,1 145 | 8,P4,1 146 | 8,P5,0 147 | 8,P6,1 148 | 8,P7,0 149 | 8,P8,0 150 | 8,P9,0 151 | 8,P10,1 152 | 8,P11,1 153 | 8,P12,1 
154 | 8,P13,0 155 | 8,P14,0 156 | 8,P15,0 157 | 8,P16,1 158 | 8,P17,0 159 | 8,P18,0 160 | 8,P19,1 161 | 8,P20,1 162 | 9,P1,0 163 | 9,P2,0 164 | 9,P3,0 165 | 9,P4,0 166 | 9,P5,0 167 | 9,P6,0 168 | 9,P7,0 169 | 9,P8,1 170 | 9,P9,0 171 | 9,P10,0 172 | 9,P11,1 173 | 9,P12,0 174 | 9,P13,0 175 | 9,P14,1 176 | 9,P15,0 177 | 9,P16,1 178 | 9,P17,0 179 | 9,P18,1 180 | 9,P19,1 181 | 9,P20,0 182 | 10,P1,1 183 | 10,P2,1 184 | 10,P3,0 185 | 10,P4,1 186 | 10,P5,0 187 | 10,P6,0 188 | 10,P7,1 189 | 10,P8,1 190 | 10,P9,0 191 | 10,P10,0 192 | 10,P11,1 193 | 10,P12,0 194 | 10,P13,1 195 | 10,P14,1 196 | 10,P15,1 197 | 10,P16,1 198 | 10,P17,0 199 | 10,P18,0 200 | 10,P19,0 201 | 10,P20,0 202 | 11,P1,0 203 | 11,P2,1 204 | 11,P3,1 205 | 11,P4,1 206 | 11,P5,0 207 | 11,P6,0 208 | 11,P7,0 209 | 11,P8,0 210 | 11,P9,0 211 | 11,P10,0 212 | 11,P11,1 213 | 11,P12,0 214 | 11,P13,0 215 | 11,P14,1 216 | 11,P15,0 217 | 11,P16,0 218 | 11,P17,0 219 | 11,P18,0 220 | 11,P19,0 221 | 11,P20,0 222 | 12,P1,1 223 | 12,P2,0 224 | 12,P3,0 225 | 12,P4,1 226 | 12,P5,0 227 | 12,P6,0 228 | 12,P7,0 229 | 12,P8,0 230 | 12,P9,0 231 | 12,P10,0 232 | 12,P11,0 233 | 12,P12,1 234 | 12,P13,0 235 | 12,P14,1 236 | 12,P15,1 237 | 12,P16,1 238 | 12,P17,1 239 | 12,P18,1 240 | 12,P19,0 241 | 12,P20,0 242 | 13,P1,1 243 | 13,P2,1 244 | 13,P3,0 245 | 13,P4,0 246 | 13,P5,1 247 | 13,P6,0 248 | 13,P7,0 249 | 13,P8,1 250 | 13,P9,1 251 | 13,P10,0 252 | 13,P11,0 253 | 13,P12,1 254 | 13,P13,1 255 | 13,P14,0 256 | 13,P15,0 257 | 13,P16,0 258 | 13,P17,0 259 | 13,P18,1 260 | 13,P19,1 261 | 13,P20,1 262 | 14,P1,1 263 | 14,P2,0 264 | 14,P3,1 265 | 14,P4,1 266 | 14,P5,0 267 | 14,P6,1 268 | 14,P7,1 269 | 14,P8,0 270 | 14,P9,0 271 | 14,P10,0 272 | 14,P11,0 273 | 14,P12,0 274 | 14,P13,1 275 | 14,P14,0 276 | 14,P15,1 277 | 14,P16,1 278 | 14,P17,0 279 | 14,P18,0 280 | 14,P19,1 281 | 14,P20,0 282 | 15,P1,0 283 | 15,P2,1 284 | 15,P3,0 285 | 15,P4,0 286 | 15,P5,1 287 | 15,P6,1 288 | 15,P7,1 289 | 15,P8,0 290 | 15,P9,0 291 | 15,P10,0 292 | 15,P11,0 293 | 15,P12,0 294 | 15,P13,0 295 | 15,P14,1 296 | 15,P15,0 297 | 15,P16,1 298 | 15,P17,0 299 | 15,P18,1 300 | 15,P19,0 301 | 15,P20,0 302 | 16,P1,0 303 | 16,P2,1 304 | 16,P3,0 305 | 16,P4,0 306 | 16,P5,0 307 | 16,P6,1 308 | 16,P7,1 309 | 16,P8,1 310 | 16,P9,0 311 | 16,P10,0 312 | 16,P11,0 313 | 16,P12,1 314 | 16,P13,0 315 | 16,P14,0 316 | 16,P15,0 317 | 16,P16,0 318 | 16,P17,0 319 | 16,P18,0 320 | 16,P19,1 321 | 16,P20,0 322 | 17,P1,1 323 | 17,P2,1 324 | 17,P3,1 325 | 17,P4,0 326 | 17,P5,1 327 | 17,P6,0 328 | 17,P7,0 329 | 17,P8,1 330 | 17,P9,0 331 | 17,P10,0 332 | 17,P11,1 333 | 17,P12,0 334 | 17,P13,1 335 | 17,P14,0 336 | 17,P15,0 337 | 17,P16,0 338 | 17,P17,1 339 | 17,P18,0 340 | 17,P19,1 341 | 17,P20,1 342 | 18,P1,1 343 | 18,P2,1 344 | 18,P3,0 345 | 18,P4,0 346 | 18,P5,0 347 | 18,P6,1 348 | 18,P7,0 349 | 18,P8,0 350 | 18,P9,0 351 | 18,P10,0 352 | 18,P11,0 353 | 18,P12,1 354 | 18,P13,0 355 | 18,P14,0 356 | 18,P15,0 357 | 18,P16,0 358 | 18,P17,1 359 | 18,P18,0 360 | 18,P19,0 361 | 18,P20,0 362 | 19,P1,1 363 | 19,P2,1 364 | 19,P3,0 365 | 19,P4,0 366 | 19,P5,0 367 | 19,P6,0 368 | 19,P7,0 369 | 19,P8,0 370 | 19,P9,0 371 | 19,P10,0 372 | 19,P11,0 373 | 19,P12,1 374 | 19,P13,1 375 | 19,P14,0 376 | 19,P15,0 377 | 19,P16,1 378 | 19,P17,1 379 | 19,P18,0 380 | 19,P19,0 381 | 19,P20,0 382 | 20,P1,0 383 | 20,P2,1 384 | 20,P3,0 385 | 20,P4,1 386 | 20,P5,0 387 | 20,P6,0 388 | 20,P7,0 389 | 20,P8,0 390 | 20,P9,1 391 | 20,P10,0 392 | 20,P11,0 393 | 20,P12,1 394 | 20,P13,1 395 | 20,P14,1 396 | 20,P15,0 397 | 20,P16,0 398 | 20,P17,0 399 | 20,P18,0 
400 | 20,P19,0 401 | 20,P20,1 402 | 21,P1,0 403 | 21,P2,0 404 | 21,P3,0 405 | 21,P4,0 406 | 21,P5,1 407 | 21,P6,0 408 | 21,P7,1 409 | 21,P8,1 410 | 21,P9,1 411 | 21,P10,0 412 | 21,P11,1 413 | 21,P12,0 414 | 21,P13,0 415 | 21,P14,1 416 | 21,P15,1 417 | 21,P16,0 418 | 21,P17,1 419 | 21,P18,0 420 | 21,P19,1 421 | 21,P20,0 422 | 22,P1,1 423 | 22,P2,0 424 | 22,P3,0 425 | 22,P4,1 426 | 22,P5,0 427 | 22,P6,0 428 | 22,P7,0 429 | 22,P8,0 430 | 22,P9,1 431 | 22,P10,1 432 | 22,P11,1 433 | 22,P12,0 434 | 22,P13,0 435 | 22,P14,0 436 | 22,P15,1 437 | 22,P16,0 438 | 22,P17,0 439 | 22,P18,0 440 | 22,P19,0 441 | 22,P20,1 442 | 23,P1,1 443 | 23,P2,0 444 | 23,P3,1 445 | 23,P4,0 446 | 23,P5,0 447 | 23,P6,0 448 | 23,P7,0 449 | 23,P8,0 450 | 23,P9,0 451 | 23,P10,0 452 | 23,P11,1 453 | 23,P12,0 454 | 23,P13,1 455 | 23,P14,0 456 | 23,P15,0 457 | 23,P16,0 458 | 23,P17,0 459 | 23,P18,0 460 | 23,P19,0 461 | 23,P20,1 462 | 24,P1,1 463 | 24,P2,1 464 | 24,P3,0 465 | 24,P4,1 466 | 24,P5,0 467 | 24,P6,1 468 | 24,P7,0 469 | 24,P8,1 470 | 24,P9,1 471 | 24,P10,0 472 | 24,P11,1 473 | 24,P12,0 474 | 24,P13,0 475 | 24,P14,1 476 | 24,P15,1 477 | 24,P16,0 478 | 24,P17,1 479 | 24,P18,0 480 | 24,P19,0 481 | 24,P20,0 482 | 25,P1,0 483 | 25,P2,1 484 | 25,P3,0 485 | 25,P4,1 486 | 25,P5,0 487 | 25,P6,0 488 | 25,P7,0 489 | 25,P8,1 490 | 25,P9,0 491 | 25,P10,0 492 | 25,P11,0 493 | 25,P12,0 494 | 25,P13,1 495 | 25,P14,1 496 | 25,P15,0 497 | 25,P16,0 498 | 25,P17,0 499 | 25,P18,0 500 | 25,P19,0 501 | 25,P20,0 502 | 26,P1,0 503 | 26,P2,0 504 | 26,P3,0 505 | 26,P4,1 506 | 26,P5,0 507 | 26,P6,0 508 | 26,P7,0 509 | 26,P8,0 510 | 26,P9,0 511 | 26,P10,1 512 | 26,P11,1 513 | 26,P12,0 514 | 26,P13,1 515 | 26,P14,1 516 | 26,P15,1 517 | 26,P16,1 518 | 26,P17,0 519 | 26,P18,0 520 | 26,P19,0 521 | 26,P20,0 522 | 27,P1,0 523 | 27,P2,1 524 | 27,P3,0 525 | 27,P4,0 526 | 27,P5,0 527 | 27,P6,0 528 | 27,P7,0 529 | 27,P8,0 530 | 27,P9,0 531 | 27,P10,0 532 | 27,P11,0 533 | 27,P12,0 534 | 27,P13,1 535 | 27,P14,1 536 | 27,P15,1 537 | 27,P16,0 538 | 27,P17,0 539 | 27,P18,0 540 | 27,P19,0 541 | 27,P20,0 542 | 28,P1,0 543 | 28,P2,1 544 | 28,P3,0 545 | 28,P4,0 546 | 28,P5,0 547 | 28,P6,0 548 | 28,P7,1 549 | 28,P8,1 550 | 28,P9,1 551 | 28,P10,0 552 | 28,P11,1 553 | 28,P12,0 554 | 28,P13,0 555 | 28,P14,1 556 | 28,P15,0 557 | 28,P16,0 558 | 28,P17,0 559 | 28,P18,0 560 | 28,P19,1 561 | 28,P20,0 562 | 29,P1,0 563 | 29,P2,0 564 | 29,P3,0 565 | 29,P4,0 566 | 29,P5,0 567 | 29,P6,1 568 | 29,P7,1 569 | 29,P8,0 570 | 29,P9,0 571 | 29,P10,0 572 | 29,P11,0 573 | 29,P12,0 574 | 29,P13,0 575 | 29,P14,0 576 | 29,P15,1 577 | 29,P16,1 578 | 29,P17,0 579 | 29,P18,0 580 | 29,P19,0 581 | 29,P20,0 582 | 30,P1,0 583 | 30,P2,1 584 | 30,P3,1 585 | 30,P4,0 586 | 30,P5,1 587 | 30,P6,0 588 | 30,P7,0 589 | 30,P8,1 590 | 30,P9,0 591 | 30,P10,1 592 | 30,P11,0 593 | 30,P12,0 594 | 30,P13,0 595 | 30,P14,0 596 | 30,P15,0 597 | 30,P16,0 598 | 30,P17,0 599 | 30,P18,0 600 | 30,P19,0 601 | 30,P20,0 602 | 31,P1,1 603 | 31,P2,1 604 | 31,P3,0 605 | 31,P4,1 606 | 31,P5,1 607 | 31,P6,0 608 | 31,P7,0 609 | 31,P8,1 610 | 31,P9,1 611 | 31,P10,1 612 | 31,P11,1 613 | 31,P12,0 614 | 31,P13,0 615 | 31,P14,1 616 | 31,P15,1 617 | 31,P16,0 618 | 31,P17,0 619 | 31,P18,0 620 | 31,P19,0 621 | 31,P20,0 622 | 32,P1,0 623 | 32,P2,0 624 | 32,P3,0 625 | 32,P4,1 626 | 32,P5,1 627 | 32,P6,1 628 | 32,P7,0 629 | 32,P8,0 630 | 32,P9,0 631 | 32,P10,1 632 | 32,P11,0 633 | 32,P12,1 634 | 32,P13,1 635 | 32,P14,0 636 | 32,P15,0 637 | 32,P16,0 638 | 32,P17,1 639 | 32,P18,0 640 | 32,P19,1 641 | 32,P20,1 642 | 33,P1,0 643 | 33,P2,0 644 
| 33,P3,0 645 | 33,P4,0 646 | 33,P5,0 647 | 33,P6,0 648 | 33,P7,1 649 | 33,P8,1 650 | 33,P9,0 651 | 33,P10,1 652 | 33,P11,0 653 | 33,P12,0 654 | 33,P13,0 655 | 33,P14,0 656 | 33,P15,0 657 | 33,P16,0 658 | 33,P17,0 659 | 33,P18,0 660 | 33,P19,0 661 | 33,P20,0 662 | 34,P1,1 663 | 34,P2,1 664 | 34,P3,1 665 | 34,P4,0 666 | 34,P5,1 667 | 34,P6,1 668 | 34,P7,1 669 | 34,P8,0 670 | 34,P9,1 671 | 34,P10,0 672 | 34,P11,1 673 | 34,P12,0 674 | 34,P13,1 675 | 34,P14,0 676 | 34,P15,0 677 | 34,P16,0 678 | 34,P17,0 679 | 34,P18,1 680 | 34,P19,0 681 | 34,P20,0 682 | 35,P1,1 683 | 35,P2,1 684 | 35,P3,0 685 | 35,P4,0 686 | 35,P5,0 687 | 35,P6,0 688 | 35,P7,1 689 | 35,P8,1 690 | 35,P9,0 691 | 35,P10,0 692 | 35,P11,1 693 | 35,P12,0 694 | 35,P13,0 695 | 35,P14,0 696 | 35,P15,0 697 | 35,P16,0 698 | 35,P17,0 699 | 35,P18,1 700 | 35,P19,0 701 | 35,P20,0 702 | 36,P1,0 703 | 36,P2,1 704 | 36,P3,0 705 | 36,P4,1 706 | 36,P5,0 707 | 36,P6,0 708 | 36,P7,0 709 | 36,P8,0 710 | 36,P9,1 711 | 36,P10,0 712 | 36,P11,1 713 | 36,P12,0 714 | 36,P13,0 715 | 36,P14,1 716 | 36,P15,0 717 | 36,P16,0 718 | 36,P17,0 719 | 36,P18,0 720 | 36,P19,0 721 | 36,P20,1 722 | 37,P1,1 723 | 37,P2,1 724 | 37,P3,0 725 | 37,P4,1 726 | 37,P5,0 727 | 37,P6,0 728 | 37,P7,0 729 | 37,P8,1 730 | 37,P9,1 731 | 37,P10,1 732 | 37,P11,0 733 | 37,P12,1 734 | 37,P13,1 735 | 37,P14,0 736 | 37,P15,0 737 | 37,P16,0 738 | 37,P17,0 739 | 37,P18,0 740 | 37,P19,0 741 | 37,P20,1 742 | 38,P1,1 743 | 38,P2,1 744 | 38,P3,0 745 | 38,P4,1 746 | 38,P5,0 747 | 38,P6,0 748 | 38,P7,1 749 | 38,P8,0 750 | 38,P9,0 751 | 38,P10,0 752 | 38,P11,0 753 | 38,P12,1 754 | 38,P13,0 755 | 38,P14,0 756 | 38,P15,0 757 | 38,P16,0 758 | 38,P17,1 759 | 38,P18,0 760 | 38,P19,0 761 | 38,P20,0 762 | 39,P1,0 763 | 39,P2,0 764 | 39,P3,0 765 | 39,P4,1 766 | 39,P5,0 767 | 39,P6,0 768 | 39,P7,0 769 | 39,P8,0 770 | 39,P9,1 771 | 39,P10,0 772 | 39,P11,0 773 | 39,P12,1 774 | 39,P13,1 775 | 39,P14,0 776 | 39,P15,0 777 | 39,P16,1 778 | 39,P17,0 779 | 39,P18,0 780 | 39,P19,0 781 | 39,P20,1 782 | 40,P1,0 783 | 40,P2,1 784 | 40,P3,0 785 | 40,P4,0 786 | 40,P5,1 787 | 40,P6,0 788 | 40,P7,0 789 | 40,P8,0 790 | 40,P9,0 791 | 40,P10,1 792 | 40,P11,0 793 | 40,P12,0 794 | 40,P13,1 795 | 40,P14,0 796 | 40,P15,1 797 | 40,P16,0 798 | 40,P17,0 799 | 40,P18,1 800 | 40,P19,1 801 | 40,P20,0 802 | 41,P1,1 803 | 41,P2,0 804 | 41,P3,0 805 | 41,P4,0 806 | 41,P5,0 807 | 41,P6,1 808 | 41,P7,1 809 | 41,P8,1 810 | 41,P9,0 811 | 41,P10,0 812 | 41,P11,1 813 | 41,P12,1 814 | 41,P13,1 815 | 41,P14,0 816 | 41,P15,0 817 | 41,P16,0 818 | 41,P17,0 819 | 41,P18,0 820 | 41,P19,0 821 | 41,P20,0 822 | 42,P1,0 823 | 42,P2,1 824 | 42,P3,0 825 | 42,P4,1 826 | 42,P5,0 827 | 42,P6,0 828 | 42,P7,0 829 | 42,P8,0 830 | 42,P9,1 831 | 42,P10,0 832 | 42,P11,1 833 | 42,P12,1 834 | 42,P13,0 835 | 42,P14,0 836 | 42,P15,0 837 | 42,P16,0 838 | 42,P17,0 839 | 42,P18,0 840 | 42,P19,0 841 | 42,P20,0 842 | 43,P1,1 843 | 43,P2,1 844 | 43,P3,1 845 | 43,P4,1 846 | 43,P5,1 847 | 43,P6,0 848 | 43,P7,1 849 | 43,P8,0 850 | 43,P9,1 851 | 43,P10,0 852 | 43,P11,1 853 | 43,P12,1 854 | 43,P13,1 855 | 43,P14,1 856 | 43,P15,0 857 | 43,P16,0 858 | 43,P17,1 859 | 43,P18,1 860 | 43,P19,1 861 | 43,P20,0 862 | 44,P1,0 863 | 44,P2,1 864 | 44,P3,1 865 | 44,P4,0 866 | 44,P5,0 867 | 44,P6,0 868 | 44,P7,0 869 | 44,P8,1 870 | 44,P9,0 871 | 44,P10,0 872 | 44,P11,1 873 | 44,P12,0 874 | 44,P13,1 875 | 44,P14,0 876 | 44,P15,0 877 | 44,P16,0 878 | 44,P17,1 879 | 44,P18,0 880 | 44,P19,1 881 | 44,P20,1 882 | 45,P1,0 883 | 45,P2,0 884 | 45,P3,1 885 | 45,P4,0 886 | 45,P5,0 887 | 45,P6,0 888 | 
45,P7,1 889 | 45,P8,0 890 | 45,P9,0 891 | 45,P10,0 892 | 45,P11,1 893 | 45,P12,0 894 | 45,P13,0 895 | 45,P14,1 896 | 45,P15,0 897 | 45,P16,1 898 | 45,P17,1 899 | 45,P18,0 900 | 45,P19,0 901 | 45,P20,0 902 | 46,P1,0 903 | 46,P2,0 904 | 46,P3,0 905 | 46,P4,0 906 | 46,P5,1 907 | 46,P6,0 908 | 46,P7,1 909 | 46,P8,0 910 | 46,P9,1 911 | 46,P10,0 912 | 46,P11,0 913 | 46,P12,0 914 | 46,P13,0 915 | 46,P14,0 916 | 46,P15,1 917 | 46,P16,0 918 | 46,P17,0 919 | 46,P18,0 920 | 46,P19,0 921 | 46,P20,0 922 | 47,P1,0 923 | 47,P2,0 924 | 47,P3,0 925 | 47,P4,1 926 | 47,P5,0 927 | 47,P6,1 928 | 47,P7,0 929 | 47,P8,0 930 | 47,P9,0 931 | 47,P10,1 932 | 47,P11,0 933 | 47,P12,0 934 | 47,P13,0 935 | 47,P14,0 936 | 47,P15,0 937 | 47,P16,0 938 | 47,P17,0 939 | 47,P18,0 940 | 47,P19,0 941 | 47,P20,0 942 | 48,P1,1 943 | 48,P2,0 944 | 48,P3,1 945 | 48,P4,0 946 | 48,P5,1 947 | 48,P6,0 948 | 48,P7,1 949 | 48,P8,0 950 | 48,P9,1 951 | 48,P10,0 952 | 48,P11,0 953 | 48,P12,0 954 | 48,P13,1 955 | 48,P14,1 956 | 48,P15,0 957 | 48,P16,0 958 | 48,P17,1 959 | 48,P18,0 960 | 48,P19,1 961 | 48,P20,0 962 | 49,P1,0 963 | 49,P2,0 964 | 49,P3,0 965 | 49,P4,0 966 | 49,P5,1 967 | 49,P6,0 968 | 49,P7,0 969 | 49,P8,0 970 | 49,P9,0 971 | 49,P10,1 972 | 49,P11,0 973 | 49,P12,0 974 | 49,P13,0 975 | 49,P14,0 976 | 49,P15,0 977 | 49,P16,1 978 | 49,P17,0 979 | 49,P18,0 980 | 49,P19,1 981 | 49,P20,0 982 | 50,P1,0 983 | 50,P2,0 984 | 50,P3,0 985 | 50,P4,0 986 | 50,P5,0 987 | 50,P6,0 988 | 50,P7,0 989 | 50,P8,0 990 | 50,P9,0 991 | 50,P10,0 992 | 50,P11,0 993 | 50,P12,0 994 | 50,P13,1 995 | 50,P14,1 996 | 50,P15,0 997 | 50,P16,1 998 | 50,P17,1 999 | 50,P18,0 1000 | 50,P19,1 1001 | 50,P20,1 1002 | 51,P1,0 1003 | 51,P2,0 1004 | 51,P3,1 1005 | 51,P4,1 1006 | 51,P5,0 1007 | 51,P6,0 1008 | 51,P7,0 1009 | 51,P8,0 1010 | 51,P9,0 1011 | 51,P10,0 1012 | 51,P11,1 1013 | 51,P12,1 1014 | 51,P13,0 1015 | 51,P14,1 1016 | 51,P15,0 1017 | 51,P16,1 1018 | 51,P17,0 1019 | 51,P18,0 1020 | 51,P19,0 1021 | 51,P20,0 1022 | 52,P1,0 1023 | 52,P2,0 1024 | 52,P3,0 1025 | 52,P4,1 1026 | 52,P5,0 1027 | 52,P6,0 1028 | 52,P7,0 1029 | 52,P8,1 1030 | 52,P9,0 1031 | 52,P10,1 1032 | 52,P11,1 1033 | 52,P12,0 1034 | 52,P13,0 1035 | 52,P14,0 1036 | 52,P15,0 1037 | 52,P16,0 1038 | 52,P17,1 1039 | 52,P18,0 1040 | 52,P19,0 1041 | 52,P20,0 1042 | 53,P1,0 1043 | 53,P2,0 1044 | 53,P3,0 1045 | 53,P4,0 1046 | 53,P5,0 1047 | 53,P6,1 1048 | 53,P7,0 1049 | 53,P8,0 1050 | 53,P9,0 1051 | 53,P10,0 1052 | 53,P11,1 1053 | 53,P12,1 1054 | 53,P13,0 1055 | 53,P14,0 1056 | 53,P15,0 1057 | 53,P16,1 1058 | 53,P17,0 1059 | 53,P18,0 1060 | 53,P19,0 1061 | 53,P20,0 1062 | 54,P1,1 1063 | 54,P2,1 1064 | 54,P3,1 1065 | 54,P4,0 1066 | 54,P5,0 1067 | 54,P6,0 1068 | 54,P7,0 1069 | 54,P8,0 1070 | 54,P9,1 1071 | 54,P10,1 1072 | 54,P11,0 1073 | 54,P12,0 1074 | 54,P13,0 1075 | 54,P14,0 1076 | 54,P15,1 1077 | 54,P16,1 1078 | 54,P17,0 1079 | 54,P18,0 1080 | 54,P19,0 1081 | 54,P20,0 1082 | 55,P1,1 1083 | 55,P2,0 1084 | 55,P3,0 1085 | 55,P4,0 1086 | 55,P5,0 1087 | 55,P6,0 1088 | 55,P7,0 1089 | 55,P8,1 1090 | 55,P9,0 1091 | 55,P10,0 1092 | 55,P11,1 1093 | 55,P12,1 1094 | 55,P13,1 1095 | 55,P14,0 1096 | 55,P15,0 1097 | 55,P16,0 1098 | 55,P17,1 1099 | 55,P18,0 1100 | 55,P19,0 1101 | 55,P20,0 1102 | 56,P1,1 1103 | 56,P2,0 1104 | 56,P3,0 1105 | 56,P4,0 1106 | 56,P5,0 1107 | 56,P6,0 1108 | 56,P7,0 1109 | 56,P8,0 1110 | 56,P9,1 1111 | 56,P10,1 1112 | 56,P11,0 1113 | 56,P12,1 1114 | 56,P13,0 1115 | 56,P14,1 1116 | 56,P15,0 1117 | 56,P16,0 1118 | 56,P17,0 1119 | 56,P18,0 1120 | 56,P19,0 1121 | 56,P20,1 1122 | 57,P1,0 1123 | 57,P2,0 
1124 | 57,P3,0 1125 | 57,P4,1 1126 | 57,P5,0 1127 | 57,P6,0 1128 | 57,P7,1 1129 | 57,P8,1 1130 | 57,P9,0 1131 | 57,P10,0 1132 | 57,P11,0 1133 | 57,P12,1 1134 | 57,P13,0 1135 | 57,P14,0 1136 | 57,P15,0 1137 | 57,P16,0 1138 | 57,P17,0 1139 | 57,P18,1 1140 | 57,P19,1 1141 | 57,P20,1 1142 | 58,P1,1 1143 | 58,P2,1 1144 | 58,P3,0 1145 | 58,P4,0 1146 | 58,P5,1 1147 | 58,P6,0 1148 | 58,P7,1 1149 | 58,P8,1 1150 | 58,P9,0 1151 | 58,P10,0 1152 | 58,P11,0 1153 | 58,P12,0 1154 | 58,P13,0 1155 | 58,P14,0 1156 | 58,P15,0 1157 | 58,P16,1 1158 | 58,P17,0 1159 | 58,P18,1 1160 | 58,P19,0 1161 | 58,P20,0 1162 | 59,P1,0 1163 | 59,P2,0 1164 | 59,P3,0 1165 | 59,P4,1 1166 | 59,P5,1 1167 | 59,P6,1 1168 | 59,P7,0 1169 | 59,P8,1 1170 | 59,P9,0 1171 | 59,P10,0 1172 | 59,P11,1 1173 | 59,P12,0 1174 | 59,P13,0 1175 | 59,P14,1 1176 | 59,P15,0 1177 | 59,P16,1 1178 | 59,P17,0 1179 | 59,P18,0 1180 | 59,P19,0 1181 | 59,P20,0 1182 | 60,P1,0 1183 | 60,P2,1 1184 | 60,P3,0 1185 | 60,P4,1 1186 | 60,P5,0 1187 | 60,P6,1 1188 | 60,P7,0 1189 | 60,P8,1 1190 | 60,P9,0 1191 | 60,P10,1 1192 | 60,P11,0 1193 | 60,P12,0 1194 | 60,P13,0 1195 | 60,P14,0 1196 | 60,P15,0 1197 | 60,P16,1 1198 | 60,P17,0 1199 | 60,P18,1 1200 | 60,P19,0 1201 | 60,P20,0 1202 | 61,P1,0 1203 | 61,P2,0 1204 | 61,P3,0 1205 | 61,P4,1 1206 | 61,P5,0 1207 | 61,P6,0 1208 | 61,P7,0 1209 | 61,P8,1 1210 | 61,P9,1 1211 | 61,P10,0 1212 | 61,P11,1 1213 | 61,P12,0 1214 | 61,P13,0 1215 | 61,P14,1 1216 | 61,P15,1 1217 | 61,P16,0 1218 | 61,P17,0 1219 | 61,P18,1 1220 | 61,P19,0 1221 | 61,P20,0 1222 | 62,P1,1 1223 | 62,P2,1 1224 | 62,P3,0 1225 | 62,P4,1 1226 | 62,P5,1 1227 | 62,P6,1 1228 | 62,P7,0 1229 | 62,P8,0 1230 | 62,P9,0 1231 | 62,P10,0 1232 | 62,P11,1 1233 | 62,P12,0 1234 | 62,P13,0 1235 | 62,P14,0 1236 | 62,P15,0 1237 | 62,P16,0 1238 | 62,P17,1 1239 | 62,P18,0 1240 | 62,P19,1 1241 | 62,P20,0 1242 | 63,P1,1 1243 | 63,P2,1 1244 | 63,P3,1 1245 | 63,P4,1 1246 | 63,P5,0 1247 | 63,P6,0 1248 | 63,P7,1 1249 | 63,P8,1 1250 | 63,P9,1 1251 | 63,P10,0 1252 | 63,P11,1 1253 | 63,P12,1 1254 | 63,P13,0 1255 | 63,P14,1 1256 | 63,P15,0 1257 | 63,P16,0 1258 | 63,P17,0 1259 | 63,P18,0 1260 | 63,P19,0 1261 | 63,P20,0 1262 | 64,P1,1 1263 | 64,P2,0 1264 | 64,P3,0 1265 | 64,P4,1 1266 | 64,P5,0 1267 | 64,P6,0 1268 | 64,P7,0 1269 | 64,P8,0 1270 | 64,P9,1 1271 | 64,P10,0 1272 | 64,P11,1 1273 | 64,P12,0 1274 | 64,P13,1 1275 | 64,P14,1 1276 | 64,P15,0 1277 | 64,P16,1 1278 | 64,P17,1 1279 | 64,P18,0 1280 | 64,P19,1 1281 | 64,P20,0 1282 | 65,P1,0 1283 | 65,P2,1 1284 | 65,P3,1 1285 | 65,P4,1 1286 | 65,P5,0 1287 | 65,P6,1 1288 | 65,P7,0 1289 | 65,P8,0 1290 | 65,P9,0 1291 | 65,P10,1 1292 | 65,P11,0 1293 | 65,P12,0 1294 | 65,P13,0 1295 | 65,P14,0 1296 | 65,P15,0 1297 | 65,P16,1 1298 | 65,P17,0 1299 | 65,P18,0 1300 | 65,P19,0 1301 | 65,P20,0 1302 | 66,P1,1 1303 | 66,P2,1 1304 | 66,P3,1 1305 | 66,P4,0 1306 | 66,P5,0 1307 | 66,P6,1 1308 | 66,P7,0 1309 | 66,P8,1 1310 | 66,P9,1 1311 | 66,P10,0 1312 | 66,P11,0 1313 | 66,P12,0 1314 | 66,P13,0 1315 | 66,P14,0 1316 | 66,P15,0 1317 | 66,P16,1 1318 | 66,P17,1 1319 | 66,P18,0 1320 | 66,P19,0 1321 | 66,P20,0 1322 | 67,P1,0 1323 | 67,P2,0 1324 | 67,P3,0 1325 | 67,P4,0 1326 | 67,P5,1 1327 | 67,P6,0 1328 | 67,P7,0 1329 | 67,P8,1 1330 | 67,P9,0 1331 | 67,P10,0 1332 | 67,P11,1 1333 | 67,P12,1 1334 | 67,P13,0 1335 | 67,P14,1 1336 | 67,P15,0 1337 | 67,P16,0 1338 | 67,P17,0 1339 | 67,P18,0 1340 | 67,P19,1 1341 | 67,P20,0 1342 | 68,P1,1 1343 | 68,P2,0 1344 | 68,P3,0 1345 | 68,P4,0 1346 | 68,P5,1 1347 | 68,P6,0 1348 | 68,P7,0 1349 | 68,P8,0 1350 | 68,P9,1 1351 | 68,P10,0 1352 | 
68,P11,1 1353 | 68,P12,0 1354 | 68,P13,0 1355 | 68,P14,1 1356 | 68,P15,0 1357 | 68,P16,0 1358 | 68,P17,0 1359 | 68,P18,0 1360 | 68,P19,1 1361 | 68,P20,0 1362 | 69,P1,0 1363 | 69,P2,1 1364 | 69,P3,1 1365 | 69,P4,1 1366 | 69,P5,0 1367 | 69,P6,0 1368 | 69,P7,1 1369 | 69,P8,0 1370 | 69,P9,0 1371 | 69,P10,0 1372 | 69,P11,1 1373 | 69,P12,0 1374 | 69,P13,0 1375 | 69,P14,1 1376 | 69,P15,0 1377 | 69,P16,0 1378 | 69,P17,1 1379 | 69,P18,0 1380 | 69,P19,0 1381 | 69,P20,1 1382 | 70,P1,1 1383 | 70,P2,0 1384 | 70,P3,1 1385 | 70,P4,0 1386 | 70,P5,0 1387 | 70,P6,0 1388 | 70,P7,1 1389 | 70,P8,0 1390 | 70,P9,1 1391 | 70,P10,0 1392 | 70,P11,0 1393 | 70,P12,0 1394 | 70,P13,0 1395 | 70,P14,0 1396 | 70,P15,0 1397 | 70,P16,1 1398 | 70,P17,0 1399 | 70,P18,1 1400 | 70,P19,0 1401 | 70,P20,0 1402 | 71,P1,1 1403 | 71,P2,1 1404 | 71,P3,1 1405 | 71,P4,0 1406 | 71,P5,0 1407 | 71,P6,0 1408 | 71,P7,1 1409 | 71,P8,1 1410 | 71,P9,0 1411 | 71,P10,1 1412 | 71,P11,0 1413 | 71,P12,1 1414 | 71,P13,0 1415 | 71,P14,1 1416 | 71,P15,1 1417 | 71,P16,0 1418 | 71,P17,0 1419 | 71,P18,0 1420 | 71,P19,0 1421 | 71,P20,0 1422 | 72,P1,0 1423 | 72,P2,0 1424 | 72,P3,0 1425 | 72,P4,0 1426 | 72,P5,0 1427 | 72,P6,0 1428 | 72,P7,1 1429 | 72,P8,1 1430 | 72,P9,0 1431 | 72,P10,0 1432 | 72,P11,1 1433 | 72,P12,0 1434 | 72,P13,0 1435 | 72,P14,1 1436 | 72,P15,1 1437 | 72,P16,0 1438 | 72,P17,0 1439 | 72,P18,0 1440 | 72,P19,0 1441 | 72,P20,1 1442 | 73,P1,0 1443 | 73,P2,1 1444 | 73,P3,0 1445 | 73,P4,1 1446 | 73,P5,0 1447 | 73,P6,0 1448 | 73,P7,0 1449 | 73,P8,0 1450 | 73,P9,0 1451 | 73,P10,1 1452 | 73,P11,0 1453 | 73,P12,0 1454 | 73,P13,1 1455 | 73,P14,1 1456 | 73,P15,0 1457 | 73,P16,1 1458 | 73,P17,1 1459 | 73,P18,0 1460 | 73,P19,0 1461 | 73,P20,0 1462 | 74,P1,1 1463 | 74,P2,0 1464 | 74,P3,0 1465 | 74,P4,1 1466 | 74,P5,1 1467 | 74,P6,0 1468 | 74,P7,0 1469 | 74,P8,0 1470 | 74,P9,0 1471 | 74,P10,0 1472 | 74,P11,1 1473 | 74,P12,0 1474 | 74,P13,1 1475 | 74,P14,1 1476 | 74,P15,0 1477 | 74,P16,0 1478 | 74,P17,0 1479 | 74,P18,0 1480 | 74,P19,0 1481 | 74,P20,0 1482 | 75,P1,0 1483 | 75,P2,0 1484 | 75,P3,1 1485 | 75,P4,1 1486 | 75,P5,0 1487 | 75,P6,1 1488 | 75,P7,1 1489 | 75,P8,0 1490 | 75,P9,1 1491 | 75,P10,1 1492 | 75,P11,0 1493 | 75,P12,1 1494 | 75,P13,0 1495 | 75,P14,1 1496 | 75,P15,0 1497 | 75,P16,0 1498 | 75,P17,1 1499 | 75,P18,0 1500 | 75,P19,0 1501 | 75,P20,0 1502 | 76,P1,0 1503 | 76,P2,1 1504 | 76,P3,0 1505 | 76,P4,0 1506 | 76,P5,0 1507 | 76,P6,0 1508 | 76,P7,0 1509 | 76,P8,0 1510 | 76,P9,1 1511 | 76,P10,0 1512 | 76,P11,1 1513 | 76,P12,0 1514 | 76,P13,0 1515 | 76,P14,1 1516 | 76,P15,0 1517 | 76,P16,1 1518 | 76,P17,0 1519 | 76,P18,1 1520 | 76,P19,0 1521 | 76,P20,0 1522 | 77,P1,1 1523 | 77,P2,1 1524 | 77,P3,1 1525 | 77,P4,1 1526 | 77,P5,0 1527 | 77,P6,0 1528 | 77,P7,0 1529 | 77,P8,0 1530 | 77,P9,0 1531 | 77,P10,0 1532 | 77,P11,1 1533 | 77,P12,1 1534 | 77,P13,0 1535 | 77,P14,0 1536 | 77,P15,0 1537 | 77,P16,0 1538 | 77,P17,0 1539 | 77,P18,0 1540 | 77,P19,1 1541 | 77,P20,1 1542 | 78,P1,0 1543 | 78,P2,0 1544 | 78,P3,0 1545 | 78,P4,0 1546 | 78,P5,1 1547 | 78,P6,0 1548 | 78,P7,0 1549 | 78,P8,1 1550 | 78,P9,1 1551 | 78,P10,0 1552 | 78,P11,0 1553 | 78,P12,0 1554 | 78,P13,1 1555 | 78,P14,0 1556 | 78,P15,0 1557 | 78,P16,0 1558 | 78,P17,1 1559 | 78,P18,0 1560 | 78,P19,0 1561 | 78,P20,0 1562 | 79,P1,0 1563 | 79,P2,0 1564 | 79,P3,0 1565 | 79,P4,1 1566 | 79,P5,0 1567 | 79,P6,0 1568 | 79,P7,0 1569 | 79,P8,0 1570 | 79,P9,1 1571 | 79,P10,0 1572 | 79,P11,1 1573 | 79,P12,0 1574 | 79,P13,0 1575 | 79,P14,0 1576 | 79,P15,0 1577 | 79,P16,0 1578 | 79,P17,0 1579 | 79,P18,0 1580 | 
79,P19,1 1581 | 79,P20,0 1582 | 80,P1,1 1583 | 80,P2,1 1584 | 80,P3,0 1585 | 80,P4,1 1586 | 80,P5,0 1587 | 80,P6,0 1588 | 80,P7,0 1589 | 80,P8,0 1590 | 80,P9,0 1591 | 80,P10,0 1592 | 80,P11,0 1593 | 80,P12,0 1594 | 80,P13,0 1595 | 80,P14,0 1596 | 80,P15,1 1597 | 80,P16,0 1598 | 80,P17,1 1599 | 80,P18,0 1600 | 80,P19,0 1601 | 80,P20,0 1602 | 81,P1,0 1603 | 81,P2,1 1604 | 81,P3,1 1605 | 81,P4,0 1606 | 81,P5,0 1607 | 81,P6,0 1608 | 81,P7,0 1609 | 81,P8,1 1610 | 81,P9,0 1611 | 81,P10,1 1612 | 81,P11,1 1613 | 81,P12,0 1614 | 81,P13,1 1615 | 81,P14,1 1616 | 81,P15,0 1617 | 81,P16,0 1618 | 81,P17,0 1619 | 81,P18,0 1620 | 81,P19,0 1621 | 81,P20,1 1622 | 82,P1,0 1623 | 82,P2,1 1624 | 82,P3,0 1625 | 82,P4,0 1626 | 82,P5,1 1627 | 82,P6,0 1628 | 82,P7,0 1629 | 82,P8,0 1630 | 82,P9,0 1631 | 82,P10,1 1632 | 82,P11,0 1633 | 82,P12,0 1634 | 82,P13,1 1635 | 82,P14,1 1636 | 82,P15,0 1637 | 82,P16,0 1638 | 82,P17,1 1639 | 82,P18,1 1640 | 82,P19,0 1641 | 82,P20,0 1642 | 83,P1,1 1643 | 83,P2,1 1644 | 83,P3,0 1645 | 83,P4,0 1646 | 83,P5,0 1647 | 83,P6,0 1648 | 83,P7,0 1649 | 83,P8,0 1650 | 83,P9,0 1651 | 83,P10,0 1652 | 83,P11,0 1653 | 83,P12,1 1654 | 83,P13,1 1655 | 83,P14,0 1656 | 83,P15,1 1657 | 83,P16,0 1658 | 83,P17,1 1659 | 83,P18,1 1660 | 83,P19,0 1661 | 83,P20,0 1662 | 84,P1,0 1663 | 84,P2,1 1664 | 84,P3,0 1665 | 84,P4,0 1666 | 84,P5,0 1667 | 84,P6,0 1668 | 84,P7,0 1669 | 84,P8,0 1670 | 84,P9,0 1671 | 84,P10,0 1672 | 84,P11,1 1673 | 84,P12,1 1674 | 84,P13,1 1675 | 84,P14,1 1676 | 84,P15,0 1677 | 84,P16,0 1678 | 84,P17,0 1679 | 84,P18,0 1680 | 84,P19,0 1681 | 84,P20,0 1682 | 85,P1,0 1683 | 85,P2,1 1684 | 85,P3,0 1685 | 85,P4,1 1686 | 85,P5,0 1687 | 85,P6,0 1688 | 85,P7,1 1689 | 85,P8,0 1690 | 85,P9,1 1691 | 85,P10,0 1692 | 85,P11,1 1693 | 85,P12,0 1694 | 85,P13,0 1695 | 85,P14,1 1696 | 85,P15,0 1697 | 85,P16,0 1698 | 85,P17,0 1699 | 85,P18,1 1700 | 85,P19,0 1701 | 85,P20,0 1702 | 86,P1,0 1703 | 86,P2,0 1704 | 86,P3,1 1705 | 86,P4,1 1706 | 86,P5,1 1707 | 86,P6,1 1708 | 86,P7,1 1709 | 86,P8,0 1710 | 86,P9,0 1711 | 86,P10,0 1712 | 86,P11,1 1713 | 86,P12,0 1714 | 86,P13,0 1715 | 86,P14,1 1716 | 86,P15,0 1717 | 86,P16,0 1718 | 86,P17,1 1719 | 86,P18,0 1720 | 86,P19,1 1721 | 86,P20,0 1722 | 87,P1,0 1723 | 87,P2,1 1724 | 87,P3,1 1725 | 87,P4,1 1726 | 87,P5,0 1727 | 87,P6,0 1728 | 87,P7,0 1729 | 87,P8,0 1730 | 87,P9,0 1731 | 87,P10,0 1732 | 87,P11,0 1733 | 87,P12,0 1734 | 87,P13,1 1735 | 87,P14,0 1736 | 87,P15,0 1737 | 87,P16,0 1738 | 87,P17,1 1739 | 87,P18,0 1740 | 87,P19,0 1741 | 87,P20,0 1742 | 88,P1,1 1743 | 88,P2,0 1744 | 88,P3,0 1745 | 88,P4,0 1746 | 88,P5,1 1747 | 88,P6,1 1748 | 88,P7,0 1749 | 88,P8,0 1750 | 88,P9,1 1751 | 88,P10,0 1752 | 88,P11,1 1753 | 88,P12,0 1754 | 88,P13,1 1755 | 88,P14,0 1756 | 88,P15,0 1757 | 88,P16,1 1758 | 88,P17,1 1759 | 88,P18,0 1760 | 88,P19,0 1761 | 88,P20,0 1762 | 89,P1,0 1763 | 89,P2,0 1764 | 89,P3,1 1765 | 89,P4,0 1766 | 89,P5,0 1767 | 89,P6,1 1768 | 89,P7,1 1769 | 89,P8,0 1770 | 89,P9,0 1771 | 89,P10,0 1772 | 89,P11,0 1773 | 89,P12,1 1774 | 89,P13,0 1775 | 89,P14,1 1776 | 89,P15,0 1777 | 89,P16,0 1778 | 89,P17,0 1779 | 89,P18,0 1780 | 89,P19,0 1781 | 89,P20,0 1782 | 90,P1,0 1783 | 90,P2,0 1784 | 90,P3,1 1785 | 90,P4,0 1786 | 90,P5,0 1787 | 90,P6,0 1788 | 90,P7,0 1789 | 90,P8,1 1790 | 90,P9,1 1791 | 90,P10,1 1792 | 90,P11,1 1793 | 90,P12,0 1794 | 90,P13,1 1795 | 90,P14,1 1796 | 90,P15,1 1797 | 90,P16,1 1798 | 90,P17,0 1799 | 90,P18,1 1800 | 90,P19,0 1801 | 90,P20,0 1802 | 91,P1,1 1803 | 91,P2,0 1804 | 91,P3,0 1805 | 91,P4,0 1806 | 91,P5,0 1807 | 91,P6,1 1808 | 91,P7,0 
1809 | 91,P8,0 1810 | 91,P9,1 1811 | 91,P10,0 1812 | 91,P11,0 1813 | 91,P12,1 1814 | 91,P13,1 1815 | 91,P14,0 1816 | 91,P15,0 1817 | 91,P16,1 1818 | 91,P17,1 1819 | 91,P18,1 1820 | 91,P19,1 1821 | 91,P20,0 1822 | 92,P1,1 1823 | 92,P2,1 1824 | 92,P3,1 1825 | 92,P4,0 1826 | 92,P5,0 1827 | 92,P6,0 1828 | 92,P7,0 1829 | 92,P8,1 1830 | 92,P9,1 1831 | 92,P10,1 1832 | 92,P11,0 1833 | 92,P12,0 1834 | 92,P13,1 1835 | 92,P14,0 1836 | 92,P15,0 1837 | 92,P16,0 1838 | 92,P17,0 1839 | 92,P18,1 1840 | 92,P19,0 1841 | 92,P20,1 1842 | 93,P1,0 1843 | 93,P2,0 1844 | 93,P3,0 1845 | 93,P4,0 1846 | 93,P5,0 1847 | 93,P6,0 1848 | 93,P7,0 1849 | 93,P8,0 1850 | 93,P9,0 1851 | 93,P10,0 1852 | 93,P11,1 1853 | 93,P12,0 1854 | 93,P13,1 1855 | 93,P14,0 1856 | 93,P15,1 1857 | 93,P16,0 1858 | 93,P17,0 1859 | 93,P18,0 1860 | 93,P19,0 1861 | 93,P20,0 1862 | 94,P1,1 1863 | 94,P2,0 1864 | 94,P3,0 1865 | 94,P4,1 1866 | 94,P5,0 1867 | 94,P6,0 1868 | 94,P7,1 1869 | 94,P8,1 1870 | 94,P9,0 1871 | 94,P10,0 1872 | 94,P11,0 1873 | 94,P12,0 1874 | 94,P13,1 1875 | 94,P14,0 1876 | 94,P15,0 1877 | 94,P16,0 1878 | 94,P17,0 1879 | 94,P18,0 1880 | 94,P19,0 1881 | 94,P20,0 1882 | 95,P1,1 1883 | 95,P2,0 1884 | 95,P3,0 1885 | 95,P4,0 1886 | 95,P5,0 1887 | 95,P6,0 1888 | 95,P7,0 1889 | 95,P8,0 1890 | 95,P9,1 1891 | 95,P10,0 1892 | 95,P11,1 1893 | 95,P12,1 1894 | 95,P13,1 1895 | 95,P14,1 1896 | 95,P15,1 1897 | 95,P16,1 1898 | 95,P17,0 1899 | 95,P18,0 1900 | 95,P19,0 1901 | 95,P20,0 1902 | 96,P1,1 1903 | 96,P2,0 1904 | 96,P3,0 1905 | 96,P4,0 1906 | 96,P5,0 1907 | 96,P6,0 1908 | 96,P7,0 1909 | 96,P8,0 1910 | 96,P9,0 1911 | 96,P10,0 1912 | 96,P11,1 1913 | 96,P12,1 1914 | 96,P13,1 1915 | 96,P14,0 1916 | 96,P15,0 1917 | 96,P16,0 1918 | 96,P17,1 1919 | 96,P18,0 1920 | 96,P19,0 1921 | 96,P20,0 1922 | 97,P1,0 1923 | 97,P2,0 1924 | 97,P3,0 1925 | 97,P4,0 1926 | 97,P5,1 1927 | 97,P6,1 1928 | 97,P7,1 1929 | 97,P8,0 1930 | 97,P9,0 1931 | 97,P10,0 1932 | 97,P11,1 1933 | 97,P12,0 1934 | 97,P13,1 1935 | 97,P14,0 1936 | 97,P15,0 1937 | 97,P16,0 1938 | 97,P17,0 1939 | 97,P18,1 1940 | 97,P19,0 1941 | 97,P20,0 1942 | 98,P1,0 1943 | 98,P2,0 1944 | 98,P3,0 1945 | 98,P4,1 1946 | 98,P5,0 1947 | 98,P6,0 1948 | 98,P7,1 1949 | 98,P8,0 1950 | 98,P9,0 1951 | 98,P10,1 1952 | 98,P11,1 1953 | 98,P12,0 1954 | 98,P13,0 1955 | 98,P14,1 1956 | 98,P15,0 1957 | 98,P16,0 1958 | 98,P17,0 1959 | 98,P18,0 1960 | 98,P19,1 1961 | 98,P20,0 1962 | 99,P1,0 1963 | 99,P2,0 1964 | 99,P3,0 1965 | 99,P4,0 1966 | 99,P5,0 1967 | 99,P6,0 1968 | 99,P7,1 1969 | 99,P8,0 1970 | 99,P9,1 1971 | 99,P10,0 1972 | 99,P11,0 1973 | 99,P12,0 1974 | 99,P13,0 1975 | 99,P14,0 1976 | 99,P15,0 1977 | 99,P16,0 1978 | 99,P17,1 1979 | 99,P18,1 1980 | 99,P19,0 1981 | 99,P20,0 1982 | 100,P1,1 1983 | 100,P2,0 1984 | 100,P3,0 1985 | 100,P4,0 1986 | 100,P5,0 1987 | 100,P6,0 1988 | 100,P7,1 1989 | 100,P8,0 1990 | 100,P9,1 1991 | 100,P10,1 1992 | 100,P11,0 1993 | 100,P12,0 1994 | 100,P13,0 1995 | 100,P14,1 1996 | 100,P15,0 1997 | 100,P16,0 1998 | 100,P17,0 1999 | 100,P18,0 2000 | 100,P19,0 2001 | 100,P20,0 2002 | --------------------------------------------------------------------------------
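
Note (not part of the repository files): the three ch03 CSVs above, multi_table_product.csv, multi_table_log.csv, and multi_table_train.csv, form a small relational sample dataset. Below is a minimal sketch of how they might be joined for feature engineering, assuming pandas, the relative paths used in the dump, and the log column names implied by its rows (user_id, date, event, product_id); the repository's own ch03-03-multi_tables.py remains the authoritative version.

# illustrative sketch only; paths and the aggregation choice are assumptions
import pandas as pd

# load the three sample tables shown above
train = pd.read_csv('input/ch03/multi_table_train.csv')      # user_id, product_id, target
product = pd.read_csv('input/ch03/multi_table_product.csv')  # product_id, product_category, price
log = pd.read_csv('input/ch03/multi_table_log.csv')          # assumed: user_id, date, event, product_id

# attach product attributes to each training row
df = train.merge(product, on='product_id', how='left')

# example aggregate feature from the log: number of 'view' events per (user_id, product_id)
views = (log[log['event'] == 'view']
         .groupby(['user_id', 'product_id'])
         .size()
         .rename('view_count')
         .reset_index())
df = df.merge(views, on=['user_id', 'product_id'], how='left')
df['view_count'] = df['view_count'].fillna(0).astype(int)

print(df.head())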