├── input ├── ch01-titanic │ └── .gitkeep ├── ch03 │ ├── multi_table_product.csv │ ├── time_series_events.csv │ ├── time_series_wide.csv │ ├── multi_table_log.csv │ └── multi_table_train.csv ├── readme.md └── sample-data │ ├── input_preprocess.py │ └── input_create.py ├── ch04-model-interface ├── input │ └── .gitkeep ├── model │ └── .gitkeep ├── submission │ └── .gitkeep ├── readme.md └── code │ ├── model.py │ ├── model_xgb.py │ ├── run.py │ ├── util.py │ ├── model_nn.py │ └── runner.py ├── misc ├── cover.jpg └── cover_small.jpg ├── ch02 ├── ch02-05-custom-function.py ├── ch02-03-optimize.py ├── ch02-04-optimize-cv.py ├── ch02-02-custom-usage.py └── ch02-01-metrics.py ├── ch03 ├── ch03-03-multi_tables.py ├── ch03-06-reduction-mnist.py ├── ch03-04-time_series.py ├── ch03-05-reduction.py ├── ch03-01-numerical.py └── ch03-02-categorical.py ├── ch06 ├── ch06-02-hopt_xgb.py ├── ch06-05-embedded.py ├── ch06-04-filter.py ├── ch06-06-wrapper.py ├── ch06-01-hopt.py └── ch06-03-hopt_nn.py ├── LICENSE ├── ch04 ├── ch04-05-run_linear.py ├── ch04-03-run_lgb.py ├── ch04-02-run_xgb.py ├── ch04-04-run_nn.py └── ch04-01-introduction.py ├── ch07 ├── ch07-03-adversarial.py ├── ch07-02-blending.py ├── models.py └── ch07-01-stacking.py ├── ch05 ├── ch05-02-timeseries.py └── ch05-01-validation.py ├── readme.md └── ch01 └── ch01-01-titanic.py /input/ch01-titanic/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ch04-model-interface/input/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ch04-model-interface/model/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ch04-model-interface/submission/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /misc/cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmalins/kagglebook/HEAD/misc/cover.jpg -------------------------------------------------------------------------------- /misc/cover_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexmalins/kagglebook/HEAD/misc/cover_small.jpg -------------------------------------------------------------------------------- /input/ch03/multi_table_product.csv: -------------------------------------------------------------------------------- 1 | product_id,product_category,price 2 | P1,C1,550 3 | P2,C1,100 4 | P3,C2,100 5 | P4,C3,100 6 | P5,C1,200 7 | P6,C5,1000 8 | P7,C5,1500 9 | P8,C4,300 10 | P9,C1,200 11 | P10,C1,370 12 | P11,C1,350 13 | P12,C1,300 14 | P13,C2,70 15 | P14,C3,300 16 | P15,C1,400 17 | P16,C1,600 18 | P17,C3,700 19 | P18,C3,600 20 | P19,C5,2500 21 | P20,C5,4000 22 | -------------------------------------------------------------------------------- /input/ch03/time_series_events.csv: -------------------------------------------------------------------------------- 1 | date,event 2 | 2018/1/3,sale 3 | 2018/1/3,conpon 4 | 2018/1/4,points 5 | 2018/1/5,points 6 | 2018/5/3,sale 7 | 2018/5/4,sale 8 | 2018/5/5,sale 9 | 2018/5/6,points 10 | 2018/5/7,points 11 | 2018/5/8,points 12 
| 2018/7/1,conpon 13 | 2018/8/13,points 14 | 2018/8/14,points 15 | 2018/8/15,points 16 | 2018/8/16,points 17 | 2018/8/17,points 18 | 2018/8/30,points 19 | 2018/8/31,points 20 | 2018/9/1,conpon 21 | 2018/10/30,points 22 | 2018/10/31,points 23 | 2018/12/30,sale 24 | 2018/12/30,points 25 | 2018/12/31,points 26 | -------------------------------------------------------------------------------- /ch04-model-interface/readme.md: -------------------------------------------------------------------------------- 1 | ### Chapter 4 section on "class and directory structures for competitions": sample code 2 | 3 | This is the sample code for the section in chapter 4 on "class and directory structures for competitions". 4 | 5 | Input data is from the Kaggle competition [Otto Group Product Classification Challenge](https://www.kaggle.com/c/otto-group-product-classification-challenge/). 6 | The code shows the process of training and making predictions using xgboost and keras. 7 | Refer to https://github.com/puyokw/kaggle_Otto/ to understand the parameters and modelling method. 8 | 9 | Execute the code using the following steps: 10 | 11 | 1. Download the [Data](https://www.kaggle.com/c/otto-group-product-classification-challenge/data) and save it in the `input` folder. 12 | 2. With the `code` folder as the current directory, execute ```python run.py```. 13 | -------------------------------------------------------------------------------- /ch02/ch02-05-custom-function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | # ----------------------------------- 6 | # Optimizing MAE by approximating the metric with a custom objective function 7 | # ----------------------------------- 8 | 9 | # Fair function 10 | def fair(preds, dtrain): 11 | x = preds - dtrain.get_label() # Get residual 12 | c = 1.0 # Parameter of Fair function 13 | den = abs(x) + c # Calculate denominator of gradient formula 14 | grad = c * x / den # Gradient 15 | hess = c * c / den ** 2 # Second derivative 16 | return grad, hess 17 | 18 | 19 | # Pseudo-Huber function 20 | def pseudo_huber(preds, dtrain): 21 | d = preds - dtrain.get_label() # Get residual 22 | delta = 1.0 # Parameter of Pseudo-Huber function 23 | scale = 1 + (d / delta) ** 2 24 | scale_sqrt = np.sqrt(scale) 25 | grad = d / scale_sqrt # Gradient 26 | hess = 1 / scale / scale_sqrt # Second derivative 27 | return grad, hess 28 | -------------------------------------------------------------------------------- /ch03/ch03-03-multi_tables.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Merging data 6 | # ----------------------------------- 7 | # Load the data 8 | train = pd.read_csv('../input/ch03/multi_table_train.csv') 9 | product_master = pd.read_csv('../input/ch03/multi_table_product.csv') 10 | user_log = pd.read_csv('../input/ch03/multi_table_log.csv') 11 | 12 | # ----------------------------------- 13 | # Suppose we have data frames in the format shown in the diagram 14 | # train : Training data (UserID, ProductID, Target value columns etc.) 15 | # product_master: Product data (ProductID, Product information columns etc.) 16 | # user_log : User actions log data (UserID, Columns recording user action data etc.)
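# In addition to the per-user row count appended below, other per-user statistics can be
# merged onto the training data in the same way. The sketch below is only illustrative:
# the numeric column name 'value' is hypothetical (it is not part of the sample data),
# so the code is guarded to run only if such a column actually exists in user_log.
if 'value' in user_log.columns:
    user_log_stats = user_log.groupby('user_id')['value'].agg(['mean', 'max', 'sum']).reset_index()
    user_log_stats.columns = ['user_id', 'value_mean', 'value_max', 'value_sum']
    train = train.merge(user_log_stats, on='user_id', how='left')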
17 | 18 | # Combine the product data and training data 19 | train = train.merge(product_master, on='product_id', how='left') 20 | 21 | # Count the log rows for each user, and append the count to the training data 22 | user_log_agg = user_log.groupby('user_id').size().reset_index().rename(columns={0: 'user_count'}) 23 | train = train.merge(user_log_agg, on='user_id', how='left') 24 | -------------------------------------------------------------------------------- /ch06/ch06-02-hopt_xgb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hyperopt import hp 3 | 4 | # ----------------------------------- 5 | # Example of xgboost parameter space 6 | # ----------------------------------- 7 | 8 | # Baseline parameters 9 | params = { 10 | 'booster': 'gbtree', 11 | 'objective': 'binary:logistic', 12 | 'eta': 0.1, 13 | 'gamma': 0.0, 14 | 'alpha': 0.0, 15 | 'lambda': 1.0, 16 | 'min_child_weight': 1, 17 | 'max_depth': 5, 18 | 'subsample': 0.8, 19 | 'colsample_bytree': 0.8, 20 | 'random_state': 71, 21 | } 22 | 23 | # Parameter search space 24 | param_space = { 25 | 'min_child_weight': hp.loguniform('min_child_weight', np.log(0.1), np.log(10)), 26 | 'max_depth': hp.quniform('max_depth', 3, 9, 1), 27 | 'subsample': hp.quniform('subsample', 0.6, 0.95, 0.05), 28 | 'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 0.95, 0.05), 29 | 'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)), 30 | # If there is enough leeway, tune alpha and lambda as well 31 | # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)), 32 | # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0)), 33 | } 34 | -------------------------------------------------------------------------------- /ch02/ch02-03-optimize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Optimal threshold 6 | # ----------------------------------- 7 | from sklearn.metrics import f1_score 8 | from scipy.optimize import minimize 9 | 10 | # Preparations for creating sample data 11 | rand = np.random.RandomState(seed=71) 12 | train_y_prob = np.linspace(0, 1.0, 10000) 13 | 14 | # Assume that the true and predicted values are train_y and train_pred_prob, respectively 15 | train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob) 16 | train_pred_prob = np.clip(train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0) 17 | 18 | # When the threshold is 0.5, F1 is 0.722 19 | init_threshold = 0.5 20 | init_score = f1_score(train_y, train_pred_prob >= init_threshold) 21 | print(init_threshold, init_score) 22 | 23 | 24 | # Objective function for optimization 25 | def f1_opt(x): 26 | return -f1_score(train_y, train_pred_prob >= x) 27 | 28 | 29 | # Use the scipy.optimize minimize() function to find the optimal threshold 30 | # F1 is 0.756 when obtained with the optimal threshold 31 | result = minimize(f1_opt, x0=np.array([0.5]), method='Nelder-Mead') 32 | best_threshold = result['x'].item() 33 | best_score = f1_score(train_y, train_pred_prob >= best_threshold) 34 | print(best_threshold, best_score) 35 | -------------------------------------------------------------------------------- /ch04-model-interface/code/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from abc import ABCMeta, abstractmethod 4 | from typing import Optional 5 | 6
| 7 | class Model(metaclass=ABCMeta): 8 | 9 | def __init__(self, run_fold_name: str, params: dict) -> None: 10 | """Constructor 11 | 12 | :param run_fold_name: concatenation of run name and fold number 13 | :param params: hyperparameters 14 | """ 15 | self.run_fold_name = run_fold_name 16 | self.params = params 17 | self.model = None 18 | 19 | @abstractmethod 20 | def train(self, tr_x: pd.DataFrame, tr_y: pd.Series, 21 | va_x: Optional[pd.DataFrame] = None, 22 | va_y: Optional[pd.Series] = None) -> None: 23 | """Perform model training and save trained model 24 | 25 | :param tr_x: Training data features 26 | :param tr_y: Training data target values 27 | :param va_x: Validation data features 28 | :param va_y: Validation data target values 29 | """ 30 | pass 31 | 32 | @abstractmethod 33 | def predict(self, te_x: pd.DataFrame) -> np.array: 34 | """Return predictions from trained model 35 | 36 | :param te_x: Validation data or test data features 37 | :return: Predictions 38 | """ 39 | pass 40 | 41 | @abstractmethod 42 | def save_model(self) -> None: 43 | """Save the model """ 44 | pass 45 | 46 | @abstractmethod 47 | def load_model(self) -> None: 48 | """Load the model""" 49 | pass 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, ghmagazine 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /input/readme.md: -------------------------------------------------------------------------------- 1 | ## Input data 2 | 3 | ### Sample data 4 | 5 | #### Sample data overview 6 | 7 | * Input data for sample code after chapter 2 8 | * Use data from Kaggle competition [Prudential Life Insurance Assessment](https://www.kaggle.com/c/prudential-life-insurance-assessment) 9 | as a reference. Data was made artificially to simulate insurance underwriting data. 
The data construction was simple, so its structure is simpler than real life data. 10 | * Total of training and test data is 10000 lines 11 | 12 | #### Sample data items 13 | 14 | | Column name | Notes | 15 | |:----|:-------| 16 | | age | | 17 | | gender | | 18 | | height | | 19 | | weight | | 20 | | product | product type | 21 | | amount | insurance premium | 22 | | date | application date | 23 | | medical_info_a1/a2/a3 | medical information - continuous variable | 24 | | medical_info_b1/b2/b3 | medical information - continuous and catergorical variables | 25 | | medical_info_c1/c2 | medical information - continuous and catergorical variables | 26 | | medical_keyword_1-10 | medical information - binary variable | 27 | | target | target values (binary) | 28 | 29 | 30 | ### Input data used in chapter 1 (ch01-titanic) 31 | 32 | * From Kaggle competition [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic), save following [data](https://www.kaggle.com/c/titanic/data) 33 | (save into folders as follows: ch01-titanic/train.csv, ch01-titanic/test.csv) 34 | 35 | 36 | ### Input data used in chapter 3 (ch03) 37 | 38 | * Data for explanations on how to combine different tables 39 | * Data for explanations on how to process time series data 40 | 41 | -------------------------------------------------------------------------------- /ch04-model-interface/code/model_xgb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import xgboost as xgb 6 | 7 | from model import Model 8 | from util import Util 9 | 10 | 11 | class ModelXGB(Model): 12 | 13 | def train(self, tr_x, tr_y, va_x=None, va_y=None): 14 | 15 | # Set the data 16 | validation = va_x is not None 17 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 18 | if validation: 19 | dvalid = xgb.DMatrix(va_x, label=va_y) 20 | 21 | # Set the hyperparameters 22 | params = dict(self.params) 23 | num_round = params.pop('num_round') 24 | 25 | # Train 26 | if validation: 27 | early_stopping_rounds = params.pop('early_stopping_rounds') 28 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 29 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist, 30 | early_stopping_rounds=early_stopping_rounds) 31 | else: 32 | watchlist = [(dtrain, 'train')] 33 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 34 | 35 | def predict(self, te_x): 36 | dtest = xgb.DMatrix(te_x) 37 | return self.model.predict(dtest, ntree_limit=self.model.best_ntree_limit) 38 | 39 | def save_model(self): 40 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.model') 41 | os.makedirs(os.path.dirname(model_path), exist_ok=True) 42 | # To prevent loss of best_ntree_limit model, save model using pickle 43 | Util.dump(self.model, model_path) 44 | 45 | def load_model(self): 46 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.model') 47 | self.model = Util.load(model_path) 48 | -------------------------------------------------------------------------------- /ch04/ch04-05-run_linear.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | # Load one-hot encoded data 10 | 11 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 12 | train_x = train.drop(['target'], axis=1) 13 | train_y = train['target'] 14 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 15 | 16 | # Split the training data into training and validation data 17 | from sklearn.model_selection import KFold 18 | 19 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 20 | tr_idx, va_idx = list(kf.split(train_x))[0] 21 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 22 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 23 | 24 | # ----------------------------------- 25 | # Linear model implementation 26 | # ----------------------------------- 27 | from sklearn.linear_model import LogisticRegression 28 | from sklearn.metrics import log_loss 29 | from sklearn.preprocessing import StandardScaler 30 | 31 | # Data scaling 32 | scaler = StandardScaler() 33 | tr_x = scaler.fit_transform(tr_x) 34 | va_x = scaler.transform(va_x) 35 | test_x = scaler.transform(test_x) 36 | 37 | # Construction and training of linear model 38 | model = LogisticRegression(C=1.0) 39 | model.fit(tr_x, tr_y) 40 | 41 | # Check score for validation data 42 | # Use predict_proba() to output probabilities. (predict() outputs binary class predictions) 43 | va_pred = model.predict_proba(va_x) 44 | score = log_loss(va_y, va_pred) 45 | print(f'logloss: {score:.4f}') 46 | 47 | # Predictions 48 | pred = model.predict(test_x) 49 | -------------------------------------------------------------------------------- /ch06/ch06-05-embedded.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # --------------------------------- 5 | # Importance of random forest features 6 | # --------------------------------- 7 | # train_x is training data, train_y is target values 8 | # Cannot deal with missing values so read data with missing values already imputed 9 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 10 | train_x = train.drop(['target'], axis=1) 11 | train_y = train['target'] 12 | # --------------------------------- 13 | from sklearn.ensemble import RandomForestClassifier 14 | 15 | # Random forest 16 | clf = RandomForestClassifier(n_estimators=10, random_state=71) 17 | clf.fit(train_x, train_y) 18 | fi = clf.feature_importances_ 19 | 20 | # Output in order to top importance 21 | idx = np.argsort(fi)[::-1] 22 | top_cols, top_importances = train_x.columns.values[idx][:5], fi[idx][:5] 23 | print('random forest importance') 24 | print(top_cols, top_importances) 25 | 26 | # --------------------------------- 27 | # Importance of xgboost features 28 | # --------------------------------- 29 | # train_x is training data, train_y is target values 30 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 31 | train_x = train.drop(['target'], axis=1) 32 | train_y = train['target'] 33 | # --------------------------------- 34 | import xgboost as xgb 35 | 36 | # xgboost 37 | dtrain = xgb.DMatrix(train_x, label=train_y) 38 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 39 | num_round = 50 40 | model = xgb.train(params, dtrain, num_round) 41 | 42 | # 
Output in order to top importance 43 | fscore = model.get_score(importance_type='total_gain') 44 | fscore = sorted([(k, v) for k, v in fscore.items()], key=lambda tpl: tpl[1], reverse=True) 45 | print('xgboost importance') 46 | print(fscore[:5]) 47 | -------------------------------------------------------------------------------- /ch04-model-interface/code/run.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from model_nn import ModelNN 5 | from model_xgb import ModelXGB 6 | from runner import Runner 7 | from util import Submission 8 | 9 | if __name__ == '__main__': 10 | 11 | params_xgb = { 12 | 'objective': 'multi:softprob', 13 | 'eval_metric': 'mlogloss', 14 | 'num_class': 9, 15 | 'max_depth': 12, 16 | 'eta': 0.1, 17 | 'min_child_weight': 10, 18 | 'subsample': 0.9, 19 | 'colsample_bytree': 0.8, 20 | 'silent': 1, 21 | 'random_state': 71, 22 | 'num_round': 10000, 23 | 'early_stopping_rounds': 10, 24 | } 25 | 26 | params_xgb_all = dict(params_xgb) 27 | params_xgb_all['num_round'] = 350 28 | 29 | params_nn = { 30 | 'layers': 3, 31 | # Setting so this sample code executes quickly 32 | 'nb_epoch': 5, # 1000 33 | 'patience': 10, 34 | 'dropout': 0.5, 35 | 'units': 512, 36 | } 37 | 38 | # Specify features 39 | features = [f'feat_{i}' for i in range(1, 94)] 40 | 41 | # Train and predict using xgboost 42 | runner = Runner('xgb1', ModelXGB, features, params_xgb) 43 | runner.run_train_cv() 44 | runner.run_predict_cv() 45 | Submission.create_submission('xgb1') 46 | 47 | # Train and predict using neural network 48 | runner = Runner('nn1', ModelNN, features, params_nn) 49 | runner.run_train_cv() 50 | runner.run_predict_cv() 51 | Submission.create_submission('nn1') 52 | 53 | ''' 54 | # (For reference) Train and predict using xgboost on all training data 55 | runner = Runner('xgb1-train-all', ModelXGB, features, params_xgb_all) 56 | runner.run_train_all() 57 | runner.run_test_all() 58 | Submission.create_submission('xgb1-train-all') 59 | ''' 60 | -------------------------------------------------------------------------------- /ch02/ch02-04-optimize-cv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Optimization of out-of-fold threshold 6 | # ----------------------------------- 7 | from scipy.optimize import minimize 8 | from sklearn.metrics import f1_score 9 | from sklearn.model_selection import KFold 10 | 11 | # Prepartions for creating sample data 12 | rand = np.random.RandomState(seed=71) 13 | train_y_prob = np.linspace(0, 1.0, 10000) 14 | 15 | # Assume that the true and predicted values are train_y and train_pred_prob, respectively 16 | train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob) 17 | train_pred_prob = np.clip(train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0) 18 | 19 | # Find thresholds using cross validation framework 20 | thresholds = [] 21 | scores_tr = [] 22 | scores_va = [] 23 | 24 | kf = KFold(n_splits=4, random_state=71, shuffle=True) 25 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_pred_prob)): 26 | tr_pred_prob, va_pred_prob = train_pred_prob[tr_idx], train_pred_prob[va_idx] 27 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 28 | 29 | # Objective function for optimization 30 | def f1_opt(x): 31 | return -f1_score(tr_y, tr_pred_prob >= x) 32 | 33 | # Optimize thresholds with training data, 
and evaluate with validation data 34 | result = minimize(f1_opt, x0=np.array([0.5]), method='Nelder-Mead') 35 | threshold = result['x'].item() 36 | score_tr = f1_score(tr_y, tr_pred_prob >= threshold) 37 | score_va = f1_score(va_y, va_pred_prob >= threshold) 38 | print(threshold, score_tr, score_va) 39 | 40 | thresholds.append(threshold) 41 | scores_tr.append(score_tr) 42 | scores_va.append(score_va) 43 | 44 | # Apply mean of the fold thresholds to the test data 45 | threshold_test = np.mean(thresholds) 46 | print(threshold_test) 47 | -------------------------------------------------------------------------------- /ch04/ch04-03-run_lgb.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (also possible to use numpy arrays)) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split the training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # ----------------------------------- 24 | # lightgbm implementation 25 | # ----------------------------------- 26 | import lightgbm as lgb 27 | from sklearn.metrics import log_loss 28 | 29 | # Change the features and target values into format suitable for lightgbm 30 | lgb_train = lgb.Dataset(tr_x, tr_y) 31 | lgb_eval = lgb.Dataset(va_x, va_y) 32 | 33 | # Set the hyperparameters 34 | params = {'objective': 'binary', 'seed': 71, 'verbose': 0, 'metrics': 'binary_logloss'} 35 | num_round = 100 36 | 37 | # Perform training 38 | # Specify categorical features as a parameter 39 | # Pass the validation data to the model, and monitor how the score changes during training 40 | categorical_features = ['product', 'medical_info_b2', 'medical_info_b3'] 41 | model = lgb.train(params, lgb_train, num_boost_round=num_round, 42 | categorical_feature=categorical_features, 43 | valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval]) 44 | 45 | # Check score for validation data 46 | va_pred = model.predict(va_x) 47 | score = log_loss(va_y, va_pred) 48 | print(f'logloss: {score:.4f}') 49 | 50 | # Predictions 51 | pred = model.predict(test_x) 52 | -------------------------------------------------------------------------------- /ch07/ch07-03-adversarial.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # Data creation (just random data) 8 | rand = np.random.RandomState(71) 9 | train_x = pd.DataFrame(rand.uniform(0.0, 1.0, (10000, 2)), columns=['model1', 'model2']) 10 | adv_train = pd.Series(rand.uniform(0.0, 1.0, 10000)) 11 | w = np.array([0.3, 0.7]).reshape(1, -1) 12 | train_y = pd.Series((train_x.values * w).sum(axis=1) > 0.5) 13 | 14 | # --------------------------------- 15 | # adversarial stochastic blending 16 | # ---------------------------------- 17 | # Use adversarial validation to calculate weights for averaging predicted values from models 18 | # train_x: Predicted probabilities from each model (actually using results that have been ordered) 19 | # train_y: Target values 20 | # adv_train: Values that represent likelihood that training data was also test data 21 | 22 | from scipy.optimize import minimize 23 | from sklearn.metrics import roc_auc_score 24 | 25 | n_sampling = 50 # Number of times to sample 26 | frac_sampling = 0.5 # Fraction of training data to take when sampling 27 | 28 | 29 | def score(x, data_x, data_y): 30 | # Use AUC as evaluation metric 31 | y_prob = data_x['model1'] * x + data_x['model2'] * (1 - x) 32 | return -roc_auc_score(data_y, y_prob) 33 | 34 | 35 | # Repeatedly use sampling to calculate weights for weighted averaging 36 | results = [] 37 | for i in range(n_sampling): 38 | # Perform sampling 39 | seed = i 40 | idx = pd.Series(np.arange(len(train_y))).sample(frac=frac_sampling, replace=False, 41 | random_state=seed, weights=adv_train) 42 | x_sample = train_x.iloc[idx] 43 | y_sample = train_y.iloc[idx] 44 | 45 | # Want to use sampling data to find most optimum weights for weighted averaging 46 | # As there are constraints use the COBYLA algorithm 47 | init_x = np.array(0.5) 48 | constraints = ( 49 | {'type': 'ineq', 'fun': lambda x: x}, 50 | {'type': 'ineq', 'fun': lambda x: 1.0 - x}, 51 | ) 52 | result = minimize(score, x0=init_x, 53 | args=(x_sample, y_sample), 54 | constraints=constraints, 55 | method='COBYLA') 56 | results.append((result.x, 1.0 - result.x)) 57 | 58 | # Weights for model1 and model2 weighted averaging 59 | results = np.array(results) 60 | w_model1, w_model2 = results.mean(axis=0) 61 | -------------------------------------------------------------------------------- /ch04/ch04-02-run_xgb.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split the training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # ----------------------------------- 24 | # xgboost implementation 25 | # ----------------------------------- 26 | import xgboost as xgb 27 | from sklearn.metrics import log_loss 28 | 29 | # Change the features and target values into format suitable for xgboost 30 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 31 | dvalid = xgb.DMatrix(va_x, label=va_y) 32 | dtest = xgb.DMatrix(test_x) 33 | 34 | # Set the hyperparameters 35 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 36 | num_round = 50 37 | 38 | # Train the model 39 | # Pass the validation data to the model, and monitor how the score changes during training 40 | # In watchlist put the training and validation data 41 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 42 | model = xgb.train(params, dtrain, num_round, evals=watchlist) 43 | 44 | # Check the score using the validation data 45 | va_pred = model.predict(dvalid) 46 | score = log_loss(va_y, va_pred) 47 | print(f'logloss: {score:.4f}') 48 | 49 | # Output prediction (not a binary value but a probability) 50 | pred = model.predict(dtest) 51 | 52 | # ----------------------------------- 53 | # Monitor the scores for the training and validation data 54 | # ----------------------------------- 55 | # Monitor the logless metric, set number of early stopping rounds to 20 56 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71, 57 | 'eval_metric': 'logloss'} 58 | num_round = 500 59 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 60 | model = xgb.train(params, dtrain, num_round, evals=watchlist, 61 | early_stopping_rounds=20) 62 | 63 | # Use the optimal decision tree to make predictions 64 | pred = model.predict(dtest, ntree_limit=model.best_ntree_limit) 65 | -------------------------------------------------------------------------------- /ch04-model-interface/code/util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.externals import joblib 8 | 9 | 10 | class Util: 11 | 12 | @classmethod 13 | def dump(cls, value, path): 14 | os.makedirs(os.path.dirname(path), exist_ok=True) 15 | joblib.dump(value, path, compress=True) 16 | 17 | @classmethod 18 | def load(cls, path): 19 | return joblib.load(path) 20 | 21 | 22 | class Logger: 23 | 24 | def __init__(self): 25 | self.general_logger = logging.getLogger('general') 26 | self.result_logger = logging.getLogger('result') 27 | stream_handler = logging.StreamHandler() 28 | file_general_handler = logging.FileHandler('../model/general.log') 29 | file_result_handler = logging.FileHandler('../model/result.log') 30 | if 
len(self.general_logger.handlers) == 0: 31 | self.general_logger.addHandler(stream_handler) 32 | self.general_logger.addHandler(file_general_handler) 33 | self.general_logger.setLevel(logging.INFO) 34 | self.result_logger.addHandler(stream_handler) 35 | self.result_logger.addHandler(file_result_handler) 36 | self.result_logger.setLevel(logging.INFO) 37 | 38 | def info(self, message): 39 | # Output time to console and log 40 | self.general_logger.info('[{}] - {}'.format(self.now_string(), message)) 41 | 42 | def result(self, message): 43 | self.result_logger.info(message) 44 | 45 | def result_ltsv(self, dic): 46 | self.result(self.to_ltsv(dic)) 47 | 48 | def result_scores(self, run_name, scores): 49 | # Output calculation results to console and results log 50 | dic = dict() 51 | dic['name'] = run_name 52 | dic['score'] = np.mean(scores) 53 | for i, score in enumerate(scores): 54 | dic[f'score{i}'] = score 55 | self.result(self.to_ltsv(dic)) 56 | 57 | def now_string(self): 58 | return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) 59 | 60 | def to_ltsv(self, dic): 61 | return '\t'.join(['{}:{}'.format(key, value) for key, value in dic.items()]) 62 | 63 | 64 | class Submission: 65 | 66 | @classmethod 67 | def create_submission(cls, run_name): 68 | submission = pd.read_csv('../input/sampleSubmission.csv') 69 | pred = Util.load(f'../model/pred/{run_name}-test.pkl') 70 | for i in range(pred.shape[1]): 71 | submission[f'Class_{i + 1}'] = pred[:, i] 72 | submission.to_csv(f'../submission/{run_name}.csv', index=False) 73 | -------------------------------------------------------------------------------- /ch07/ch07-02-blending.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Data for neural network 16 | train_nn = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 17 | train_x_nn = train_nn.drop(['target'], axis=1) 18 | train_y_nn = train_nn['target'] 19 | test_x_nn = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 20 | 21 | # --------------------------------- 22 | # Ensemble using predictions from hold-out data 23 | # ---------------------------------- 24 | from sklearn.metrics import log_loss 25 | from sklearn.model_selection import KFold 26 | 27 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 28 | tr_idx, va_index = list(kf.split(train_x))[0] 29 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_index] 30 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_index] 31 | tr_x_nn, va_x_nn = train_x_nn.iloc[tr_idx], train_x_nn.iloc[va_index] 32 | 33 | # Assume Model1_1, Model1_2 and Model2 are defined in models.py 34 | # For each class train using fit and output prediction probabilities using predict 35 | from models import Model1Xgb, Model1NN, Model2Linear 36 | 37 | # First level model 38 | # Train using training data, output predictions for hold-out and test data 39 | model_1a = Model1Xgb() 40 | model_1a.fit(tr_x, tr_y, va_x, va_y) 41 | va_pred_1a = model_1a.predict(va_x) 42 | test_pred_1a = model_1a.predict(test_x) 43 | 44 | model_1b = Model1NN() 45 | model_1b.fit(tr_x_nn, tr_y, va_x_nn, va_y) 46 | va_pred_1b = model_1b.predict(va_x_nn) 47 | test_pred_1b = model_1b.predict(test_x_nn) 48 | 49 | # Score when using hold-out data 50 | print(f'logloss: {log_loss(va_y, va_pred_1a, eps=1e-7):.4f}') 51 | print(f'logloss: {log_loss(va_y, va_pred_1b, eps=1e-7):.4f}') 52 | 53 | # Make predictions from hold-out and test data a feature and create data frame 54 | va_x_2 = pd.DataFrame({'pred_1a': va_pred_1a, 'pred_1b': va_pred_1b}) 55 | test_x_2 = pd.DataFrame({'pred_1a': test_pred_1a, 'pred_1b': test_pred_1b}) 56 | 57 | # Second level model 58 | # Trained using all hold-out data so cannot evaluate score 59 | # In order to score, a method further cross validating hold-out data can be considered 60 | model2 = Model2Linear() 61 | model2.fit(va_x_2, va_y, None, None) 62 | pred_test_2 = model2.predict(test_x_2) 63 | -------------------------------------------------------------------------------- /ch07/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Dropout 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | # Suppress tensorflow warnings 10 | import os 11 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 12 | import tensorflow as tf 13 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 14 | 15 | 16 | # xgboost model 17 | class Model1Xgb: 18 | 19 | def __init__(self): 20 | self.model = None 21 | 22 | def fit(self, tr_x, tr_y, va_x, va_y): 23 | params = {'objective': 'binary:logistic', 
'silent': 1, 'random_state': 71, 24 | 'eval_metric': 'logloss'} 25 | num_round = 10 26 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 27 | dvalid = xgb.DMatrix(va_x, label=va_y) 28 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 29 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 30 | 31 | def predict(self, x): 32 | data = xgb.DMatrix(x) 33 | pred = self.model.predict(data) 34 | return pred 35 | 36 | 37 | # Neural network model 38 | class Model1NN: 39 | 40 | def __init__(self): 41 | self.model = None 42 | self.scaler = None 43 | 44 | def fit(self, tr_x, tr_y, va_x, va_y): 45 | self.scaler = StandardScaler() 46 | self.scaler.fit(tr_x) 47 | 48 | batch_size = 128 49 | epochs = 10 50 | 51 | tr_x = self.scaler.transform(tr_x) 52 | va_x = self.scaler.transform(va_x) 53 | model = Sequential() 54 | model.add(Dense(256, activation='relu', input_shape=(tr_x.shape[1],))) 55 | model.add(Dropout(0.2)) 56 | model.add(Dense(256, activation='relu')) 57 | model.add(Dropout(0.2)) 58 | model.add(Dense(1, activation='sigmoid')) 59 | 60 | model.compile(loss='binary_crossentropy', optimizer='adam') 61 | 62 | history = model.fit(tr_x, tr_y, 63 | batch_size=batch_size, epochs=epochs, 64 | verbose=1, validation_data=(va_x, va_y)) 65 | self.model = model 66 | 67 | def predict(self, x): 68 | x = self.scaler.transform(x) 69 | pred = self.model.predict_proba(x).reshape(-1) 70 | return pred 71 | 72 | 73 | # Linear model 74 | class Model2Linear: 75 | 76 | def __init__(self): 77 | self.model = None 78 | self.scaler = None 79 | 80 | def fit(self, tr_x, tr_y, va_x, va_y): 81 | self.scaler = StandardScaler() 82 | self.scaler.fit(tr_x) 83 | tr_x = self.scaler.transform(tr_x) 84 | self.model = LogisticRegression(solver='lbfgs', C=1.0) 85 | self.model.fit(tr_x, tr_y) 86 | 87 | def predict(self, x): 88 | x = self.scaler.transform(x) 89 | pred = self.model.predict_proba(x)[:, 1] 90 | return pred 91 | -------------------------------------------------------------------------------- /ch02/ch02-02-custom-usage.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y contains the target values, test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | from sklearn.model_selection import KFold 16 | 17 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 18 | tr_idx, va_idx = list(kf.split(train_x))[0] 19 | 20 | # Split the training data into training and validation data 21 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 22 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 23 | 24 | # ----------------------------------- 25 | # Examples of custom metrics and objective functions in xgboost 26 | # (Reference) https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py 27 | # ----------------------------------- 28 | import xgboost as xgb 29 | from sklearn.metrics import log_loss 30 | 31 | # Convert features and target values into xgboost data structure 32 | # Training features and target values are tr_x and tr_y, validation features and target values are va_x and va_y 33 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 34 | dvalid = xgb.DMatrix(va_x, label=va_y) 35 | 36 | 37 | # Custom objective function (logloss in this case, which is equivalent to xgboost's 'binary:logistic') 38 | def logregobj(preds, dtrain): 39 | labels = dtrain.get_label() # Get labels of true values 40 | preds = 1.0 / (1.0 + np.exp(-preds)) # Sigmoid function 41 | grad = preds - labels # Gradient 42 | hess = preds * (1.0 - preds) # Second derivative 43 | return grad, hess 44 | 45 | 46 | # Custom metric (error rate in this case) 47 | def evalerror(preds, dtrain): 48 | labels = dtrain.get_label() # Get labels of true values 49 | return 'custom-error', float(sum(labels != (preds > 0.0))) / len(labels) 50 | 51 | 52 | # Set hyperparameters 53 | params = {'silent': 1, 'random_state': 71} 54 | num_round = 50 55 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 56 | 57 | # Train the model 58 | bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror) 59 | 60 | # Unlike when binary:logistic is specified as the objective function, 61 | # the output values are not probabilities, so they need to be converted 62 | pred_val = bst.predict(dvalid) 63 | pred = 1.0 / (1.0 + np.exp(-pred_val)) 64 | logloss = log_loss(va_y, pred) 65 | print(logloss) 66 | 67 | # For reference, results from the normal training method 68 | params = {'silent': 1, 'random_state': 71, 'objective': 'binary:logistic'} 69 | bst = xgb.train(params, dtrain, num_round, watchlist) 70 | 71 | pred = bst.predict(dvalid) 72 | logloss = log_loss(va_y, pred) 73 | print(logloss) 74 | -------------------------------------------------------------------------------- /ch05/ch05-02-timeseries.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc.
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # As time-series data assume a period variable is set that changes with time 16 | train_x['period'] = np.arange(0, len(train_x)) // (len(train_x) // 4) 17 | train_x['period'] = np.clip(train_x['period'], 0, 3) 18 | test_x['period'] = 4 19 | 20 | # ----------------------------------- 21 | # Hold-out method for time-series data 22 | # ----------------------------------- 23 | # Partition using the period variable as the basis (0 to 3 are the training data, 4 is the test data) 24 | # Here for within the training data period 3 is used for validation and periods 0 to 2 are used for training 25 | is_tr = train_x['period'] < 3 26 | is_va = train_x['period'] == 3 27 | tr_x, va_x = train_x[is_tr], train_x[is_va] 28 | tr_y, va_y = train_y[is_tr], train_y[is_va] 29 | 30 | # ----------------------------------- 31 | # Cross validation for time-series data (use method that follows time) 32 | # ----------------------------------- 33 | # Partition using the period variable as the basis (0 to 3 are the training data, 4 is the test data) 34 | # Periods 1, 2 and 3 are each used for cross-validation, and the preceding periods are used for training 35 | 36 | va_period_list = [1, 2, 3] 37 | for va_period in va_period_list: 38 | is_tr = train_x['period'] < va_period 39 | is_va = train_x['period'] == va_period 40 | tr_x, va_x = train_x[is_tr], train_x[is_va] 41 | tr_y, va_y = train_y[is_tr], train_y[is_va] 42 | 43 | # (For reference) Using TimeSeriesSplit() function is difficult as only the order of the data can be used 44 | from sklearn.model_selection import TimeSeriesSplit 45 | 46 | tss = TimeSeriesSplit(n_splits=4) 47 | for tr_idx, va_idx in tss.split(train_x): 48 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 49 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 50 | 51 | # ----------------------------------- 52 | # Cross validation for time-series data (method to simply partition by time) 53 | # ----------------------------------- 54 | # Partition using the period variable as the basis (0 to 3 are the training data, 4 is the test data) 55 | # Periods 1, 2 and 3 are each used for cross-validation, and the preceding periods are used for training 56 | 57 | va_period_list = [0, 1, 2, 3] 58 | for va_period in va_period_list: 59 | is_tr = train_x['period'] != va_period 60 | is_va = train_x['period'] == va_period 61 | tr_x, va_x = train_x[is_tr], train_x[is_va] 62 | tr_y, va_y = train_y[is_tr], train_y[is_va] 63 | -------------------------------------------------------------------------------- /ch03/ch03-06-reduction-mnist.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | # Visualization of MNIST data 9 | 10 | # Import MNIST data from keras.datasets 11 | from keras.datasets import mnist 12 | (train_x, train_y), (test_x, test_y) = mnist.load_data() 13 | 14 | # Convert to 2D data 15 | train_x = train_x.reshape(train_x.shape[0], -1) 16 | 17 | # Decrease size by taking only first 1000 data 18 | train_x = pd.DataFrame(train_x[:1000, :]) 19 | train_y = train_y[:1000] 20 | 21 | # ----------------------------------- 22 | # PCA 23 | # ----------------------------------- 24 | from sklearn.decomposition import PCA 25 | 26 | # Fit the PCA transformation by using the training data 27 | pca = PCA() 28 | x_pca = pca.fit_transform(train_x) 29 | 30 | # Plot in 2D, differentiating each class by color 31 | f, ax = plt.subplots(1) 32 | for i in range(10): 33 | mask = train_y == i 34 | plt.scatter(x_pca[mask, 0], x_pca[mask, 1], label=i, s=10, alpha=0.5) 35 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 36 | 37 | plt.show() 38 | 39 | # ----------------------------------- 40 | # LDA (Linear Discriminant Analysis) 41 | # ----------------------------------- 42 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 43 | 44 | # Derive the 2 axes that best split the classes using linear discriminant analysis 45 | lda = LDA(n_components=2) 46 | x_lda = lda.fit_transform(train_x, train_y) 47 | 48 | # Plot in 2D, differentiating each class by color 49 | # Note that the division is good, but this method is using the target values which gives it an advantage over other methods 50 | f, ax = plt.subplots(1) 51 | for i in range(10): 52 | mask = train_y == i 53 | plt.scatter(x_lda[mask, 0], x_lda[mask, 1], label=i, s=10, alpha=0.5) 54 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 55 | 56 | plt.show() 57 | 58 | # ----------------------------------- 59 | # t-sne 60 | # ----------------------------------- 61 | from sklearn.manifold import TSNE 62 | 63 | # Transform using t-sne 64 | tsne = TSNE(n_components=2) 65 | x_tsne = tsne.fit_transform(train_x) 66 | 67 | # Plot in 2D, differentiating each class by color 68 | f, ax = plt.subplots(1) 69 | for i in range(10): 70 | mask = train_y == i 71 | plt.scatter(x_tsne[mask, 0], x_tsne[mask, 1], label=i, s=10, alpha=0.5) 72 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 73 | 74 | plt.show() 75 | 76 | # ----------------------------------- 77 | # UMAP 78 | # ----------------------------------- 79 | import umap 80 | 81 | # Transform using UMAP 82 | um = umap.UMAP() 83 | x_umap = um.fit_transform(train_x) 84 | 85 | # Plot in 2D, differentiating each class by color 86 | f, ax = plt.subplots(1) 87 | for i in range(10): 88 | mask = train_y == i 89 | plt.scatter(x_umap[mask, 0], x_umap[mask, 1], label=i, s=10, alpha=0.5) 90 | ax.legend(bbox_to_anchor=(1.00, 1), loc='upper left') 91 | 92 | plt.show() 93 | -------------------------------------------------------------------------------- /ch06/ch06-04-filter.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 14 | 15 | # --------------------------------- 16 | # Use argsort() to do an index sort 17 | # --------------------------------- 18 | # argsort() returns the indices that sort an array, which can be used for ascending and descending order 19 | ary = np.array([10, 20, 30, 0]) 20 | idx = ary.argsort() 21 | print(idx) # Ascending order - [3 0 1 2] 22 | print(idx[::-1]) # Descending order - [2 1 0 3] 23 | 24 | print(ary[idx[::-1][:3]]) # Output best three - [30, 20, 10] 25 | 26 | # --------------------------------- 27 | # Correlation coefficient 28 | # --------------------------------- 29 | import scipy.stats as st 30 | 31 | # Correlation coefficient 32 | corrs = [] 33 | for c in train_x.columns: 34 | corr = np.corrcoef(train_x[c], train_y)[0, 1] 35 | corrs.append(corr) 36 | corrs = np.array(corrs) 37 | 38 | # Spearman's rank correlation coefficient 39 | corrs_sp = [] 40 | for c in train_x.columns: 41 | corr_sp = st.spearmanr(train_x[c], train_y).correlation 42 | corrs_sp.append(corr_sp) 43 | corrs_sp = np.array(corrs_sp) 44 | 45 | # Output the most important features (top 5 at most) 46 | # Using np.argsort(), you can get the indices of the ordered values 47 | idx = np.argsort(np.abs(corrs))[::-1] 48 | top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5] 49 | print(top_cols, top_importances) 50 | 51 | idx2 = np.argsort(np.abs(corrs_sp))[::-1] 52 | top_cols2, top_importances2 = train_x.columns.values[idx2][:5], corrs_sp[idx2][:5] 53 | print(top_cols2, top_importances2) 54 | 55 | # --------------------------------- 56 | # Chi-square statistic 57 | # --------------------------------- 58 | from sklearn.feature_selection import chi2 59 | from sklearn.preprocessing import MinMaxScaler 60 | 61 | # Chi-square statistic 62 | x = MinMaxScaler().fit_transform(train_x) 63 | c2, _ = chi2(x, train_y) 64 | 65 | # Output the most important features (top 5 at most) 66 | idx = np.argsort(c2)[::-1] 67 | top_cols, top_importances = train_x.columns.values[idx][:5], c2[idx][:5] 68 | print(top_cols, top_importances) 69 | 70 | # --------------------------------- 71 | # Mutual information 72 | # --------------------------------- 73 | from sklearn.feature_selection import mutual_info_classif 74 | 75 | # Mutual information 76 | mi = mutual_info_classif(train_x, train_y) 77 | 78 | # Output the most important features (top 5 at most) 79 | idx = np.argsort(mi)[::-1] 80 | top_cols, top_importances = train_x.columns.values[idx][:5], mi[idx][:5] 81 | print(top_cols, top_importances) 82 | -------------------------------------------------------------------------------- /input/sample-data/input_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.preprocessing import LabelEncoder 4 | 5 | # Read data, concatenate test and train once 6 | df_train = pd.read_csv('train.csv') 7 | df_train['is_train'] = True 8 | df_test = pd.read_csv('test.csv') 9 | df_test['target'] = 0 10 | df_test['is_train'] = False 11 | 12 | df =
pd.concat([df_train, df_test], axis=0) 13 | 14 | # Preprocessing of dates 15 | df['date'] = pd.to_datetime(df['date']) 16 | df['year'] = df['date'].dt.year 17 | df['month'] = df['date'].dt.month 18 | df['day'] = df['date'].dt.day 19 | df['yearmonth'] = df['year'] * 12 + df['month'] 20 | df = df.drop(['date'], axis=1) 21 | 22 | # Different feature types 23 | numerical_features = ['age', 'height', 'weight', 'amount', 'year', 'month', 'yearmonth', 24 | 'medical_info_a1', 25 | 'medical_info_a2', 'medical_info_a3', 'medical_info_b1'] 26 | binary_features = [f'medical_keyword_{i}' for i in range(10)] 27 | categorical_features = ['sex', 'product', 'medical_info_b2', 'medical_info_b3'] 28 | 29 | # Perform Label Encoding for categorical features 30 | for c in categorical_features: 31 | le = LabelEncoder() 32 | df[c] = le.fit_transform(df[c]) 33 | print(f'{c} - {le.classes_}') 34 | 35 | # Move target to last column (for readability) 36 | df = df.reindex(columns=[c for c in df.columns if c != 'target'] + ['target']) 37 | 38 | # Split into train/test and output 39 | train = df[df['is_train']].drop(['is_train'], axis=1).reset_index(drop=True) 40 | test = df[~df['is_train']].drop(['is_train', 'target'], axis=1).reset_index(drop=True) 41 | train.to_csv('train_preprocessed.csv', index=False) 42 | test.to_csv('test_preprocessed.csv', index=False) 43 | 44 | # ---------------------- 45 | # Preprocessing for neural network and linear models 46 | 47 | # Impute missing values 48 | has_nan_features = ['medical_info_c1', 'medical_info_c2'] 49 | for c in has_nan_features: 50 | df[f'{c}_nan'] = df[c].isnull() 51 | df[c].fillna(df[c].mean(), inplace=True) 52 | 53 | # Perform One-hot Encoding 54 | df_onehot = pd.DataFrame(None, index=df.index) 55 | for c in df.columns: 56 | if c in categorical_features and df[c].nunique() > 2: 57 | dummies = pd.get_dummies(df[c], prefix=c) 58 | df_onehot = pd.concat([df_onehot, dummies], axis=1) 59 | print(f'one-hot encoded - {c}') 60 | else: 61 | df_onehot[c] = df[c] 62 | 63 | 64 | # Move target to last column (for readability) 65 | df_onehot = df_onehot.reindex(columns=[c for c in df_onehot.columns if c != 'target'] + ['target']) 66 | 67 | # Split into train/test and output 68 | train_onehot = df_onehot[df_onehot['is_train']].drop(['is_train'], axis=1).reset_index(drop=True) 69 | test_onehot = df_onehot[~df_onehot['is_train']].drop(['is_train', 'target'], axis=1).reset_index(drop=True) 70 | train_onehot.to_csv('train_preprocessed_onehot.csv', index=False) 71 | test_onehot.to_csv('test_preprocessed_onehot.csv', index=False) 72 | -------------------------------------------------------------------------------- /ch04/ch04-04-run_nn.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc.
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | # Load one-hot encoded data 10 | 11 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 12 | train_x = train.drop(['target'], axis=1) 13 | train_y = train['target'] 14 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 15 | 16 | # Split the training data into training and validation data 17 | from sklearn.model_selection import KFold 18 | 19 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 20 | tr_idx, va_idx = list(kf.split(train_x))[0] 21 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 22 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 23 | 24 | # Suppress tensorflow warnings 25 | import os 26 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 27 | import tensorflow as tf 28 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 29 | 30 | # ----------------------------------- 31 | # Neural network implementation 32 | # ----------------------------------- 33 | from keras.layers import Dense, Dropout 34 | from keras.models import Sequential 35 | from sklearn.metrics import log_loss 36 | from sklearn.preprocessing import StandardScaler 37 | 38 | # Data scaling 39 | scaler = StandardScaler() 40 | tr_x = scaler.fit_transform(tr_x) 41 | va_x = scaler.transform(va_x) 42 | test_x = scaler.transform(test_x) 43 | 44 | # Construct the neural network 45 | model = Sequential() 46 | model.add(Dense(256, activation='relu', input_shape=(train_x.shape[1],))) 47 | model.add(Dropout(0.2)) 48 | model.add(Dense(256, activation='relu')) 49 | model.add(Dropout(0.2)) 50 | model.add(Dense(1, activation='sigmoid')) 51 | 52 | model.compile(loss='binary_crossentropy', 53 | optimizer='adam', metrics=['accuracy']) 54 | 55 | # Perform training 56 | # Pass the validation data to the model, and monitor how the score changes during training 57 | batch_size = 128 58 | epochs = 10 59 | history = model.fit(tr_x, tr_y, 60 | batch_size=batch_size, epochs=epochs, 61 | verbose=1, validation_data=(va_x, va_y)) 62 | 63 | # Check score for validation data 64 | va_pred = model.predict(va_x) 65 | score = log_loss(va_y, va_pred, eps=1e-7) 66 | print(f'logloss: {score:.4f}') 67 | 68 | # Predictions 69 | pred = model.predict(test_x) 70 | 71 | # ----------------------------------- 72 | # Early stopping 73 | # ----------------------------------- 74 | from keras.callbacks import EarlyStopping 75 | 76 | # Set number of early stopping rounds to 20 77 | # By setting restore_best_weights, we use the model from the best epoch 78 | epochs = 50 79 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True) 80 | 81 | history = model.fit(tr_x, tr_y, 82 | batch_size=batch_size, epochs=epochs, 83 | verbose=1, validation_data=(va_x, va_y), callbacks=[early_stopping]) 84 | pred = model.predict(test_x) 85 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This is an English translation of the sample code that accompanies the bestselling Japanese Kaggle book "Data Analysis Techniques to Win Kaggle" ([Amazon Japan](https://www.amazon.co.jp/dp/4297108437)). 
4 | PDF copies of the book can be purchased from the [publisher's website](https://gihyo.jp/dp/ebook/2019/978-4-297-10844-1) from anywhere in the world using PayPal. 5 | The book's authors are Daisuke Kadowaki ([threecourse](https://www.kaggle.com/threecourse)), Ryuji Sakata ([Jack](https://www.kaggle.com/rsakata)), Keisuke Hosaka ([hskksk](https://www.kaggle.com/hskksk)) and Yuji Hiramatsu ([maxwell](https://www.kaggle.com/maxwell110)). 6 | It was first published on 9 October 2019 by Gijutsu-Hyohron Co., Ltd (ISBN-13: 978-4297108434). 7 | 8 | Book cover: 9 | 10 | 11 | 12 | ### Contents of each folder 13 | 14 | | Folder | Contents | 15 | |:----|:-------| 16 | | input | Input files | 17 | | ch01 | Sample code for chapter 1 | 18 | | ch02 | Sample code for chapter 2 | 19 | | ch03 | Sample code for chapter 3 | 20 | | ch04 | Sample code for chapter 4 | 21 | | ch05 | Sample code for chapter 5 | 22 | | ch06 | Sample code for chapter 6 | 23 | | ch07 | Sample code for chapter 7 | 24 | | ch04-model-interface | Code for "class and directory structures for competitions" section of chapter 4 | 25 | 26 | * Execute the code with each chapter's folder as the current directory. 27 | * For chapter 1, download the titanic data first as described in [input/readme.md](input/readme.md). 28 | * For the chapter 4 model interface code, refer to [ch04-model-interface/readme.md](ch04-model-interface). 29 | 30 | 31 | ### Requirements 32 | 33 | The sample code has been checked for operability on Google Cloud Platform (GCP) using the following environment. 34 | 35 | * Ubuntu 18.04 LTS 36 | * Anaconda 2019.03 Python 3.7 37 | * Necessary Python packages (check script below) 38 | 39 | Use the following script to set up the GCP environment. 40 | ``` 41 | # utils ----- 42 | 43 | # Install required tools for development 44 | cd ~/ 45 | sudo apt-get update 46 | sudo apt-get install -y git build-essential libatlas-base-dev 47 | sudo apt-get install -y python3-dev 48 | 49 | # anaconda ----- 50 | 51 | # Download and install Anaconda 52 | mkdir lib 53 | wget --quiet https://repo.continuum.io/archive/Anaconda3-2019.03-Linux-x86_64.sh -O lib/anaconda.sh 54 | /bin/bash lib/anaconda.sh -b 55 | 56 | # Add to PATH 57 | echo export PATH=~/anaconda3/bin:$PATH >> ~/.bashrc 58 | source ~/.bashrc 59 | 60 | # python packages ----- 61 | 62 | # Install Python packages 63 | # Use Anaconda 2019.03 default versions for numpy, scipy and pandas 64 | # pip install numpy==1.16.2 65 | # pip install scipy==1.2.1 66 | # pip install pandas==0.24.2 67 | pip install scikit-learn==0.21.2 68 | 69 | pip install xgboost==0.81 70 | pip install lightgbm==2.2.2 71 | pip install tensorflow==1.14.0 72 | pip install keras==2.2.4 73 | pip install hyperopt==0.1.1 74 | pip install bhtsne==0.1.9 75 | pip install rgf_python==3.4.0 76 | pip install umap-learn==0.3.9 77 | 78 | # set backend for matplotlib to Agg ----- 79 | 80 | # To execute on GCP, set the matplotlib backend to Agg 81 | matplotlibrc_path=$(python -c "import site, os, fileinput; packages_dir = site.getsitepackages()[0]; print(os.path.join(packages_dir, 'matplotlib', 'mpl-data', 'matplotlibrc'))") && \ 82 | sed -i 's/^backend : qt5agg/backend : agg/' $matplotlibrc_path 83 | ``` 84 | -------------------------------------------------------------------------------- /ch04-model-interface/code/model_nn.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras.callbacks import EarlyStopping 6 | from 
keras.layers.advanced_activations import PReLU 7 | from keras.layers.core import Activation, Dense, Dropout 8 | from keras.layers.normalization import BatchNormalization 9 | from keras.models import Sequential, load_model 10 | from keras.utils import np_utils 11 | from sklearn.preprocessing import StandardScaler 12 | 13 | from model import Model 14 | from util import Util 15 | 16 | # Suppress tensorflow warnings 17 | import os 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 19 | import tensorflow as tf 20 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 21 | 22 | 23 | class ModelNN(Model): 24 | 25 | def train(self, tr_x, tr_y, va_x=None, va_y=None): 26 | 27 | # Set and scale data 28 | validation = va_x is not None 29 | scaler = StandardScaler() 30 | scaler.fit(tr_x) 31 | tr_x = scaler.transform(tr_x) 32 | tr_y = np_utils.to_categorical(tr_y, num_classes=9) 33 | 34 | if validation: 35 | va_x = scaler.transform(va_x) 36 | va_y = np_utils.to_categorical(va_y, num_classes=9) 37 | 38 | # Parameters 39 | nb_classes = 9 40 | layers = self.params['layers'] 41 | dropout = self.params['dropout'] 42 | units = self.params['units'] 43 | nb_epoch = self.params['nb_epoch'] 44 | patience = self.params['patience'] 45 | 46 | # Construct model 47 | model = Sequential() 48 | model.add(Dense(units, input_shape=(tr_x.shape[1],))) 49 | model.add(PReLU()) 50 | model.add(BatchNormalization()) 51 | model.add(Dropout(dropout)) 52 | 53 | for l in range(layers - 1): 54 | model.add(Dense(units)) 55 | model.add(PReLU()) 56 | model.add(BatchNormalization()) 57 | model.add(Dropout(dropout)) 58 | 59 | model.add(Dense(nb_classes)) 60 | model.add(Activation('softmax')) 61 | model.compile(loss='categorical_crossentropy', optimizer='adam') 62 | 63 | if validation: 64 | early_stopping = EarlyStopping(monitor='val_loss', patience=patience, 65 | verbose=1, restore_best_weights=True) 66 | model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2, 67 | validation_data=(va_x, va_y), callbacks=[early_stopping]) 68 | else: 69 | model.fit(tr_x, tr_y, nb_epoch=nb_epoch, batch_size=128, verbose=2) 70 | 71 | # Retain model and scaler 72 | self.model = model 73 | self.scaler = scaler 74 | 75 | def predict(self, te_x): 76 | te_x = self.scaler.transform(te_x) 77 | pred = self.model.predict_proba(te_x) 78 | return pred 79 | 80 | def save_model(self): 81 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.h5') 82 | scaler_path = os.path.join('../model/model', f'{self.run_fold_name}-scaler.pkl') 83 | os.makedirs(os.path.dirname(model_path), exist_ok=True) 84 | self.model.save(model_path) 85 | Util.dump(self.scaler, scaler_path) 86 | 87 | def load_model(self): 88 | model_path = os.path.join('../model/model', f'{self.run_fold_name}.h5') 89 | scaler_path = os.path.join('../model/model', f'{self.run_fold_name}-scaler.pkl') 90 | self.model = load_model(model_path) 91 | self.scaler = Util.load(scaler_path) 92 | -------------------------------------------------------------------------------- /ch07/ch07-01-stacking.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Data for neural network 16 | train_nn = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 17 | train_x_nn = train_nn.drop(['target'], axis=1) 18 | train_y_nn = train_nn['target'] 19 | test_x_nn = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 20 | 21 | # --------------------------------- 22 | # Stacking 23 | # ---------------------------------- 24 | from sklearn.metrics import log_loss 25 | from sklearn.model_selection import KFold 26 | 27 | # Assume Model1Xgb, Model1NN and Model2Linear are defined in models.py 28 | # For each class train using fit and output prediction probabilities using predict 29 | 30 | from models import Model1Xgb, Model1NN, Model2Linear 31 | 32 | 33 | # Function that returns predictions for training data without knowing the target values, and predictions for the test data 34 | def predict_cv(model, train_x, train_y, test_x): 35 | preds = [] 36 | preds_test = [] 37 | va_idxes = [] 38 | 39 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 40 | 41 | # Train and make predictions using cross validation, save indices of predictions 42 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)): 43 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 44 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 45 | model.fit(tr_x, tr_y, va_x, va_y) 46 | pred = model.predict(va_x) 47 | preds.append(pred) 48 | pred_test = model.predict(test_x) 49 | preds_test.append(pred_test) 50 | va_idxes.append(va_idx) 51 | 52 | # Link using predictions from validation data, then return to original order 53 | va_idxes = np.concatenate(va_idxes) 54 | preds = np.concatenate(preds, axis=0) 55 | order = np.argsort(va_idxes) 56 | pred_train = preds[order] 57 | 58 | # Take average of predictions from test data 59 | preds_test = np.mean(preds_test, axis=0) 60 | 61 | return pred_train, preds_test 62 | 63 | 64 | # First level model 65 | # pred_train_1a, pred_train_1b are predictions from training data using cross validation 66 | # pred_test_1a, pred_test_1b are predictions from test data 67 | model_1a = Model1Xgb() 68 | pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x) 69 | 70 | model_1b = Model1NN() 71 | pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x_nn, train_y, test_x_nn) 72 | 73 | # Score for first level model 74 | print(f'logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}') 75 | print(f'logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}') 76 | 77 | # Make predictions a feature and create a data frame 78 | train_x_2 = pd.DataFrame({'pred_1a': pred_train_1a, 'pred_1b': pred_train_1b}) 79 | test_x_2 = pd.DataFrame({'pred_1a': pred_test_1a, 'pred_1b': pred_test_1b}) 80 | 81 | # Second level model 82 | # pred_train_2 are predictions from training data using cross validation via second level model 83 | # pred_test_2 are predictions from test data via second level model 84 | model_2 = Model2Linear() 85 | pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2) 86 | print(f'logloss: {log_loss(train_y, 
pred_train_2, eps=1e-7):.4f}') 87 | -------------------------------------------------------------------------------- /ch06/ch06-06-wrapper.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # Define an evaluation function that measures the accuracy of a list of features 24 | import xgboost as xgb 25 | from sklearn.metrics import log_loss 26 | 27 | 28 | def evaluate(features): 29 | dtrain = xgb.DMatrix(tr_x[features], label=tr_y) 30 | dvalid = xgb.DMatrix(va_x[features], label=va_y) 31 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 32 | num_round = 10 # In reality more rounds are necessary 33 | early_stopping_rounds = 3 34 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 35 | model = xgb.train(params, dtrain, num_round, 36 | evals=watchlist, early_stopping_rounds=early_stopping_rounds, 37 | verbose_eval=0) 38 | va_pred = model.predict(dvalid) 39 | score = log_loss(va_y, va_pred) 40 | 41 | return score 42 | 43 | 44 | # --------------------------------- 45 | # Greedy Forward Selection 46 | # ---------------------------------- 47 | 48 | best_score = 9999.0 49 | selected = set([]) 50 | 51 | print('start greedy forward selection') 52 | 53 | while True: 54 | 55 | if len(selected) == len(train_x.columns): 56 | # Stop once all features have been selected 57 | break 58 | 59 | scores = [] 60 | for feature in train_x.columns: 61 | if feature not in selected: 62 | # Assume the evaluation function that measures the accuracy of a list of features has been defined 63 | fs = list(selected) + [feature] 64 | score = evaluate(fs) 65 | scores.append((feature, score)) 66 | 67 | # Assume a lower score is better 68 | b_feature, b_score = sorted(scores, key=lambda tpl: tpl[1])[0] 69 | if b_score < best_score: 70 | selected.add(b_feature) 71 | best_score = b_score 72 | print(f'selected:{b_feature}') 73 | print(f'score:{b_score}') 74 | else: 75 | # The score does not improve no matter which feature is added, so finish 76 | break 77 | 78 | print(f'selected features: {selected}') 79 | 80 | # --------------------------------- 81 | # Simplified method for Greedy Forward Selection 82 | # ---------------------------------- 83 | 84 | best_score = 9999.0 85 | candidates = np.random.RandomState(71).permutation(train_x.columns) 86 | selected = set([]) 87 | 88 | print('start simple selection') 89 | for feature in candidates: 90 | # Assume the evaluation function that measures the accuracy of a list of features has been defined 91 | fs = list(selected) + [feature] 92 | score = evaluate(fs) 93 | 94 | # Assume a lower score is better 95 | if score < best_score: 96 | selected.add(feature) 97 | best_score = score 98 | 
print(f'selected:{feature}') 99 | print(f'score:{score}') 100 | 101 | print(f'selected features: {selected}') 102 | -------------------------------------------------------------------------------- /ch03/ch03-04-time_series.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Wide format, long format 6 | # ----------------------------------- 7 | 8 | # Load wide format data 9 | df_wide = pd.read_csv('../input/ch03/time_series_wide.csv', index_col=0) 10 | # Convert the index column to datetime dtype 11 | df_wide.index = pd.to_datetime(df_wide.index) 12 | 13 | print(df_wide.iloc[:5, :3]) 14 | ''' 15 | A B C 16 | date 17 | 2016-07-01 532 3314 1136 18 | 2016-07-02 798 2461 1188 19 | 2016-07-03 823 3522 1711 20 | 2016-07-04 937 5451 1977 21 | 2016-07-05 881 4729 1975 22 | ''' 23 | 24 | # Convert to long format 25 | df_long = df_wide.stack().reset_index(1) 26 | df_long.columns = ['id', 'value'] 27 | 28 | print(df_long.head(10)) 29 | ''' 30 | id value 31 | date 32 | 2016-07-01 A 532 33 | 2016-07-01 B 3314 34 | 2016-07-01 C 1136 35 | 2016-07-02 A 798 36 | 2016-07-02 B 2461 37 | 2016-07-02 C 1188 38 | 2016-07-03 A 823 39 | 2016-07-03 B 3522 40 | 2016-07-03 C 1711 41 | 2016-07-04 A 937 42 | ... 43 | ''' 44 | 45 | # Restore wide format 46 | df_wide = df_long.pivot(index=None, columns='id', values='value') 47 | 48 | # ----------------------------------- 49 | # Lag variables 50 | # ----------------------------------- 51 | # Set data to wide format 52 | x = df_wide 53 | # ----------------------------------- 54 | # x is the wide format data frame 55 | # The index is the date or timestamp, assume the columns store data of interest such as sales etc. 
for users or stores 56 | 57 | # Create lag data for one period ago 58 | x_lag1 = x.shift(1) 59 | 60 | # Create lag data for seven periods ago 61 | x_lag7 = x.shift(7) 62 | 63 | # ----------------------------------- 64 | # Calculate moving averages for three periods from one period before 65 | x_avg3 = x.shift(1).rolling(window=3).mean() 66 | 67 | # ----------------------------------- 68 | # Calculate max values over seven periods from one period before 69 | x_max7 = x.shift(1).rolling(window=7).max() 70 | 71 | # ----------------------------------- 72 | # Calculate average of data from 7, 14, 21 and 28 periods before 73 | x_e7_avg = (x.shift(7) + x.shift(14) + x.shift(21) + x.shift(28)) / 4.0 74 | 75 | # ----------------------------------- 76 | # Create values for one period ahead 77 | x_lead1 = x.shift(-1) 78 | 79 | # ----------------------------------- 80 | # Lag variables 81 | # ----------------------------------- 82 | # Load the data 83 | train_x = pd.read_csv('../input/ch03/time_series_train.csv') 84 | event_history = pd.read_csv('../input/ch03/time_series_events.csv') 85 | train_x['date'] = pd.to_datetime(train_x['date']) 86 | event_history['date'] = pd.to_datetime(event_history['date']) 87 | # ----------------------------------- 88 | 89 | # train_x is training data in a data frame with columns for user id and date 90 | # event_history contains data from past events in a data frame with date and event columns 91 | 92 | # occurrences is a data frame with columns for date and whether a sale was made or not 93 | dates = np.sort(train_x['date'].unique()) 94 | occurrences = pd.DataFrame(dates, columns=['date']) 95 | sale_history = event_history[event_history['event'] == 'sale'] 96 | occurrences['sale'] = occurrences['date'].isin(sale_history['date']) 97 | 98 | # Take cumulative sums to calculate the cumulative number of occurrences up to each date 99 | # occurrences is now a data frame with columns for date and cumulative number of sales on that date 100 | occurrences['sale'] = occurrences['sale'].cumsum() 101 | 102 | # Using the timestamp as a key, combine with the training dataset 103 | train_x = train_x.merge(occurrences, on='date', how='left') 104 | -------------------------------------------------------------------------------- /ch04/ch04-01-introduction.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (also possible to use numpy arrays) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | import xgboost as xgb 16 | 17 | 18 | # The Model class to operate the code 19 | class Model: 20 | 21 | def __init__(self, params=None): 22 | self.model = None 23 | if params is None: 24 | self.params = {} 25 | else: 26 | self.params = params 27 | 28 | def fit(self, tr_x, tr_y): 29 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 30 | params.update(self.params) 31 | num_round = 10 32 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 33 | self.model = xgb.train(params, dtrain, num_round) 34 | 35 | def predict(self, x): 36 | data = xgb.DMatrix(x) 37 | pred = self.model.predict(data) 38 | return pred 39 | 40 | 41 | # ----------------------------------- 42 | # Model training and prediction 43 | # ----------------------------------- 44 | # Specify the model hyperparameters 45 | params = {'param1': 10, 'param2': 100} 46 | 47 | # Define the Model class 48 | # The Model class has functions fit() for training and predict() for outputting predicted probabilities 49 | 50 | # Define the Model class 51 | model = Model(params) 52 | 53 | # Use the training data to train the model 54 | model.fit(train_x, train_y) 55 | 56 | # Output predictions for the test data 57 | pred = model.predict(test_x) 58 | 59 | # ----------------------------------- 60 | # Validation 61 | # ----------------------------------- 62 | from sklearn.metrics import log_loss 63 | from sklearn.model_selection import KFold 64 | 65 | # Create an index in order to split the training and validation data 66 | # Split the training data into 4, and keep aside 1 quarter for validation 67 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 68 | tr_idx, va_idx = list(kf.split(train_x))[0] 69 | 70 | # Split the training data into training and validation data 71 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 72 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 73 | 74 | # Define the model 75 | model = Model(params) 76 | 77 | # Use the training data to train the model 78 | # Depending on the model, validation data can be supplied at the same time in order to monitor the score 79 | model.fit(tr_x, tr_y) 80 | 81 | # Make predictions with the validation data, and calculate the score 82 | va_pred = model.predict(va_x) 83 | score = log_loss(va_y, va_pred) 84 | print(f'logloss: {score:.4f}') 85 | 86 | # ----------------------------------- 87 | # Cross validation 88 | # ----------------------------------- 89 | from sklearn.metrics import log_loss 90 | from sklearn.model_selection import KFold 91 | 92 | # Split the training data into 4, and keep aside 1 quarter for validation 93 | # Change the quarter used for validation and evaluate the score 4 times 94 | scores = [] 95 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 96 | for tr_idx, va_idx in kf.split(train_x): 97 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 98 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 99 | model = Model(params) 100 | model.fit(tr_x, tr_y) 101 | va_pred = model.predict(va_x) 102 | score = log_loss(va_y, va_pred) 103 | 
scores.append(score) 104 | 105 | # Output the mean cross validation score 106 | print(f'logloss: {np.mean(scores):.4f}') 107 | -------------------------------------------------------------------------------- /ch06/ch06-01-hopt.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Split training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # Class for training and making predictions with xgboost 24 | import xgboost as xgb 25 | 26 | 27 | class Model: 28 | 29 | def __init__(self, params=None): 30 | self.model = None 31 | if params is None: 32 | self.params = {} 33 | else: 34 | self.params = params 35 | 36 | def fit(self, tr_x, tr_y, va_x, va_y): 37 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 38 | params.update(self.params) 39 | num_round = 10 40 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 41 | dvalid = xgb.DMatrix(va_x, label=va_y) 42 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 43 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 44 | 45 | def predict(self, x): 46 | data = xgb.DMatrix(x) 47 | pred = self.model.predict(data) 48 | return pred 49 | 50 | 51 | # ----------------------------------- 52 | # Specify the parameter space to search 53 | # ----------------------------------- 54 | # hp.choice: select from multiple options 55 | # hp.uniform: select uniformly from distribution between minimum and maximum bounds. Arguments are minimum and maximum bounds. 56 | # hp.quniform: select uniformly at points separated by fixed intervals within minimum and maximum bounds. Arguments are minimum and maximum bounds and interval width. 57 | # hp.loguniform: select from distribution so logarithm of returned values is uniformly distributed. Arguments are logarithm of minimum and maximum bounds. 
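# Supplementary sketch (not part of the original script, kept as comments so the script's behaviour is unchanged):
# a search space built from the functions above can be sanity-checked by drawing random samples from it.
# Assuming the hyperopt version pinned in the readme is installed, each call below would print one randomly
# drawn parameter set. Note that hp.quniform returns floats, which is why values such as max_depth are cast
# to int before being passed to the model further down in this script. The names 'demo_units', 'demo_lr' and
# demo_space are illustrative only.
#   from hyperopt import hp
#   from hyperopt.pyll import stochastic
#   demo_space = {'units': hp.quniform('demo_units', 32, 256, 32),
#                 'lr': hp.loguniform('demo_lr', np.log(0.00001), np.log(0.01))}
#   print(stochastic.sample(demo_space))  # e.g. {'lr': 0.0004..., 'units': 96.0}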
58 | 59 | from hyperopt import hp 60 | 61 | space = { 62 | 'activation': hp.choice('activation', ['prelu', 'relu']), 63 | 'dropout': hp.uniform('dropout', 0, 0.2), 64 | 'units': hp.quniform('units', 32, 256, 32), 65 | 'learning_rate': hp.loguniform('learning_rate', np.log(0.00001), np.log(0.01)), 66 | } 67 | 68 | # ----------------------------------- 69 | # Parameter search using hyperopt 70 | # ----------------------------------- 71 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 72 | from sklearn.metrics import log_loss 73 | 74 | 75 | def score(params): 76 | # When specifying the parameters also specify a metric to minimize 77 | # To be more specific, specify the parameters, then return score for predictions from trained model 78 | 79 | # Convert max_depth to integer 80 | params['max_depth'] = int(params['max_depth']) 81 | 82 | # Assume Model has already been defined 83 | # The Model class function fit() performs training, and predict() outputs predicted probabilities 84 | model = Model(params) 85 | model.fit(tr_x, tr_y, va_x, va_y) 86 | va_pred = model.predict(va_x) 87 | score = log_loss(va_y, va_pred) 88 | print(f'params: {params}, logloss: {score:.4f}') 89 | 90 | # Save the information 91 | history.append((params, score)) 92 | 93 | return {'loss': score, 'status': STATUS_OK} 94 | 95 | 96 | # Specify parameter space to search 97 | space = { 98 | 'min_child_weight': hp.quniform('min_child_weight', 1, 5, 1), 99 | 'max_depth': hp.quniform('max_depth', 3, 9, 1), 100 | 'gamma': hp.quniform('gamma', 0, 0.4, 0.1), 101 | } 102 | 103 | # Use hyperopt for parameter search 104 | max_evals = 10 105 | trials = Trials() 106 | history = [] 107 | fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals) 108 | 109 | # Use recorded information to output parameter and score 110 | # (trials provides some information, but using it to obtain parameters is difficult in practice) 111 | history = sorted(history, key=lambda tpl: tpl[1]) 112 | best = history[0] 113 | print(f'best params:{best[0]}, score:{best[1]:.4f}') 114 | -------------------------------------------------------------------------------- /input/ch03/time_series_wide.csv: -------------------------------------------------------------------------------- 1 | ,A,B,C 2 | 2016-07-01,532,3314,1136 3 | 2016-07-02,798,2461,1188 4 | 2016-07-03,823,3522,1711 5 | 2016-07-04,937,5451,1977 6 | 2016-07-05,881,4729,1975 7 | 2016-07-06,931,4694,1937 8 | 2016-07-07,989,4077,1943 9 | 2016-07-08,905,4555,2112 10 | 2016-07-09,823,4467,1964 11 | 2016-07-10,786,5170,2132 12 | 2016-07-11,984,4241,1983 13 | 2016-07-12,939,5534,1624 14 | 2016-07-13,850,5485,2184 15 | 2016-07-14,980,4805,2211 16 | 2016-07-15,887,4357,2120 17 | 2016-07-16,761,4490,1896 18 | 2016-07-17,922,4705,1813 19 | 2016-07-18,855,4806,2169 20 | 2016-07-19,916,5174,1764 21 | 2016-07-20,835,5103,1875 22 | 2016-07-21,884,4813,1811 23 | 2016-07-22,978,4604,2230 24 | 2016-07-23,919,4418,2093 25 | 2016-07-24,931,4524,2053 26 | 2016-07-25,907,4907,2021 27 | 2016-07-26,1029,5268,2141 28 | 2016-07-27,1069,4929,2027 29 | 2016-07-28,841,4639,1983 30 | 2016-07-29,942,5311,2027 31 | 2016-07-30,876,4458,1805 32 | 2016-07-31,991,4677,1898 33 | 2016-08-01,851,5131,2100 34 | 2016-08-02,994,4471,1980 35 | 2016-08-03,744,5394,2047 36 | 2016-08-04,810,3781,2097 37 | 2016-08-05,822,4411,1980 38 | 2016-08-06,724,4731,1928 39 | 2016-08-07,895,5337,2069 40 | 2016-08-08,720,4418,1696 41 | 2016-08-09,831,4586,2302 42 | 2016-08-10,910,4230,1872 43 | 2016-08-11,879,4729,1993 44 | 2016-08-12,1009,4619,2243 
45 | 2016-08-13,755,5032,1970 46 | 2016-08-14,886,4557,2011 47 | 2016-08-15,982,4311,2223 48 | 2016-08-16,1139,3716,1770 49 | 2016-08-17,801,5133,2188 50 | 2016-08-18,875,5437,1652 51 | 2016-08-19,1050,4444,2225 52 | 2016-08-20,1047,4848,1901 53 | 2016-08-21,701,4667,2226 54 | 2016-08-22,970,5180,1882 55 | 2016-08-23,841,4660,1866 56 | 2016-08-24,901,4502,1825 57 | 2016-08-25,795,4911,1878 58 | 2016-08-26,882,4685,1702 59 | 2016-08-27,783,4379,2187 60 | 2016-08-28,798,4583,1949 61 | 2016-08-29,868,4768,2023 62 | 2016-08-30,800,5131,2088 63 | 2016-08-31,977,5065,2371 64 | 2016-09-01,991,5012,1821 65 | 2016-09-02,801,4696,1559 66 | 2016-09-03,898,4467,2306 67 | 2016-09-04,766,4925,1865 68 | 2016-09-05,920,4783,2522 69 | 2016-09-06,796,4091,1972 70 | 2016-09-07,1046,6043,1652 71 | 2016-09-08,842,4967,2114 72 | 2016-09-09,802,4414,2230 73 | 2016-09-10,767,4688,1824 74 | 2016-09-11,1065,5378,1944 75 | 2016-09-12,976,4492,2391 76 | 2016-09-13,885,4569,2014 77 | 2016-09-14,861,5533,2077 78 | 2016-09-15,732,4870,1799 79 | 2016-09-16,942,4380,1836 80 | 2016-09-17,793,4246,1866 81 | 2016-09-18,980,4324,2152 82 | 2016-09-19,866,4071,1760 83 | 2016-09-20,997,5980,2274 84 | 2016-09-21,937,5045,2296 85 | 2016-09-22,787,5017,2010 86 | 2016-09-23,969,5198,2087 87 | 2016-09-24,779,4500,1906 88 | 2016-09-25,915,5219,1932 89 | 2016-09-26,925,4815,2252 90 | 2016-09-27,858,5338,2257 91 | 2016-09-28,911,5173,2059 92 | 2016-09-29,914,4602,1844 93 | 2016-09-30,803,4860,2025 94 | 2016-10-01,1145,5120,1905 95 | 2016-10-02,837,4436,1867 96 | 2016-10-03,780,5155,1899 97 | 2016-10-04,920,4406,2095 98 | 2016-10-05,810,4238,2053 99 | 2016-10-06,929,6004,2002 100 | 2016-10-07,1092,4742,2264 101 | 2016-10-08,809,5159,1771 102 | 2016-10-09,980,4765,1853 103 | 2016-10-10,884,4761,1777 104 | 2016-10-11,828,4039,2088 105 | 2016-10-12,931,5125,1966 106 | 2016-10-13,862,4981,2250 107 | 2016-10-14,886,4600,2277 108 | 2016-10-15,991,5283,2149 109 | 2016-10-16,1166,4292,2178 110 | 2016-10-17,1023,4822,1864 111 | 2016-10-18,981,3740,1645 112 | 2016-10-19,890,4192,2407 113 | 2016-10-20,870,4564,2108 114 | 2016-10-21,977,5349,1831 115 | 2016-10-22,997,5652,1826 116 | 2016-10-23,787,4443,2020 117 | 2016-10-24,975,4380,2108 118 | 2016-10-25,786,4275,2121 119 | 2016-10-26,902,4861,2463 120 | 2016-10-27,830,4317,2402 121 | 2016-10-28,837,4727,1749 122 | 2016-10-29,971,4097,1988 123 | 2016-10-30,794,4331,2326 124 | 2016-10-31,702,5094,1940 125 | 2016-11-01,884,4632,1952 126 | 2016-11-02,856,4972,1836 127 | 2016-11-03,1001,4663,1936 128 | 2016-11-04,911,5228,1949 129 | 2016-11-05,846,4980,1999 130 | 2016-11-06,1091,5191,1792 131 | 2016-11-07,978,4185,1620 132 | 2016-11-08,853,5440,1704 133 | 2016-11-09,753,4414,1852 134 | 2016-11-10,774,4814,1739 135 | 2016-11-11,975,5982,1890 136 | 2016-11-12,822,5464,1796 137 | 2016-11-13,768,5583,1615 138 | 2016-11-14,900,4456,2040 139 | 2016-11-15,873,4958,1904 140 | 2016-11-16,1102,5302,1771 141 | 2016-11-17,906,5559,1947 142 | 2016-11-18,903,4484,2039 143 | 2016-11-19,1081,4729,1731 144 | 2016-11-20,931,4010,1891 145 | 2016-11-21,782,4549,2001 146 | 2016-11-22,827,4642,1929 147 | 2016-11-23,873,3989,1965 148 | 2016-11-24,869,4906,2038 149 | 2016-11-25,938,4060,1991 150 | 2016-11-26,1077,4496,2382 151 | 2016-11-27,785,4723,2190 152 | 2016-11-28,830,4573,1838 153 | 2016-11-29,979,5131,1906 154 | 2016-11-30,806,5175,1958 155 | 2016-12-01,966,4565,2020 156 | 2016-12-02,844,3930,2190 157 | 2016-12-03,1026,5353,2535 158 | 2016-12-04,1014,4330,1921 159 | 2016-12-05,927,4130,2136 160 | 
2016-12-06,745,4651,1882 161 | 2016-12-07,871,4339,2033 162 | 2016-12-08,839,3908,2062 163 | 2016-12-09,865,5423,1769 164 | 2016-12-10,923,3763,1884 165 | 2016-12-11,812,5022,1989 166 | 2016-12-12,1004,3949,1691 167 | 2016-12-13,845,5112,2208 168 | 2016-12-14,984,4661,1881 169 | 2016-12-15,842,4788,1962 170 | 2016-12-16,940,5799,1750 171 | 2016-12-17,900,4817,2048 172 | 2016-12-18,1003,4967,2025 173 | 2016-12-19,977,5274,1898 174 | 2016-12-20,890,3935,2085 175 | 2016-12-21,754,4846,2226 176 | 2016-12-22,992,4949,2181 177 | 2016-12-23,854,4619,2035 178 | 2016-12-24,900,5263,2144 179 | 2016-12-25,712,5029,1832 180 | 2016-12-26,840,4576,1954 181 | 2016-12-27,840,4573,1850 182 | 2016-12-28,943,4511,1764 183 | 2016-12-29,978,4599,1787 184 | 2016-12-30,907,4243,2069 185 | 2016-12-31,869,4703,2233 186 | -------------------------------------------------------------------------------- /ch03/ch03-05-reduction.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 14 | 15 | # For explanations save the original forms of the training and test data 16 | train_x_saved = train_x.copy() 17 | test_x_saved = test_x.copy() 18 | 19 | from sklearn.preprocessing import StandardScaler, MinMaxScaler 20 | 21 | 22 | # Function to return standardized versions of the original training and test data 23 | def load_standarized_data(): 24 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 25 | 26 | scaler = StandardScaler() 27 | scaler.fit(train_x) 28 | train_x = scaler.transform(train_x) 29 | test_x = scaler.transform(test_x) 30 | return pd.DataFrame(train_x), pd.DataFrame(test_x) 31 | 32 | 33 | # Function to return MinMax scaled versions of the original training and test data 34 | def load_minmax_scaled_data(): 35 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 36 | 37 | # Apply Min-Max Scaling 38 | scaler = MinMaxScaler() 39 | scaler.fit(pd.concat([train_x, test_x], axis=0)) 40 | train_x = scaler.transform(train_x) 41 | test_x = scaler.transform(test_x) 42 | 43 | return pd.DataFrame(train_x), pd.DataFrame(test_x) 44 | 45 | 46 | # ----------------------------------- 47 | # PCA 48 | # ----------------------------------- 49 | # Use the standardized data 50 | train_x, test_x = load_standarized_data() 51 | # ----------------------------------- 52 | # PCA 53 | from sklearn.decomposition import PCA 54 | 55 | # Assume that the data has been preprocessed e.g. 
by standardization to make the scale uniform 56 | 57 | # Fit the PCA transformation using the training data 58 | pca = PCA(n_components=5) 59 | pca.fit(train_x) 60 | 61 | # Apply the transformation 62 | train_x = pca.transform(train_x) 63 | test_x = pca.transform(test_x) 64 | 65 | # ----------------------------------- 66 | # Use the standardized data 67 | train_x, test_x = load_standarized_data() 68 | # ----------------------------------- 69 | # TruncatedSVD 70 | from sklearn.decomposition import TruncatedSVD 71 | 72 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 73 | 74 | # Fit the SVD transformation using the training data 75 | svd = TruncatedSVD(n_components=5, random_state=71) 76 | svd.fit(train_x) 77 | 78 | # Apply the transformation 79 | train_x = svd.transform(train_x) 80 | test_x = svd.transform(test_x) 81 | 82 | # ----------------------------------- 83 | # NMF 84 | # ----------------------------------- 85 | # So that the data are non-negative, use the MinMax scaled data 86 | train_x, test_x = load_minmax_scaled_data() 87 | # ----------------------------------- 88 | from sklearn.decomposition import NMF 89 | 90 | # Assume the data only contains non-negative values 91 | 92 | # Fit the NMF transformation using the training data 93 | model = NMF(n_components=5, init='random', random_state=71) 94 | model.fit(train_x) 95 | 96 | # Apply the transformation 97 | train_x = model.transform(train_x) 98 | test_x = model.transform(test_x) 99 | 100 | # ----------------------------------- 101 | # LatentDirichletAllocation 102 | # ----------------------------------- 103 | # Use the MinMax scaled data 104 | # Although this is not a matrix of counts, as the values are all non-negative it is still possible to calculate 105 | train_x, test_x = load_minmax_scaled_data() 106 | # ----------------------------------- 107 | from sklearn.decomposition import LatentDirichletAllocation 108 | 109 | # Assume the data is a matrix of counts of words in a document 110 | 111 | # Fit the Latent Dirichlet Allocation transformation using the training data 112 | model = LatentDirichletAllocation(n_components=5, random_state=71) 113 | model.fit(train_x) 114 | 115 | # Apply the transformation 116 | train_x = model.transform(train_x) 117 | test_x = model.transform(test_x) 118 | 119 | # ----------------------------------- 120 | # LinearDiscriminantAnalysis 121 | # ----------------------------------- 122 | # Use the standardized data 123 | train_x, test_x = load_standarized_data() 124 | # ----------------------------------- 125 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 126 | 127 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 128 | 129 | # Fit the Linear Discriminant Analysis transformation using the training data 130 | lda = LDA(n_components=1) 131 | lda.fit(train_x, train_y) 132 | 133 | # Apply the transformation 134 | train_x = lda.transform(train_x) 135 | test_x = lda.transform(test_x) 136 | 137 | # ----------------------------------- 138 | # t-sne 139 | # ----------------------------------- 140 | # Use the standardized data 141 | train_x, test_x = load_standarized_data() 142 | # ----------------------------------- 143 | import bhtsne 144 | 145 | # Assume that the data has been preprocessed e.g. 
by standardization to make the scale uniform 146 | 147 | # Transform using t-sne 148 | data = pd.concat([train_x, test_x]) 149 | embedded = bhtsne.tsne(data.astype(np.float64), dimensions=2, rand_seed=71) 150 | 151 | # ----------------------------------- 152 | # UMAP 153 | # ----------------------------------- 154 | # Use the standardized data 155 | train_x, test_x = load_standarized_data() 156 | # ----------------------------------- 157 | import umap 158 | 159 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 160 | 161 | # Fit the UMAP transformation using the training data 162 | um = umap.UMAP() 163 | um.fit(train_x) 164 | 165 | # Apply the transformation 166 | train_x = um.transform(train_x) 167 | test_x = um.transform(test_x) 168 | 169 | # ----------------------------------- 170 | # Clustering 171 | # ----------------------------------- 172 | # Use the standardized data 173 | train_x, test_x = load_standarized_data() 174 | # ----------------------------------- 175 | from sklearn.cluster import MiniBatchKMeans 176 | 177 | # Assume that the data has been preprocessed e.g. by standardization to make the scale uniform 178 | 179 | # Fit the Mini-Batch K-Means using the training data 180 | kmeans = MiniBatchKMeans(n_clusters=10, random_state=71) 181 | kmeans.fit(train_x) 182 | 183 | # Output the cluster to which each record belongs 184 | train_clusters = kmeans.predict(train_x) 185 | test_clusters = kmeans.predict(test_x) 186 | 187 | # Output the distance to the center of each cluster 188 | train_distances = kmeans.transform(train_x) 189 | test_distances = kmeans.transform(test_x) 190 | -------------------------------------------------------------------------------- /ch05/ch05-01-validation.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Class for training and making predictions using xgboost 16 | import xgboost as xgb 17 | 18 | 19 | class Model: 20 | 21 | def __init__(self, params=None): 22 | self.model = None 23 | if params is None: 24 | self.params = {} 25 | else: 26 | self.params = params 27 | 28 | def fit(self, tr_x, tr_y, va_x, va_y): 29 | params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71} 30 | params.update(self.params) 31 | num_round = 10 32 | dtrain = xgb.DMatrix(tr_x, label=tr_y) 33 | dvalid = xgb.DMatrix(va_x, label=va_y) 34 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] 35 | self.model = xgb.train(params, dtrain, num_round, evals=watchlist) 36 | 37 | def predict(self, x): 38 | data = xgb.DMatrix(x) 39 | pred = self.model.predict(data) 40 | return pred 41 | 42 | 43 | # ----------------------------------- 44 | # hold-out method 45 | # ----------------------------------- 46 | # Partition validation data for hold-out method 47 | 48 | from sklearn.model_selection import train_test_split 49 | 50 | # Use train_test_split function for partitioning 51 | tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y, 52 | test_size=0.25, random_state=71, shuffle=True) 53 | 54 | # ----------------------------------- 55 | # Perform validation with hold-out method 56 | 57 | from sklearn.metrics import log_loss 58 | from sklearn.model_selection import train_test_split 59 | 60 | # Assume Model class has been predefined 61 | # Model class performs fitting and returns predicted probabilities for each outcome 62 | 63 | # Use train_test_split() function for partitioning 64 | tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y, 65 | test_size=0.25, random_state=71, shuffle=True) 66 | 67 | # Train the model, output predictions and calculate score 68 | model = Model() 69 | model.fit(tr_x, tr_y, va_x, va_y) 70 | va_pred = model.predict(va_x) 71 | score = log_loss(va_y, va_pred) 72 | print(score) 73 | 74 | # ----------------------------------- 75 | # Use the KFold class to partition validation data for hold-out method 76 | 77 | from sklearn.model_selection import KFold 78 | 79 | # Use KFold class to partition for hold-out method 80 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 81 | tr_idx, va_idx = list(kf.split(train_x))[0] 82 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 83 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 84 | 85 | # ----------------------------------- 86 | # Cross validation 87 | # ----------------------------------- 88 | # Partition data for cross validation 89 | 90 | from sklearn.model_selection import KFold 91 | 92 | # Use KFold class for partitioning for cross validation 93 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 94 | for tr_idx, va_idx in kf.split(train_x): 95 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 96 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 97 | 98 | # ----------------------------------- 99 | # Perform cross validation 100 | 101 | from sklearn.metrics import log_loss 102 | from sklearn.model_selection import 
KFold 103 | 104 | # It is assumed that the Model class has been predefined 105 | # Model class performs fitting and returns predicted probabilities for each outcome 106 | 107 | scores = [] 108 | 109 | # Use KFold class for partitioning for cross validation 110 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 111 | for tr_idx, va_idx in kf.split(train_x): 112 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 113 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 114 | 115 | # Train the model, output predictions and calculate score 116 | model = Model() 117 | model.fit(tr_x, tr_y, va_x, va_y) 118 | va_pred = model.predict(va_x) 119 | score = log_loss(va_y, va_pred) 120 | scores.append(score) 121 | 122 | # Take average of scores for each fold 123 | print(np.mean(scores)) 124 | 125 | # ----------------------------------- 126 | # Stratified K-Fold 127 | # ----------------------------------- 128 | from sklearn.model_selection import StratifiedKFold 129 | 130 | # Use the StratifiedKFold class to perform partitioning into stratified folds 131 | kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71) 132 | for tr_idx, va_idx in kf.split(train_x, train_y): 133 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 134 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 135 | 136 | # ----------------------------------- 137 | # GroupKFold 138 | # ----------------------------------- 139 | # It is assumed that the data has the same users repeated 4 times 140 | train_x['user_id'] = np.arange(0, len(train_x)) // 4 141 | # ----------------------------------- 142 | 143 | from sklearn.model_selection import KFold, GroupKFold 144 | 145 | # Partition taking the user_id column to be the customer ID 146 | user_id = train_x['user_id'] 147 | unique_user_ids = user_id.unique() 148 | 149 | # Use the KFold class and partition using the customer ID 150 | scores = [] 151 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 152 | for tr_group_idx, va_group_idx in kf.split(unique_user_ids): 153 | # Partition using the customer ID (into data for training and validation) 154 | tr_groups, va_groups = unique_user_ids[tr_group_idx], unique_user_ids[va_group_idx] 155 | 156 | # Partition records based on whether the customer ID is in train/valid 157 | is_tr = user_id.isin(tr_groups) 158 | is_va = user_id.isin(va_groups) 159 | tr_x, va_x = train_x[is_tr], train_x[is_va] 160 | tr_y, va_y = train_y[is_tr], train_y[is_va] 161 | 162 | # (For reference)GroupKFold is difficult to use as you cannot shuffle or specify the random number seed 163 | kf = GroupKFold(n_splits=4) 164 | for tr_idx, va_idx in kf.split(train_x, train_y, user_id): 165 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 166 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 167 | 168 | # ----------------------------------- 169 | # leave-one-out 170 | # ----------------------------------- 171 | # Assume that there are only 100 data 172 | train_x = train_x.iloc[:100, :].copy() 173 | # ----------------------------------- 174 | from sklearn.model_selection import LeaveOneOut 175 | 176 | loo = LeaveOneOut() 177 | for tr_idx, va_idx in loo.split(train_x): 178 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 179 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 180 | -------------------------------------------------------------------------------- /ch06/ch06-03-hopt_nn.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # 
Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y is the target values, and test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv') 14 | 15 | # Split training data into training and validation data 16 | from sklearn.model_selection import KFold 17 | 18 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 19 | tr_idx, va_idx = list(kf.split(train_x))[0] 20 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 21 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 22 | 23 | # Suppress tensorflow warnings 24 | import os 25 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 26 | import tensorflow as tf 27 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 28 | 29 | # ----------------------------------- 30 | # Example of parameter tuning for a neural network 31 | # ----------------------------------- 32 | from hyperopt import hp 33 | from keras.callbacks import EarlyStopping 34 | from keras.layers.advanced_activations import ReLU, PReLU 35 | from keras.layers.core import Dense, Dropout 36 | from keras.layers.normalization import BatchNormalization 37 | from keras.models import Sequential 38 | from keras.optimizers import SGD, Adam 39 | from sklearn.preprocessing import StandardScaler 40 | 41 | # Fundamental parameters 42 | base_param = { 43 | 'input_dropout': 0.0, 44 | 'hidden_layers': 3, 45 | 'hidden_units': 96, 46 | 'hidden_activation': 'relu', 47 | 'hidden_dropout': 0.2, 48 | 'batch_norm': 'before_act', 49 | 'optimizer': {'type': 'adam', 'lr': 0.001}, 50 | 'batch_size': 64, 51 | } 52 | 53 | # Specify parameter space to search 54 | param_space = { 55 | 'input_dropout': hp.quniform('input_dropout', 0, 0.2, 0.05), 56 | 'hidden_layers': hp.quniform('hidden_layers', 2, 4, 1), 57 | 'hidden_units': hp.quniform('hidden_units', 32, 256, 32), 58 | 'hidden_activation': hp.choice('hidden_activation', ['prelu', 'relu']), 59 | 'hidden_dropout': hp.quniform('hidden_dropout', 0, 0.3, 0.05), 60 | 'batch_norm': hp.choice('batch_norm', ['before_act', 'no']), 61 | 'optimizer': hp.choice('optimizer', 62 | [{'type': 'adam', 63 | 'lr': hp.loguniform('adam_lr', np.log(0.00001), np.log(0.01))}, 64 | {'type': 'sgd', 65 | 'lr': hp.loguniform('sgd_lr', np.log(0.00001), np.log(0.01))}]), 66 | 'batch_size': hp.quniform('batch_size', 32, 128, 32), 67 | } 68 | 69 | 70 | class MLP: 71 | 72 | def __init__(self, params): 73 | self.params = params 74 | self.scaler = None 75 | self.model = None 76 | 77 | def fit(self, tr_x, tr_y, va_x, va_y): 78 | 79 | # Parameters 80 | input_dropout = self.params['input_dropout'] 81 | hidden_layers = int(self.params['hidden_layers']) 82 | hidden_units = int(self.params['hidden_units']) 83 | hidden_activation = self.params['hidden_activation'] 84 | hidden_dropout = self.params['hidden_dropout'] 85 | batch_norm = self.params['batch_norm'] 86 | optimizer_type = self.params['optimizer']['type'] 87 | optimizer_lr = self.params['optimizer']['lr'] 88 | batch_size = int(self.params['batch_size']) 89 | 90 | # Standardization 91 | self.scaler = StandardScaler() 92 | tr_x = self.scaler.fit_transform(tr_x) 93 | va_x = self.scaler.transform(va_x) 94 | 95 | self.model = Sequential() 96 | 97 | # Input layer 98 
| self.model.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],))) 99 | 100 | # Hidden layers 101 | for i in range(hidden_layers): 102 | self.model.add(Dense(hidden_units)) 103 | if batch_norm == 'before_act': 104 | self.model.add(BatchNormalization()) 105 | if hidden_activation == 'prelu': 106 | self.model.add(PReLU()) 107 | elif hidden_activation == 'relu': 108 | self.model.add(ReLU()) 109 | else: 110 | raise NotImplementedError 111 | self.model.add(Dropout(hidden_dropout)) 112 | 113 | # Output layer 114 | self.model.add(Dense(1, activation='sigmoid')) 115 | 116 | # Optimizer 117 | if optimizer_type == 'sgd': 118 | optimizer = SGD(lr=optimizer_lr, decay=1e-6, momentum=0.9, nesterov=True) 119 | elif optimizer_type == 'adam': 120 | optimizer = Adam(lr=optimizer_lr, beta_1=0.9, beta_2=0.999, decay=0.) 121 | else: 122 | raise NotImplementedError 123 | 124 | # Set objective function, metric etc. 125 | self.model.compile(loss='binary_crossentropy', 126 | optimizer=optimizer, metrics=['accuracy']) 127 | 128 | # Number of epochs, early stopping 129 | # Beware that with a small learning rate, training may not converge within the set number of epochs 130 | nb_epoch = 200 131 | patience = 20 132 | early_stopping = EarlyStopping(patience=patience, restore_best_weights=True) 133 | 134 | # Execute training 135 | history = self.model.fit(tr_x, tr_y, 136 | epochs=nb_epoch, 137 | batch_size=batch_size, verbose=1, 138 | validation_data=(va_x, va_y), 139 | callbacks=[early_stopping]) 140 | 141 | def predict(self, x): 142 | # Predictions 143 | x = self.scaler.transform(x) 144 | y_pred = self.model.predict(x) 145 | y_pred = y_pred.flatten() 146 | return y_pred 147 | 148 | 149 | # ----------------------------------- 150 | # Hyperparameter tuning 151 | 152 | from hyperopt import fmin, tpe, STATUS_OK, Trials 153 | from sklearn.metrics import log_loss 154 | 155 | 156 | def score(params): 157 | # Define the function to minimize for a given set of parameters 158 | # Here, train a model with the given parameters and return the score of its predictions on the validation data 159 | model = MLP(params) 160 | model.fit(tr_x, tr_y, va_x, va_y) 161 | va_pred = model.predict(va_x) 162 | score = log_loss(va_y, va_pred) 163 | print(f'params: {params}, logloss: {score:.4f}') 164 | 165 | # Save the information 166 | history.append((params, score)) 167 | 168 | return {'loss': score, 'status': STATUS_OK} 169 | 170 | 171 | # Use hyperopt for parameter search 172 | max_evals = 10 173 | trials = Trials() 174 | history = [] 175 | fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals) 176 | 177 | # Output parameters and scores from saved information 178 | # Trials provides some information, but using it to obtain the parameters is difficult in practice 179 | history = sorted(history, key=lambda tpl: tpl[1]) 180 | best = history[0] 181 | print(f'best params:{best[0]}, score:{best[1]:.4f}') 182 | -------------------------------------------------------------------------------- /ch02/ch02-01-metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Regression 6 | # ----------------------------------- 7 | # rmse 8 | 9 | from sklearn.metrics import mean_squared_error 10 | 11 | # y_true are the true values, y_pred are the predictions 12 | y_true = [1.0, 1.5, 2.0, 1.2, 1.8] 13 | y_pred = [0.8, 1.5, 1.8, 1.3, 3.0] 14 | 15 | rmse = np.sqrt(mean_squared_error(y_true, y_pred)) 16 | 
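# Supplementary note (not part of the original script): with the scikit-learn version pinned in the readme
# (0.21.2), taking the square root manually as above is required; in newer scikit-learn releases (>= 0.22)
# the same value could be obtained directly with
#   rmse = mean_squared_error(y_true, y_pred, squared=False)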
print(rmse) 17 | # 0.5532 18 | 19 | # ----------------------------------- 20 | # Binary classification 21 | # ----------------------------------- 22 | # Confusion matrix 23 | 24 | from sklearn.metrics import confusion_matrix 25 | 26 | # True values and predicted values are binary, i.e. either 0 or 1 27 | y_true = [1, 0, 1, 1, 0, 1, 1, 0] 28 | y_pred = [0, 0, 1, 1, 0, 0, 1, 1] 29 | 30 | tp = np.sum((np.array(y_true) == 1) & (np.array(y_pred) == 1)) 31 | tn = np.sum((np.array(y_true) == 0) & (np.array(y_pred) == 0)) 32 | fp = np.sum((np.array(y_true) == 0) & (np.array(y_pred) == 1)) 33 | fn = np.sum((np.array(y_true) == 1) & (np.array(y_pred) == 0)) 34 | 35 | confusion_matrix1 = np.array([[tp, fp], 36 | [fn, tn]]) 37 | print(confusion_matrix1) 38 | # array([[3, 1], 39 | # [2, 2]]) 40 | 41 | # Can also be created using the confusion_matrix() function from scikit-learn's metrics, but 42 | # be aware that the arrangement of the confusion matrix elements may be different 43 | confusion_matrix2 = confusion_matrix(y_true, y_pred) 44 | print(confusion_matrix2) 45 | # array([[2, 1], 46 | # [2, 3]]) 47 | 48 | # ----------------------------------- 49 | # accuracy 50 | 51 | from sklearn.metrics import accuracy_score 52 | 53 | # True values and predicted values are binary, i.e. either 0 or 1 54 | y_true = [1, 0, 1, 1, 0, 1, 1, 0] 55 | y_pred = [0, 0, 1, 1, 0, 0, 1, 1] 56 | accuracy = accuracy_score(y_true, y_pred) 57 | print(accuracy) 58 | # 0.625 59 | 60 | # ----------------------------------- 61 | # logloss 62 | 63 | from sklearn.metrics import log_loss 64 | 65 | # True values are binary (0 or 1), predicted values are probabilities 66 | y_true = [1, 0, 1, 1, 0, 1] 67 | y_prob = [0.1, 0.2, 0.8, 0.8, 0.1, 0.3] 68 | 69 | logloss = log_loss(y_true, y_prob) 70 | print(logloss) 71 | # 0.7136 72 | 73 | # ----------------------------------- 74 | # Multi-class classification 75 | # ----------------------------------- 76 | # multi-class logloss 77 | 78 | from sklearn.metrics import log_loss 79 | 80 | # True values are 3-class classifiers, predicted values are probabilities for each class 81 | y_true = np.array([0, 2, 1, 2, 2]) 82 | y_pred = np.array([[0.68, 0.32, 0.00], 83 | [0.00, 0.00, 1.00], 84 | [0.60, 0.40, 0.00], 85 | [0.00, 0.00, 1.00], 86 | [0.28, 0.12, 0.60]]) 87 | logloss = log_loss(y_true, y_pred) 88 | print(logloss) 89 | # 0.3626 90 | 91 | # ----------------------------------- 92 | # Multi-label classification 93 | # ----------------------------------- 94 | # mean_f1, macro_f1, micro_f1 95 | 96 | from sklearn.metrics import f1_score 97 | 98 | # For calculating performance metric of multi-label classification, it is easier to handle the true / predicted values as binary matrices of record x class 99 | # True values - [[1,2], [1], [1,2,3], [2,3], [3]] 100 | y_true = np.array([[1, 1, 0], 101 | [1, 0, 0], 102 | [1, 1, 1], 103 | [0, 1, 1], 104 | [0, 0, 1]]) 105 | 106 | # Predicted values - [[1,3], [2], [1,3], [3], [3]] 107 | y_pred = np.array([[1, 0, 1], 108 | [0, 1, 0], 109 | [1, 0, 1], 110 | [0, 0, 1], 111 | [0, 0, 1]]) 112 | 113 | # mean_f1 is the mean of the F1-scores for each record 114 | mean_f1 = np.mean([f1_score(y_true[i, :], y_pred[i, :]) for i in range(len(y_true))]) 115 | 116 | # macro_f1 is the mean of the F1-scores for each class 117 | n_class = 3 118 | macro_f1 = np.mean([f1_score(y_true[:, c], y_pred[:, c]) for c in range(n_class)]) 119 | 120 | # micro-f1 is the F1-score calculate using the true/predicted values for each record-class pair 121 | micro_f1 = f1_score(y_true.reshape(-1), 
y_pred.reshape(-1)) 122 | 123 | print(mean_f1, macro_f1, micro_f1) 124 | # 0.5933, 0.5524, 0.6250 125 | 126 | # Can also be calculated using a scikit-learn function 127 | mean_f1 = f1_score(y_true, y_pred, average='samples') 128 | macro_f1 = f1_score(y_true, y_pred, average='macro') 129 | micro_f1 = f1_score(y_true, y_pred, average='micro') 130 | 131 | # ----------------------------------- 132 | # Multi-class classification with ordered classes 133 | # ----------------------------------- 134 | # quadratic weighted kappa 135 | 136 | from sklearn.metrics import confusion_matrix, cohen_kappa_score 137 | 138 | 139 | # Function for calculating quadratic weighted kappa 140 | def quadratic_weighted_kappa(c_matrix): 141 | numer = 0.0 142 | denom = 0.0 143 | 144 | for i in range(c_matrix.shape[0]): 145 | for j in range(c_matrix.shape[1]): 146 | n = c_matrix.shape[0] 147 | wij = ((i - j) ** 2.0) 148 | oij = c_matrix[i, j] 149 | eij = c_matrix[i, :].sum() * c_matrix[:, j].sum() / c_matrix.sum() 150 | numer += wij * oij 151 | denom += wij * eij 152 | 153 | return 1.0 - numer / denom 154 | 155 | 156 | # y_true is the true class list, y_pred is the predicted class list 157 | y_true = [1, 2, 3, 4, 3] 158 | y_pred = [2, 2, 4, 4, 5] 159 | 160 | # Calculate the confusion matrix 161 | c_matrix = confusion_matrix(y_true, y_pred, labels=[1, 2, 3, 4, 5]) 162 | 163 | # Calculate quadratic weighted kappa 164 | kappa = quadratic_weighted_kappa(c_matrix) 165 | print(kappa) 166 | # 0.6153 167 | 168 | # Can also be calculated using a scikit-learn function 169 | kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic') 170 | 171 | # ----------------------------------- 172 | # Recommendation 173 | # ----------------------------------- 174 | # MAP@K 175 | 176 | # K=3, with 5 records and 4 class types 177 | K = 3 178 | 179 | # True values for each record 180 | y_true = [[1, 2], [1, 2], [4], [1, 2, 3, 4], [3, 4]] 181 | 182 | # Predicted values for each record - as K=3, usually predict order of 3 records for each class 183 | y_pred = [[1, 2, 4], [4, 1, 2], [1, 4, 3], [1, 2, 3], [1, 2, 4]] 184 | 185 | 186 | # Function to calculate the average precision for each record 187 | def apk(y_i_true, y_i_pred): 188 | # Length of y_pred must be less than or equal to K, and all elements must be unique 189 | assert (len(y_i_pred) <= K) 190 | assert (len(np.unique(y_i_pred)) == len(y_i_pred)) 191 | 192 | sum_precision = 0.0 193 | num_hits = 0.0 194 | 195 | for i, p in enumerate(y_i_pred): 196 | if p in y_i_true: 197 | num_hits += 1 198 | precision = num_hits / (i + 1) 199 | sum_precision += precision 200 | 201 | return sum_precision / min(len(y_i_true), K) 202 | 203 | 204 | # Function for calculating MAP@K 205 | def mapk(y_true, y_pred): 206 | return np.mean([apk(y_i_true, y_i_pred) for y_i_true, y_i_pred in zip(y_true, y_pred)]) 207 | 208 | 209 | # Calculate MAP@K 210 | print(mapk(y_true, y_pred)) 211 | # 0.65 212 | 213 | # Even if the number of true values is the same, if the order is different then the score will be different 214 | print(apk(y_true[0], y_pred[0])) 215 | print(apk(y_true[1], y_pred[1])) 216 | # 1.0, 0.5833 217 | -------------------------------------------------------------------------------- /ch03/ch03-01-numerical.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 
3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y contains the target values, test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train_preprocessed.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv') 14 | 15 | # Save training and test datasets in their original form for explanations 16 | train_x_saved = train_x.copy() 17 | test_x_saved = test_x.copy() 18 | 19 | 20 | # Function to recover original training and test datasets 21 | def load_data(): 22 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 23 | return train_x, test_x 24 | 25 | 26 | # Store names of numerical variables to be converted in list 27 | num_cols = ['age', 'height', 'weight', 'amount', 28 | 'medical_info_a1', 'medical_info_a2', 'medical_info_a3', 'medical_info_b1'] 29 | 30 | # ----------------------------------- 31 | # Standardization 32 | # ----------------------------------- 33 | # Load the data 34 | train_x, test_x = load_data() 35 | # ----------------------------------- 36 | from sklearn.preprocessing import StandardScaler 37 | 38 | # Compute standardization parameters for multiple columns of the training data 39 | scaler = StandardScaler() 40 | scaler.fit(train_x[num_cols]) 41 | 42 | # Replace columns with standardized values 43 | train_x[num_cols] = scaler.transform(train_x[num_cols]) 44 | test_x[num_cols] = scaler.transform(test_x[num_cols]) 45 | 46 | # ----------------------------------- 47 | # Load the data 48 | train_x, test_x = load_data() 49 | # ----------------------------------- 50 | from sklearn.preprocessing import StandardScaler 51 | 52 | # Compute standardization parameters for multiple columns from combined training and test data 53 | scaler = StandardScaler() 54 | scaler.fit(pd.concat([train_x[num_cols], test_x[num_cols]])) 55 | 56 | # Replace columns with standardized values 57 | train_x[num_cols] = scaler.transform(train_x[num_cols]) 58 | test_x[num_cols] = scaler.transform(test_x[num_cols]) 59 | 60 | # ----------------------------------- 61 | # Load the data 62 | train_x, test_x = load_data() 63 | # ----------------------------------- 64 | from sklearn.preprocessing import StandardScaler 65 | 66 | # Standardize training and test data separately (bad example) 67 | scaler_train = StandardScaler() 68 | scaler_train.fit(train_x[num_cols]) 69 | train_x[num_cols] = scaler_train.transform(train_x[num_cols]) 70 | scaler_test = StandardScaler() 71 | scaler_test.fit(test_x[num_cols]) 72 | test_x[num_cols] = scaler_test.transform(test_x[num_cols]) 73 | 74 | # ----------------------------------- 75 | # Min-Max scaling 76 | # ----------------------------------- 77 | # Load the data 78 | train_x, test_x = load_data() 79 | # ----------------------------------- 80 | from sklearn.preprocessing import MinMaxScaler 81 | 82 | # Compute parameters for min-max scaling for multiple columns of the training data 83 | scaler = MinMaxScaler() 84 | scaler.fit(train_x[num_cols]) 85 | 86 | # Replace columns with min-max scaled values 87 | train_x[num_cols] = scaler.transform(train_x[num_cols]) 88 | test_x[num_cols] = scaler.transform(test_x[num_cols]) 89 | 90 | # ----------------------------------- 91 | # Logarithmic transformation 92 | # ----------------------------------- 93 | x = np.array([1.0, 10.0, 100.0, 1000.0, 10000.0]) 94 | 95 | # 
Take simple logarithm 96 | x1 = np.log(x) 97 | 98 | # Take logarithm of x+1 99 | x2 = np.log1p(x) 100 | 101 | # Apply original sign to logarithm taken of absolute value 102 | x3 = np.sign(x) * np.log(np.abs(x)) 103 | 104 | # ----------------------------------- 105 | # Box-Cox transformation 106 | # ----------------------------------- 107 | # Load the data 108 | train_x, test_x = load_data() 109 | # ----------------------------------- 110 | 111 | # Store only columns that take positive values in a list for transformation 112 | # Note: when including missing values it is necessary to use (~(train_x[c] <= 0.0)).all() etc. 113 | pos_cols = [c for c in num_cols if (train_x[c] > 0.0).all() and (test_x[c] > 0.0).all()] 114 | 115 | from sklearn.preprocessing import PowerTransformer 116 | 117 | # Fit Box-Cox transformation to the columns with positive values in the training data 118 | pt = PowerTransformer(method='box-cox') 119 | pt.fit(train_x[pos_cols]) 120 | 121 | # Replace columns with transformed data 122 | train_x[pos_cols] = pt.transform(train_x[pos_cols]) 123 | test_x[pos_cols] = pt.transform(test_x[pos_cols]) 124 | 125 | # ----------------------------------- 126 | # Yeo-Johnson transformation 127 | # ----------------------------------- 128 | # Load the data 129 | train_x, test_x = load_data() 130 | # ----------------------------------- 131 | 132 | from sklearn.preprocessing import PowerTransformer 133 | 134 | # Compute parameters for Yeo-Johnnson transformation for multiple columns of the training data 135 | pt = PowerTransformer(method='yeo-johnson') 136 | pt.fit(train_x[num_cols]) 137 | 138 | # Replace columns with transformed data 139 | train_x[num_cols] = pt.transform(train_x[num_cols]) 140 | test_x[num_cols] = pt.transform(test_x[num_cols]) 141 | 142 | # ----------------------------------- 143 | # Clipping 144 | # ----------------------------------- 145 | # Load the data 146 | train_x, test_x = load_data() 147 | # ----------------------------------- 148 | # Calculate 1% and 99% limits of each column of the training data 149 | p01 = train_x[num_cols].quantile(0.01) 150 | p99 = train_x[num_cols].quantile(0.99) 151 | 152 | # Clip out values in the 1st and 99th percentiles 153 | train_x[num_cols] = train_x[num_cols].clip(p01, p99, axis=1) 154 | test_x[num_cols] = test_x[num_cols].clip(p01, p99, axis=1) 155 | 156 | # ----------------------------------- 157 | # Binning 158 | # ----------------------------------- 159 | x = [1, 7, 5, 4, 6, 3] 160 | 161 | # Use cut() function in pandas for binning 162 | 163 | # Case where you specify the number of bins 164 | binned = pd.cut(x, 3, labels=False) 165 | print(binned) 166 | # [0 2 1 1 2 0] - shows which of the three bins the converted values are in 167 | 168 | # Case where you specify the bin ranges (<3.0, 3.0->5.0, >5.0) 169 | bin_edges = [-float('inf'), 3.0, 5.0, float('inf')] 170 | binned = pd.cut(x, bin_edges, labels=False) 171 | print(binned) 172 | # [0 2 1 1 2 0] - shows which of the three bins the converted values are in 173 | 174 | # ----------------------------------- 175 | # Rank transformation 176 | # ----------------------------------- 177 | x = [10, 20, 30, 0, 40, 40] 178 | 179 | # Use rank() function in pandas for rank transformation 180 | rank = pd.Series(x).rank() 181 | print(rank.values) 182 | # First value is 1, mean rank is given for values in equal position 183 | # [2. 3. 4. 1. 
5.5 5.5] 184 | 185 | # Also possible to to apply argsort() function in numpy twice to make rank transformation 186 | order = np.argsort(x) 187 | rank = np.argsort(order) 188 | print(rank) 189 | # First value is zero, equal position values are ordered by whichever is first 190 | # [1 2 3 0 4 5] 191 | 192 | # ----------------------------------- 193 | # RankGauss 194 | # ----------------------------------- 195 | # Load the data 196 | train_x, test_x = load_data() 197 | # ----------------------------------- 198 | from sklearn.preprocessing import QuantileTransformer 199 | 200 | # Compute parameters for Rank-Gauss transformation for multiple columns of the training data 201 | transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal') 202 | transformer.fit(train_x[num_cols]) 203 | 204 | # Replace columns with transformed data 205 | train_x[num_cols] = transformer.transform(train_x[num_cols]) 206 | test_x[num_cols] = transformer.transform(test_x[num_cols]) 207 | -------------------------------------------------------------------------------- /ch04-model-interface/code/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from model import Model 4 | from sklearn.metrics import log_loss 5 | from sklearn.model_selection import StratifiedKFold 6 | from typing import Callable, List, Optional, Tuple, Union 7 | 8 | from util import Logger, Util 9 | 10 | logger = Logger() 11 | 12 | 13 | class Runner: 14 | 15 | def __init__(self, run_name: str, model_cls: Callable[[str, dict], Model], features: List[str], params: dict): 16 | """Constructor 17 | 18 | :param run_name: Run name 19 | :param model_cls: Model class 20 | :param features: List of features 21 | :param params: Hyperparameters 22 | """ 23 | self.run_name = run_name 24 | self.model_cls = model_cls 25 | self.features = features 26 | self.params = params 27 | self.n_fold = 4 28 | 29 | def train_fold(self, i_fold: Union[int, str]) -> Tuple[ 30 | Model, Optional[np.array], Optional[np.array], Optional[float]]: 31 | """Specify cross validation, train then calculate score 32 | 33 | In addition to calling from other methods, this is also used itself for checks and to adjust parameters 34 | 35 | :param i_fold: fold number (when everything use 'all') 36 | :return: Tuple containing (model instance, record index, predictions, validation score) 37 | """ 38 | # Load training data 39 | validation = i_fold != 'all' 40 | train_x = self.load_x_train() 41 | train_y = self.load_y_train() 42 | 43 | if validation: 44 | # Set training and validation data 45 | tr_idx, va_idx = self.load_index_fold(i_fold) 46 | tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx] 47 | va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx] 48 | 49 | # Train model 50 | model = self.build_model(i_fold) 51 | model.train(tr_x, tr_y, va_x, va_y) 52 | 53 | # Make predictions using validation data and calculate score 54 | va_pred = model.predict(va_x) 55 | score = log_loss(va_y, va_pred, eps=1e-15, normalize=True) 56 | 57 | # Return model, index, predictions and score 58 | return model, va_idx, va_pred, score 59 | else: 60 | # Train using all training data 61 | model = self.build_model(i_fold) 62 | model.train(train_x, train_y) 63 | 64 | # Return model 65 | return model, None, None, None 66 | 67 | def run_train_cv(self) -> None: 68 | """Training and evaluation using cross validation 69 | 70 | Train, score, save each fold model, output score to log 71 | """ 72 | 
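# Illustrative usage sketch (ModelXGB, features and params are placeholders for a
# Model subclass, a feature-name list and a hyperparameter dict defined elsewhere):
#     runner = Runner('xgb1', ModelXGB, features, params)
#     runner.run_train_cv()    # trains each fold, saves the fold models and the
#                              # out-of-fold predictions (../model/pred/xgb1-train.pkl)
#     runner.run_predict_cv()  # averages the fold models' predictions for the
#                              # test data (../model/pred/xgb1-test.pkl)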
logger.info(f'{self.run_name} - start training cv') 73 | 74 | scores = [] 75 | va_idxes = [] 76 | preds = [] 77 | 78 | # Train on each fold 79 | for i_fold in range(self.n_fold): 80 | # Train 81 | logger.info(f'{self.run_name} fold {i_fold} - start training') 82 | model, va_idx, va_pred, score = self.train_fold(i_fold) 83 | logger.info(f'{self.run_name} fold {i_fold} - end training - score {score}') 84 | 85 | # Save model 86 | model.save_model() 87 | 88 | # Retain results 89 | va_idxes.append(va_idx) 90 | scores.append(score) 91 | preds.append(va_pred) 92 | 93 | # Gather results for all folds 94 | va_idxes = np.concatenate(va_idxes) 95 | order = np.argsort(va_idxes) 96 | preds = np.concatenate(preds, axis=0) 97 | preds = preds[order] 98 | 99 | logger.info(f'{self.run_name} - end training cv - score {np.mean(scores)}') 100 | 101 | # Save predictions 102 | Util.dump(preds, f'../model/pred/{self.run_name}-train.pkl') 103 | 104 | # Save scores 105 | logger.result_scores(self.run_name, scores) 106 | 107 | def run_predict_cv(self) -> None: 108 | """Take average of results from models trained on each fold and make predictions for test data 109 | 110 | Necessary to run_train_cv beforehand 111 | """ 112 | logger.info(f'{self.run_name} - start prediction cv') 113 | 114 | test_x = self.load_x_test() 115 | 116 | preds = [] 117 | 118 | # Train on each fold 119 | for i_fold in range(self.n_fold): 120 | logger.info(f'{self.run_name} - start prediction fold:{i_fold}') 121 | model = self.build_model(i_fold) 122 | model.load_model() 123 | pred = model.predict(test_x) 124 | preds.append(pred) 125 | logger.info(f'{self.run_name} - end prediction fold:{i_fold}') 126 | 127 | # Output mean value of predictions 128 | pred_avg = np.mean(preds, axis=0) 129 | 130 | # Save predictions 131 | Util.dump(pred_avg, f'../model/pred/{self.run_name}-test.pkl') 132 | 133 | logger.info(f'{self.run_name} - end prediction cv') 134 | 135 | def run_train_all(self) -> None: 136 | """Train using all training data and save model""" 137 | logger.info(f'{self.run_name} - start training all') 138 | 139 | # Train on all training data 140 | i_fold = 'all' 141 | model, _, _, _ = self.train_fold(i_fold) 142 | model.save_model() 143 | 144 | logger.info(f'{self.run_name} - end training all') 145 | 146 | def run_predict_all(self) -> None: 147 | """Make predictions using model trained with all training data 148 | 149 | Necessary to run_train_all beforehand 150 | """ 151 | logger.info(f'{self.run_name} - start prediction all') 152 | 153 | test_x = self.load_x_test() 154 | 155 | # Make predictions using model trained on all training data 156 | i_fold = 'all' 157 | model = self.build_model(i_fold) 158 | model.load_model() 159 | pred = model.predict(test_x) 160 | 161 | # Save predictions 162 | Util.dump(pred, f'../model/pred/{self.run_name}-test.pkl') 163 | 164 | logger.info(f'{self.run_name} - end prediction all') 165 | 166 | def build_model(self, i_fold: Union[int, str]) -> Model: 167 | """Specify cross validation fold and create model 168 | 169 | :param i_fold: fold number 170 | :return: model instance 171 | """ 172 | # Create model from run name, fold and model class 173 | run_fold_name = f'{self.run_name}-{i_fold}' 174 | return self.model_cls(run_fold_name, self.params) 175 | 176 | def load_x_train(self) -> pd.DataFrame: 177 | """Load features of training data 178 | 179 | :return: Training data features 180 | """ 181 | # Load training data 182 | # Note you must modify this method if you want to do anything more than just extraction by column 
name 183 | # As it is inefficient to load train.csv every time, use this method appropriately for the data (same applies for other methods also) 184 | return pd.read_csv('../input/train.csv')[self.features] 185 | 186 | def load_y_train(self) -> pd.Series: 187 | """Load target values of training data 188 | 189 | :return: Training data target values 190 | """ 191 | # Load target values 192 | train_y = pd.read_csv('../input/train.csv')['target'] 193 | train_y = np.array([int(st[-1]) for st in train_y]) - 1 194 | train_y = pd.Series(train_y) 195 | return train_y 196 | 197 | def load_x_test(self) -> pd.DataFrame: 198 | """Load features of test data 199 | 200 | :return: Test data features 201 | """ 202 | return pd.read_csv('../input/test.csv')[self.features] 203 | 204 | def load_index_fold(self, i_fold: int) -> np.array: 205 | """Specify cross validation fold and return corresponding record index 206 | 207 | :param i_fold: Fold number 208 | :return: Record index of corresponding fold 209 | """ 210 | # Return index that separates training and validation data 211 | # Here a random number is created every time, so there is also a method to save it to a file 212 | train_y = self.load_y_train() 213 | dummy_x = np.zeros(len(train_y)) 214 | skf = StratifiedKFold(n_splits=self.n_fold, shuffle=True, random_state=71) 215 | return list(skf.split(dummy_x, train_y))[i_fold] 216 | -------------------------------------------------------------------------------- /ch01/ch01-01-titanic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # ----------------------------------- 5 | # Load the training and test data 6 | # ----------------------------------- 7 | # Load the training and test data 8 | train = pd.read_csv('../input/ch01-titanic/train.csv') 9 | test = pd.read_csv('../input/ch01-titanic/test.csv') 10 | 11 | # Split the training data into features and target values 12 | train_x = train.drop(['Survived'], axis=1) 13 | train_y = train['Survived'] 14 | 15 | # The test data only contains features, so is ok as is 16 | test_x = test.copy() 17 | 18 | # ----------------------------------- 19 | # Feature engineering 20 | # ----------------------------------- 21 | from sklearn.preprocessing import LabelEncoder 22 | 23 | # Drop the PassengerID variables 24 | train_x = train_x.drop(['PassengerId'], axis=1) 25 | test_x = test_x.drop(['PassengerId'], axis=1) 26 | 27 | # Drop the Name, Ticket & Cabin variables 28 | train_x = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1) 29 | test_x = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1) 30 | 31 | # Apply label encoding to categorical variables 32 | for c in ['Sex', 'Embarked']: 33 | # Fit the labels using the training data 34 | le = LabelEncoder() 35 | le.fit(train_x[c].fillna('NA')) 36 | 37 | # Return the encoded labels for the training and test data 38 | train_x[c] = le.transform(train_x[c].fillna('NA')) 39 | test_x[c] = le.transform(test_x[c].fillna('NA')) 40 | 41 | # ----------------------------------- 42 | # Model creation 43 | # ----------------------------------- 44 | from xgboost import XGBClassifier 45 | 46 | # Create the model and fit it using the training data 47 | model = XGBClassifier(n_estimators=20, random_state=71) 48 | model.fit(train_x, train_y) 49 | 50 | # Output predicted probabilities for the test data 51 | pred = model.predict_proba(test_x)[:, 1] 52 | 53 | # Convert into binary predictions 54 | pred_label = np.where(pred > 0.5, 1, 0) 55 | 56 | # Create a submission 
file 57 | submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_label}) 58 | submission.to_csv('submission_first.csv', index=False) 59 | # Score: 0.7799 (it is possible that this value differs from the one in the book) 60 | 61 | # ----------------------------------- 62 | # Validation 63 | # ----------------------------------- 64 | from sklearn.metrics import log_loss, accuracy_score 65 | from sklearn.model_selection import KFold 66 | 67 | # Create lists to store the scores for each fold 68 | scores_accuracy = [] 69 | scores_logloss = [] 70 | 71 | # Perform cross validation 72 | # Split the training data into 4, use 1 part for validation, then use the next part for validation, and so on... 73 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 74 | for tr_idx, va_idx in kf.split(train_x): 75 | # Split the training data into training and validation data 76 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 77 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 78 | 79 | # Train the model 80 | model = XGBClassifier(n_estimators=20, random_state=71) 81 | model.fit(tr_x, tr_y) 82 | 83 | # Output predicted probabilities for the validation data 84 | va_pred = model.predict_proba(va_x)[:, 1] 85 | 86 | # Calculate scores for the validation data 87 | logloss = log_loss(va_y, va_pred) 88 | accuracy = accuracy_score(va_y, va_pred > 0.5) 89 | 90 | # Store the scores for this fold 91 | scores_logloss.append(logloss) 92 | scores_accuracy.append(accuracy) 93 | 94 | # Calculate the mean scores using all folds 95 | logloss = np.mean(scores_logloss) 96 | accuracy = np.mean(scores_accuracy) 97 | print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}') 98 | # logloss: 0.4270, accuracy: 0.8148 (it is possible that these values differ from the book) 99 | 100 | # ----------------------------------- 101 | # Model tuning 102 | # ----------------------------------- 103 | import itertools 104 | 105 | # Prepare candidate tuning parameters 106 | param_space = { 107 | 'max_depth': [3, 5, 7], 108 | 'min_child_weight': [1.0, 2.0, 4.0] 109 | } 110 | 111 | # Hyperparamter combinations to try 112 | param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight']) 113 | 114 | # Create lists to store scores for the different hyperparameter combinations 115 | params = [] 116 | scores = [] 117 | 118 | # Perform cross validation for each hyperparameter combination 119 | for max_depth, min_child_weight in param_combinations: 120 | 121 | score_folds = [] 122 | # Perform cross validation 123 | # Split the training data into 4, use 1 part for validation, then use the next part for validation, and so on... 
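# With the candidate values above, this grid search evaluates 3 x 3 = 9
# (max_depth, min_child_weight) combinations, each scored with 4-fold cross
# validation, i.e. 36 model fits in total; the combination with the lowest
# mean logloss is selected further below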
124 | kf = KFold(n_splits=4, shuffle=True, random_state=123456) 125 | for tr_idx, va_idx in kf.split(train_x): 126 | # Split the training data into training and validation data 127 | tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx] 128 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 129 | 130 | # Train the model 131 | model = XGBClassifier(n_estimators=20, random_state=71, 132 | max_depth=max_depth, min_child_weight=min_child_weight) 133 | model.fit(tr_x, tr_y) 134 | 135 | # Output predicted probabilities for the validation data 136 | va_pred = model.predict_proba(va_x)[:, 1] 137 | logloss = log_loss(va_y, va_pred) 138 | score_folds.append(logloss) 139 | 140 | # Calculate the mean score using all folds 141 | score_mean = np.mean(score_folds) 142 | 143 | # Store the scores for this hyperparameter combination 144 | params.append((max_depth, min_child_weight)) 145 | scores.append(score_mean) 146 | 147 | # Set the parameters to the best values giving the highest score 148 | best_idx = np.argsort(scores)[0] 149 | best_param = params[best_idx] 150 | print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}') 151 | # Best score is with max_depth=7, min_child_weight=2.0 152 | 153 | 154 | # ----------------------------------- 155 | # Create features for logistic regression 156 | # ----------------------------------- 157 | from sklearn.preprocessing import OneHotEncoder 158 | 159 | # Copy the original datasets 160 | train_x2 = train.drop(['Survived'], axis=1) 161 | test_x2 = test.copy() 162 | 163 | # Drop the PassengerID variables 164 | train_x2 = train_x2.drop(['PassengerId'], axis=1) 165 | test_x2 = test_x2.drop(['PassengerId'], axis=1) 166 | 167 | # Drop the Name, Ticket & Cabin variables 168 | train_x2 = train_x2.drop(['Name', 'Ticket', 'Cabin'], axis=1) 169 | test_x2 = test_x2.drop(['Name', 'Ticket', 'Cabin'], axis=1) 170 | 171 | # Perform one-hot encoding 172 | cat_cols = ['Sex', 'Embarked', 'Pclass'] 173 | ohe = OneHotEncoder(categories='auto', sparse=False) 174 | ohe.fit(train_x2[cat_cols].fillna('NA')) 175 | 176 | # Create column names for dummy one-hot encoding variables 177 | ohe_columns = [] 178 | for i, c in enumerate(cat_cols): 179 | ohe_columns += [f'{c}_{v}' for v in ohe.categories_[i]] 180 | 181 | # Create DataFrames for one-hot encoding 182 | ohe_train_x2 = pd.DataFrame(ohe.transform(train_x2[cat_cols].fillna('NA')), columns=ohe_columns) 183 | ohe_test_x2 = pd.DataFrame(ohe.transform(test_x2[cat_cols].fillna('NA')), columns=ohe_columns) 184 | 185 | # Drop the original columns that were used for one-hot encoding 186 | train_x2 = train_x2.drop(cat_cols, axis=1) 187 | test_x2 = test_x2.drop(cat_cols, axis=1) 188 | 189 | # Append the one-hot encoded columns 190 | train_x2 = pd.concat([train_x2, ohe_train_x2], axis=1) 191 | test_x2 = pd.concat([test_x2, ohe_test_x2], axis=1) 192 | 193 | # Replace missing values in these columns with the means of the values that exist 194 | num_cols = ['Age', 'SibSp', 'Parch', 'Fare'] 195 | for col in num_cols: 196 | train_x2[col].fillna(train_x2[col].mean(), inplace=True) 197 | test_x2[col].fillna(train_x2[col].mean(), inplace=True) 198 | 199 | # Make a logarithmic transformation of the Fare variables 200 | train_x2['Fare'] = np.log1p(train_x2['Fare']) 201 | test_x2['Fare'] = np.log1p(test_x2['Fare']) 202 | 203 | # ----------------------------------- 204 | # Ensembling 205 | # ----------------------------------- 206 | from sklearn.linear_model import LogisticRegression 207 | 208 | # xgboost model 209 | model_xgb = 
XGBClassifier(n_estimators=20, random_state=71) 210 | model_xgb.fit(train_x, train_y) 211 | pred_xgb = model_xgb.predict_proba(test_x)[:, 1] 212 | 213 | # Logistic regression model 214 | # As the xgboost model uses differently engineered features, train_x2, test_x2 were created separately 215 | model_lr = LogisticRegression(solver='lbfgs', max_iter=300) 216 | model_lr.fit(train_x2, train_y) 217 | pred_lr = model_lr.predict_proba(test_x2)[:, 1] 218 | 219 | # Take a weighted average of the predictions 220 | pred = pred_xgb * 0.8 + pred_lr * 0.2 221 | pred_label = np.where(pred > 0.5, 1, 0) 222 | -------------------------------------------------------------------------------- /input/sample-data/input_create.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.tseries.offsets as offsets 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--test', action='store_true') 8 | args = parser.parse_args() 9 | 10 | 11 | class Util: 12 | 13 | @classmethod 14 | def iif(cls, cond, iftrue, ifelse): 15 | if cond: 16 | return iftrue 17 | else: 18 | return ifelse 19 | 20 | 21 | class Generator: 22 | 23 | def __init__(self): 24 | pass 25 | 26 | def choice_prob(self, prob, iftrue, ifelse): 27 | if self.r.rand() < prob: 28 | return iftrue 29 | else: 30 | return ifelse 31 | 32 | def run_all(self, seed=71, n=100): 33 | self.r = np.random.RandomState(seed) 34 | ds = [] 35 | for i in range(n): 36 | print(i) 37 | data, scores, target = self.run() 38 | ds.append(data + scores + target) 39 | 40 | columns = self.column_names() 41 | df = pd.DataFrame(ds, columns=columns) 42 | return df 43 | 44 | def run(self): 45 | 46 | # Information on insured person 47 | # Set score according to age, weight and BMI 48 | insured_age = self.r.choice(range(5, 80)) 49 | insured_sex = self.r.choice(['Male', 'Female'], p=[0.6, 0.4]) 50 | height = 160.0 + Util.iif(insured_sex == 'Male', 10.0, 0.0) + self.r.randn() * 8 51 | bmi = 22.0 + self.r.randn() * 3.0 52 | weight = height * height * bmi / 10000.0 53 | 54 | if insured_age <= 15: 55 | score_01 = 0.1 56 | elif 60 <= insured_age < 70: 57 | score_01 = 0.2 58 | elif 70 <= insured_age: 59 | score_01 = 0.3 60 | else: 61 | score_01 = 0.0 62 | 63 | if weight >= 85.0: 64 | score_02 = 0.2 65 | else: 66 | score_02 = 0.0 67 | 68 | if bmi <= 19.0: 69 | score_03 = (19.0 - bmi) * 0.15 70 | elif bmi >= 25.0: 71 | score_03 = (bmi - 25.0) * 0.15 72 | else: 73 | score_03 = 0.0 74 | 75 | # Product information 76 | # A1-A3, B1-B3, C1-C3, D1, E1 77 | # Set scores for types D, E and 2 78 | 79 | product = self.r.choice(list('ABCDE'), p=[0.5, 0.1, 0.25, 0.1, 0.05]) 80 | is_prodtype_1 = product in list('ABD') 81 | is_prodtype_2 = product in list('CE') 82 | is_prodtype_a = product in list('ABC') 83 | is_prodtype_b = product in list('DE') 84 | 85 | if is_prodtype_a: 86 | product_sub = self.r.choice([1, 2, 3], p=[0.4, 0.2, 0.4]) 87 | else: 88 | product_sub = 1 89 | product = '{}{}'.format(product, product_sub) 90 | 91 | if is_prodtype_b: 92 | score_21 = 0.2 93 | else: 94 | score_21 = 0.0 95 | if product_sub == 2: 96 | score_22 = 0.1 97 | else: 98 | score_22 = 0.0 99 | 100 | # Insurance premiums - premiums vary according to product type 101 | # Set score for large premiums 102 | amount_raw = self.r.choice(range(1, 11)) 103 | if is_prodtype_1: 104 | amount = amount_raw * 1000 * 1000 105 | else: 106 | amount = amount_raw * 1000 107 | 108 | if amount_raw > 5: 109 | score_23 = 0.1 110 | else: 
111 | score_23 = 0.0 112 | 113 | # No score for application date 114 | date_start = pd.to_datetime('2015/1/1') 115 | date_end = pd.to_datetime('2016/12/31') 116 | days = (date_end - date_start).days 117 | 118 | app_date = date_start + offsets.Day(self.r.choice(range(days))) 119 | app_year, app_month, app_day = app_date.year, app_date.month, app_date.day 120 | application_date = '{}/{}/{}'.format(app_year, app_month, app_day) 121 | 122 | # Medical information 123 | # Set scores based on a1, difference between a1 and a2, and when a3 is greater than or equal to 5 124 | # No score for b 125 | medical_info_a1 = int(250.0 + 100.0 * self.r.rand() + 100.0 * self.r.randn()) 126 | medical_info_a2 = int(200.0 + 100.0 * self.r.rand() + 100.0 * self.r.randn()) 127 | medical_info_a3 = self.r.poisson(lam=2) 128 | 129 | medical_info_b1 = int(10.0 + 10.0 * self.r.rand()) 130 | medical_info_b2 = self.r.choice([1,2,3,9], p=[0.5, 0.25, 0.2, 0.05]) 131 | medical_info_b3 = self.r.choice(list('ABCDEFGH') + list('abcde') + ['1', '2', '3', '4']) 132 | 133 | medical_info_c1 = self.r.choice([np.nan, self.r.poisson(lam=1.5)], p=[0.3, 0.7]) 134 | medical_info_c2 = self.r.choice([np.nan, self.r.uniform(8.0, 22.0)], p=[0.8, 0.2]) 135 | 136 | if medical_info_a1 > 350: 137 | score_41 = 0.2 138 | else: 139 | score_41 = 0.0 140 | 141 | medical_info_a_diff = medical_info_a1 - medical_info_a2 142 | score_42 = max(np.abs(medical_info_a_diff) - 100.0, 0.0) / 800.0 143 | 144 | if medical_info_a3 >= 5: 145 | score_43 = 0.2 146 | else: 147 | score_43 = 0.0 148 | 149 | if np.isnan(medical_info_c1): 150 | score_44 = -0.1 151 | elif medical_info_c1 >= 3: 152 | score_44 = 0.1 153 | else: 154 | score_44 = 0.0 155 | 156 | # Medical information binary variable 157 | # 1-5 are related with a score 158 | # 6-7 are related with a score for women only 159 | # 8-10 have no relation with a score 160 | medical_keyword_probs = np.array([ 161 | 0.8, 0.5, 0.2, 0.05, 0.02, 0.4, 0.1, 0.8, 0.3, 0.05, 162 | ]) 163 | medical_keywords = [] 164 | for prob in medical_keyword_probs: 165 | medical_keywords.append(self.r.choice([0, 1], p=[1 - prob, prob])) 166 | 167 | mkeys = medical_keywords[:5] 168 | mprobs = medical_keyword_probs[:5] 169 | mkeys_sum = np.array(mkeys).sum() 170 | mkeys_score = 1.0 / mprobs * 0.01 171 | 172 | score_51 = np.sum(np.array(mkeys) * mkeys_score) 173 | if mkeys_sum >= 4: 174 | score_52 = 0.5 175 | elif mkeys_sum >= 3: 176 | score_52 = 0.3 177 | elif mkeys_sum >= 2: 178 | score_52 = 0.1 179 | else: 180 | score_52 = 0.0 181 | 182 | score_53 = 0.0 183 | if insured_sex == 'Female': 184 | if medical_keywords[5] == 1 or medical_keywords[6] == 1: 185 | score_53 += 0.1 186 | 187 | # noise 188 | score_noise = self.r.uniform(-0.2, 0.2) 189 | 190 | data = [insured_age, insured_sex, height, weight, 191 | product, amount, application_date, 192 | medical_info_a1, medical_info_a2, medical_info_a3, 193 | medical_info_b1, medical_info_b2, medical_info_b3, 194 | medical_info_c1, medical_info_c2] + medical_keywords 195 | 196 | score_elements = [score_01, score_02, score_03, score_21, score_22, score_23, 197 | score_41, score_42, score_43, score_44, score_51, score_52, score_53, score_noise] 198 | score = np.array(score_elements).sum() 199 | scores = [score] + score_elements 200 | 201 | target = [Util.iif(score >= 0.8, 1, 0)] 202 | 203 | return data, scores, target 204 | 205 | def column_names(self): 206 | return self.column_names_data() + self.column_names_scores() + self.column_names_target() 207 | 208 | def column_names_data(self): 209 | data = 
(['age', 'sex', 'height', 'weight', 210 | 'product', 'amount', 'date', 211 | 'medical_info_a1', 'medical_info_a2', 'medical_info_a3', 212 | 'medical_info_b1', 'medical_info_b2', 'medical_info_b3', 213 | 'medical_info_c1', 'medical_info_c2'] 214 | + ['medical_keyword_{}'.format(i) for i in range(1, 11)]) 215 | return data 216 | 217 | def column_names_scores(self): 218 | score_elements = ['score_01', 'score_02', 'score_03', 'score_21', 'score_22', 'score_23', 219 | 'score_41', 'score_42', 'score_43', 'score_44', 220 | 'score_51', 'score_52', 'score_53', 'score_noise'] 221 | scores = ['score'] + score_elements 222 | return scores 223 | 224 | def column_names_target(self): 225 | target = ['target'] 226 | return target 227 | 228 | 229 | if __name__ == '__main__': 230 | gen = Generator() 231 | if args.test: 232 | n_tr = 100 233 | else: 234 | n_tr = 10000 235 | 236 | df = gen.run_all(n=n_tr * 2) 237 | cols_data = gen.column_names_data() 238 | cols_target = gen.column_names_target() 239 | 240 | # df[:n_tr].to_csv('train_debug.csv', index=False, sep='\t') 241 | # df[n_tr:].to_csv('test_debug.csv', index=False, sep='\t') 242 | df[:n_tr][cols_data + cols_target].to_csv('train.csv', index=False, sep=',') 243 | df[n_tr:][cols_data].to_csv('test.csv', index=False, sep=',') 244 | -------------------------------------------------------------------------------- /ch03/ch03-02-categorical.py: -------------------------------------------------------------------------------- 1 | # --------------------------------- 2 | # Prepare the data etc. 3 | # ---------------------------------- 4 | import numpy as np 5 | import pandas as pd 6 | 7 | # train_x is the training data, train_y contains the target values, test_x is the test data 8 | # stored in pandas DataFrames and Series (numpy arrays also used) 9 | 10 | train = pd.read_csv('../input/sample-data/train.csv') 11 | train_x = train.drop(['target'], axis=1) 12 | train_y = train['target'] 13 | test_x = pd.read_csv('../input/sample-data/test.csv') 14 | 15 | # Save training and test datasets in their original form for explanations 16 | train_x_saved = train_x.copy() 17 | test_x_saved = test_x.copy() 18 | 19 | 20 | # Function to recover original training and test datasets 21 | def load_data(): 22 | train_x, test_x = train_x_saved.copy(), test_x_saved.copy() 23 | return train_x, test_x 24 | 25 | 26 | # Store names of categorical variables to be converted in list 27 | cat_cols = ['sex', 'product', 'medical_info_b2', 'medical_info_b3'] 28 | 29 | # ----------------------------------- 30 | # One-hot encoding 31 | # ----------------------------------- 32 | # Load the data 33 | train_x, test_x = load_data() 34 | # ----------------------------------- 35 | 36 | # Concatenate the training and test datasets, and apply one-hot encoding via get_dummies() 37 | all_x = pd.concat([train_x, test_x]) 38 | all_x = pd.get_dummies(all_x, columns=cat_cols) 39 | 40 | # Resplit into training and test data 41 | train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True) 42 | test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True) 43 | 44 | # ----------------------------------- 45 | # Load the data 46 | train_x, test_x = load_data() 47 | # ----------------------------------- 48 | from sklearn.preprocessing import OneHotEncoder 49 | 50 | # Encoding with the OneHotEncoder() function 51 | ohe = OneHotEncoder(sparse=False, categories='auto') 52 | ohe.fit(train_x[cat_cols]) 53 | 54 | # Create column names for dummy variables 55 | columns = [] 56 | for i, c in enumerate(cat_cols): 57 | 
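# ohe.categories_[i] holds the category values learned from the training data for
# cat_cols[i], so the dummy column names built here follow the training data's
# categories; with the default handle_unknown='error', a category appearing only
# in the test data would make ohe.transform() raise an error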
columns += [f'{c}_{v}' for v in ohe.categories_[i]] 58 | 59 | # Put created dummy variables into data frames 60 | dummy_vals_train = pd.DataFrame(ohe.transform(train_x[cat_cols]), columns=columns) 61 | dummy_vals_test = pd.DataFrame(ohe.transform(test_x[cat_cols]), columns=columns) 62 | 63 | # Join the remaining variables 64 | train_x = pd.concat([train_x.drop(cat_cols, axis=1), dummy_vals_train], axis=1) 65 | test_x = pd.concat([test_x.drop(cat_cols, axis=1), dummy_vals_test], axis=1) 66 | 67 | # ----------------------------------- 68 | # Label encoding 69 | # ----------------------------------- 70 | # Load the data 71 | train_x, test_x = load_data() 72 | # ----------------------------------- 73 | from sklearn.preprocessing import LabelEncoder 74 | 75 | # Loop over the categorical variables and apply label encoding 76 | for c in cat_cols: 77 | # Define labels based on the training data 78 | le = LabelEncoder() 79 | le.fit(train_x[c]) 80 | train_x[c] = le.transform(train_x[c]) 81 | test_x[c] = le.transform(test_x[c]) 82 | 83 | # ----------------------------------- 84 | # Feature hashing 85 | # ----------------------------------- 86 | # Load the data 87 | train_x, test_x = load_data() 88 | # ----------------------------------- 89 | from sklearn.feature_extraction import FeatureHasher 90 | 91 | # Loop over the categorical variables and apply feature hashing 92 | for c in cat_cols: 93 | # Using the FeatureHasher() function is slightly different from other encoders 94 | 95 | fh = FeatureHasher(n_features=5, input_type='string') 96 | # Convert the variable to a string and apply the FeatureHasher() function 97 | hash_train = fh.transform(train_x[[c]].astype(str).values) 98 | hash_test = fh.transform(test_x[[c]].astype(str).values) 99 | # Add to a data frame 100 | hash_train = pd.DataFrame(hash_train.todense(), columns=[f'{c}_{i}' for i in range(5)]) 101 | hash_test = pd.DataFrame(hash_test.todense(), columns=[f'{c}_{i}' for i in range(5)]) 102 | # Join with the original data frame 103 | train_x = pd.concat([train_x, hash_train], axis=1) 104 | test_x = pd.concat([test_x, hash_test], axis=1) 105 | 106 | # Drop the original categorical variable columns 107 | train_x.drop(cat_cols, axis=1, inplace=True) 108 | test_x.drop(cat_cols, axis=1, inplace=True) 109 | 110 | # ----------------------------------- 111 | # Frequency encoding 112 | # ----------------------------------- 113 | # Load the data 114 | train_x, test_x = load_data() 115 | # ----------------------------------- 116 | # Loop over the categorical variables and apply frequency encoding 117 | for c in cat_cols: 118 | freq = train_x[c].value_counts() 119 | # Replace each categorical variable with its frequency of occurrence 120 | train_x[c] = train_x[c].map(freq) 121 | test_x[c] = test_x[c].map(freq) 122 | 123 | # ----------------------------------- 124 | # Target encoding 125 | # ----------------------------------- 126 | # Load the data 127 | train_x, test_x = load_data() 128 | # ----------------------------------- 129 | from sklearn.model_selection import KFold 130 | 131 | # Loop over the categorical variables and apply target encoding 132 | for c in cat_cols: 133 | # Calculate the average of the target for each categorical value in the training data 134 | data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y}) 135 | target_mean = data_tmp.groupby(c)['target'].mean() 136 | # Replace the categorical variables in the test data 137 | test_x[c] = test_x[c].map(target_mean) 138 | 139 | # Prepare an array to store the converted training data 
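# The training data is deliberately not encoded with the target_mean computed above,
# because that mean includes each row's own target value and would leak the label
# into the feature; instead each row receives a mean computed from the other three
# folds in the loop below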
140 | tmp = np.repeat(np.nan, train_x.shape[0]) 141 | 142 | # Split the training data 143 | kf = KFold(n_splits=4, shuffle=True, random_state=72) 144 | for idx_1, idx_2 in kf.split(train_x): 145 | # Calculate the average of the target values for the out-of-fold categorical variables 146 | target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean() 147 | # Store the converted values temporarily in an array 148 | tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean) 149 | 150 | # Replace the original data with the converted values 151 | train_x[c] = tmp 152 | 153 | # ----------------------------------- 154 | # Target encoding - for each fold of cross validation 155 | # ----------------------------------- 156 | # Load the data 157 | train_x, test_x = load_data() 158 | # ----------------------------------- 159 | from sklearn.model_selection import KFold 160 | 161 | # Apply target encoding for each cross validation fold 162 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 163 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)): 164 | 165 | # Split the validation data off from the training data 166 | tr_x, va_x = train_x.iloc[tr_idx].copy(), train_x.iloc[va_idx].copy() 167 | tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 168 | 169 | # Loop over the categorical variables and apply target encoding 170 | for c in cat_cols: 171 | # Calculate the average of the target for each categorical value in the training data 172 | data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y}) 173 | target_mean = data_tmp.groupby(c)['target'].mean() 174 | # Replace the categorical variables in the validation data 175 | va_x.loc[:, c] = va_x[c].map(target_mean) 176 | 177 | # Prepare an array to store the converted training data 178 | tmp = np.repeat(np.nan, tr_x.shape[0]) 179 | kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72) 180 | for idx_1, idx_2 in kf_encoding.split(tr_x): 181 | # Calculate the average of the target values for the out-of-fold categorical variables 182 | target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean() 183 | # Store the converted values temporarily in an array 184 | tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean) 185 | 186 | tr_x.loc[:, c] = tmp 187 | 188 | # Remember to save the encoded features so you can come back and read the data later if necessary 189 | 190 | # ----------------------------------- 191 | # Target encoding - when the cross validation and target encoded folds need to be partitioned 192 | # ----------------------------------- 193 | # Load the data 194 | train_x, test_x = load_data() 195 | # ----------------------------------- 196 | from sklearn.model_selection import KFold 197 | 198 | # Define the cross validation folds 199 | kf = KFold(n_splits=4, shuffle=True, random_state=71) 200 | 201 | # Loop over the categorical variables and apply target encoding 202 | for c in cat_cols: 203 | 204 | # Add the target values 205 | data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y}) 206 | # Store the converted values temporarily in an array 207 | tmp = np.repeat(np.nan, train_x.shape[0]) 208 | 209 | # Split off the cross validation 210 | for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)): 211 | # Calculate the average of the target values for each category for the training data 212 | target_mean = data_tmp.iloc[tr_idx].groupby(c)['target'].mean() 213 | # For the validation data, store the converted values temporarily in an array 214 | tmp[va_idx] = train_x[c].iloc[va_idx].map(target_mean) 215 | 216 | # Replace the original data 
with the converted values 217 | train_x[c] = tmp 218 | -------------------------------------------------------------------------------- /input/ch03/multi_table_log.csv: -------------------------------------------------------------------------------- 1 | user_id,date,event,product_id 2 | 40,2018-01-01,login, 3 | 40,2018-01-01,view,P5 4 | 9,2018-01-03,login, 5 | 9,2018-01-03,view,P3 6 | 9,2018-01-03,view,P5 7 | 9,2018-01-03,view,P11 8 | 9,2018-01-03,view,P20 9 | 11,2018-01-03,login, 10 | 11,2018-01-03,view,P2 11 | 11,2018-01-03,view,P4 12 | 11,2018-01-03,view,P9 13 | 11,2018-01-03,view,P10 14 | 11,2018-01-03,view,P15 15 | 11,2018-01-03,view,P19 16 | 33,2018-01-04,login, 17 | 33,2018-01-04,view,P5 18 | 33,2018-01-04,view,P6 19 | 33,2018-01-04,view,P8 20 | 49,2018-01-04,login, 21 | 49,2018-01-04,view,P3 22 | 49,2018-01-04,view,P5 23 | 49,2018-01-04,view,P19 24 | 25,2018-01-06,login, 25 | 25,2018-01-06,view,P1 26 | 25,2018-01-06,view,P4 27 | 25,2018-01-06,view,P9 28 | 25,2018-01-06,view,P14 29 | 25,2018-01-06,view,P15 30 | 25,2018-01-06,view,P16 31 | 55,2018-01-06,login, 32 | 55,2018-01-06,view,P11 33 | 55,2018-01-06,view,P12 34 | 55,2018-01-06,view,P13 35 | 55,2018-01-06,view,P20 36 | 70,2018-01-07,login, 37 | 70,2018-01-07,view,P7 38 | 70,2018-01-07,view,P20 39 | 57,2018-01-08,login, 40 | 57,2018-01-08,view,P8 41 | 57,2018-01-08,view,P16 42 | 27,2018-01-09,login, 43 | 27,2018-01-09,view,P14 44 | 56,2018-01-09,login, 45 | 56,2018-01-09,view,P3 46 | 70,2018-01-10,login, 47 | 70,2018-01-10,view,P3 48 | 33,2018-01-13,login, 49 | 33,2018-01-13,view,P8 50 | 79,2018-01-13,login, 51 | 79,2018-01-13,view,P9 52 | 1,2018-01-14,login, 53 | 1,2018-01-14,view,P17 54 | 1,2018-01-14,view,P18 55 | 31,2018-01-14,login, 56 | 31,2018-01-14,view,P2 57 | 31,2018-01-14,view,P4 58 | 31,2018-01-14,view,P6 59 | 31,2018-01-14,view,P7 60 | 31,2018-01-14,view,P12 61 | 31,2018-01-14,view,P14 62 | 31,2018-01-14,view,P16 63 | 61,2018-01-14,login, 64 | 61,2018-01-14,view,P11 65 | 61,2018-01-14,view,P14 66 | 89,2018-01-16,login, 67 | 89,2018-01-16,view,P3 68 | 89,2018-01-16,view,P8 69 | 89,2018-01-16,view,P11 70 | 89,2018-01-16,view,P14 71 | 23,2018-01-17,login, 72 | 23,2018-01-17,view,P3 73 | 23,2018-01-17,view,P8 74 | 23,2018-01-17,view,P11 75 | 23,2018-01-17,view,P13 76 | 44,2018-01-17,login, 77 | 44,2018-01-17,view,P2 78 | 44,2018-01-17,view,P6 79 | 44,2018-01-17,view,P11 80 | 44,2018-01-17,view,P19 81 | 99,2018-01-17,login, 82 | 99,2018-01-17,view,P5 83 | 99,2018-01-17,view,P9 84 | 99,2018-01-17,view,P17 85 | 24,2018-01-19,login, 86 | 24,2018-01-19,view,P2 87 | 24,2018-01-19,view,P3 88 | 24,2018-01-19,view,P4 89 | 24,2018-01-19,view,P9 90 | 24,2018-01-19,view,P12 91 | 24,2018-01-19,view,P14 92 | 60,2018-01-20,login, 93 | 60,2018-01-20,view,P9 94 | 60,2018-01-20,view,P14 95 | 60,2018-01-20,view,P17 96 | 60,2018-01-20,view,P19 97 | 4,2018-01-21,login, 98 | 4,2018-01-21,view,P4 99 | 4,2018-01-21,view,P19 100 | 40,2018-01-21,login, 101 | 40,2018-01-21,view,P2 102 | 40,2018-01-21,view,P10 103 | 40,2018-01-21,view,P13 104 | 40,2018-01-21,view,P15 105 | 40,2018-01-21,view,P18 106 | 34,2018-01-22,login, 107 | 34,2018-01-22,view,P7 108 | 34,2018-01-22,view,P17 109 | 34,2018-01-22,view,P20 110 | 42,2018-01-22,login, 111 | 42,2018-01-22,view,P2 112 | 42,2018-01-22,view,P4 113 | 42,2018-01-22,view,P5 114 | 42,2018-01-22,view,P7 115 | 23,2018-01-23,login, 116 | 23,2018-01-23,view,P1 117 | 23,2018-01-23,view,P11 118 | 23,2018-01-23,view,P12 119 | 23,2018-01-23,view,P13 120 | 17,2018-01-26,login, 121 | 17,2018-01-26,view,P2 122 | 
17,2018-01-26,view,P3 123 | 17,2018-01-26,view,P11 124 | 64,2018-01-26,login, 125 | 64,2018-01-26,view,P17 126 | 64,2018-01-26,view,P18 127 | 5,2018-01-27,login, 128 | 5,2018-01-27,view,P9 129 | 5,2018-01-27,view,P11 130 | 5,2018-01-27,view,P13 131 | 5,2018-01-27,view,P19 132 | 60,2018-01-28,login, 133 | 60,2018-01-28,view,P2 134 | 60,2018-01-28,view,P8 135 | 60,2018-01-28,view,P14 136 | 60,2018-01-28,view,P16 137 | 60,2018-01-28,view,P17 138 | 63,2018-02-01,login, 139 | 63,2018-02-01,view,P2 140 | 63,2018-02-01,view,P7 141 | 63,2018-02-01,view,P8 142 | 63,2018-02-01,view,P11 143 | 63,2018-02-01,view,P14 144 | 100,2018-02-01,login, 145 | 100,2018-02-01,view,P1 146 | 100,2018-02-01,view,P5 147 | 100,2018-02-01,view,P7 148 | 100,2018-02-01,view,P14 149 | 100,2018-02-01,view,P16 150 | 29,2018-02-03,login, 151 | 29,2018-02-03,view,P3 152 | 29,2018-02-03,view,P10 153 | 29,2018-02-03,view,P16 154 | 29,2018-02-03,view,P18 155 | 82,2018-02-03,login, 156 | 82,2018-02-03,view,P6 157 | 82,2018-02-03,view,P10 158 | 82,2018-02-03,view,P13 159 | 82,2018-02-03,view,P14 160 | 82,2018-02-03,view,P17 161 | 96,2018-02-03,login, 162 | 96,2018-02-03,view,P4 163 | 96,2018-02-03,view,P13 164 | 96,2018-02-03,view,P17 165 | 76,2018-02-04,login, 166 | 76,2018-02-04,view,P9 167 | 76,2018-02-04,view,P11 168 | 61,2018-02-05,login, 169 | 61,2018-02-05,view,P6 170 | 61,2018-02-05,view,P19 171 | 10,2018-02-06,login, 172 | 10,2018-02-06,view,P2 173 | 10,2018-02-06,view,P4 174 | 10,2018-02-06,view,P8 175 | 10,2018-02-06,view,P11 176 | 10,2018-02-06,view,P13 177 | 28,2018-02-06,login, 178 | 28,2018-02-06,view,P2 179 | 37,2018-02-06,login, 180 | 37,2018-02-06,view,P2 181 | 37,2018-02-06,view,P4 182 | 37,2018-02-06,view,P17 183 | 37,2018-02-06,view,P19 184 | 37,2018-02-06,view,P20 185 | 28,2018-02-07,login, 186 | 28,2018-02-07,view,P2 187 | 28,2018-02-07,view,P7 188 | 28,2018-02-07,view,P17 189 | 52,2018-02-07,login, 190 | 52,2018-02-07,view,P5 191 | 52,2018-02-07,view,P8 192 | 52,2018-02-07,view,P17 193 | 63,2018-02-08,login, 194 | 63,2018-02-08,view,P1 195 | 63,2018-02-08,view,P2 196 | 63,2018-02-08,view,P7 197 | 63,2018-02-08,view,P8 198 | 63,2018-02-08,view,P14 199 | 63,2018-02-08,view,P15 200 | 63,2018-02-08,view,P18 201 | 90,2018-02-11,login, 202 | 90,2018-02-11,view,P5 203 | 90,2018-02-11,view,P12 204 | 90,2018-02-11,view,P13 205 | 90,2018-02-11,view,P14 206 | 90,2018-02-11,view,P18 207 | 22,2018-02-12,login, 208 | 22,2018-02-12,view,P1 209 | 22,2018-02-12,view,P4 210 | 22,2018-02-12,view,P5 211 | 74,2018-02-16,login, 212 | 74,2018-02-16,view,P1 213 | 74,2018-02-16,view,P5 214 | 56,2018-02-18,login, 215 | 56,2018-02-18,view,P1 216 | 56,2018-02-18,view,P10 217 | 56,2018-02-18,view,P12 218 | 56,2018-02-18,view,P15 219 | 56,2018-02-18,view,P20 220 | 61,2018-02-18,login, 221 | 61,2018-02-18,view,P6 222 | 61,2018-02-18,view,P12 223 | 61,2018-02-18,view,P14 224 | 61,2018-02-18,view,P20 225 | 69,2018-02-18,login, 226 | 69,2018-02-18,view,P2 227 | 69,2018-02-18,view,P3 228 | 69,2018-02-18,view,P14 229 | 69,2018-02-18,view,P20 230 | 12,2018-02-19,login, 231 | 12,2018-02-19,view,P8 232 | 12,2018-02-19,view,P11 233 | 12,2018-02-19,view,P13 234 | 12,2018-02-19,view,P14 235 | 12,2018-02-19,view,P17 236 | 72,2018-02-19,login, 237 | 72,2018-02-19,view,P7 238 | 72,2018-02-19,view,P11 239 | 72,2018-02-19,view,P14 240 | 86,2018-02-19,login, 241 | 86,2018-02-19,view,P4 242 | 86,2018-02-19,view,P7 243 | 86,2018-02-19,view,P13 244 | 42,2018-02-21,login, 245 | 42,2018-02-21,view,P2 246 | 42,2018-02-21,view,P4 247 | 42,2018-02-21,view,P5 248 
| 42,2018-02-21,view,P8 249 | 42,2018-02-21,view,P20 250 | 61,2018-02-21,login, 251 | 61,2018-02-21,view,P7 252 | 61,2018-02-21,view,P8 253 | 61,2018-02-21,view,P11 254 | 61,2018-02-21,view,P16 255 | 14,2018-02-22,login, 256 | 14,2018-02-22,view,P1 257 | 14,2018-02-22,view,P4 258 | 14,2018-02-22,view,P13 259 | 37,2018-02-22,login, 260 | 37,2018-02-22,view,P1 261 | 37,2018-02-22,view,P4 262 | 37,2018-02-22,view,P5 263 | 76,2018-02-23,login, 264 | 76,2018-02-23,view,P2 265 | 76,2018-02-23,view,P16 266 | 61,2018-02-25,login, 267 | 61,2018-02-25,view,P9 268 | 61,2018-02-25,view,P15 269 | 34,2018-02-27,login, 270 | 34,2018-02-27,view,P9 271 | 34,2018-02-27,view,P11 272 | 34,2018-02-27,view,P13 273 | 34,2018-02-27,view,P18 274 | 34,2018-02-27,view,P19 275 | 93,2018-02-27,login, 276 | 93,2018-02-27,view,P1 277 | 93,2018-02-27,view,P2 278 | 93,2018-02-27,view,P17 279 | 97,2018-02-27,login, 280 | 97,2018-02-27,view,P1 281 | 97,2018-02-27,view,P9 282 | 97,2018-02-27,view,P11 283 | 97,2018-02-27,view,P20 284 | 100,2018-02-27,login, 285 | 100,2018-02-27,view,P1 286 | 100,2018-02-27,view,P12 287 | 100,2018-02-27,view,P14 288 | 27,2018-02-28,login, 289 | 27,2018-02-28,view,P12 290 | 27,2018-02-28,view,P14 291 | 27,2018-02-28,view,P15 292 | 27,2018-02-28,view,P20 293 | 20,2018-03-01,login, 294 | 20,2018-03-01,view,P7 295 | 20,2018-03-01,view,P19 296 | 33,2018-03-02,login, 297 | 33,2018-03-02,view,P5 298 | 33,2018-03-02,view,P8 299 | 75,2018-03-02,login, 300 | 75,2018-03-02,view,P3 301 | 75,2018-03-02,view,P5 302 | 75,2018-03-02,view,P8 303 | 75,2018-03-02,view,P10 304 | 75,2018-03-02,view,P14 305 | 75,2018-03-02,view,P18 306 | 83,2018-03-02,login, 307 | 83,2018-03-02,view,P1 308 | 83,2018-03-02,view,P2 309 | 83,2018-03-02,view,P5 310 | 83,2018-03-02,view,P6 311 | 83,2018-03-02,view,P9 312 | 83,2018-03-02,view,P13 313 | 66,2018-03-03,login, 314 | 66,2018-03-03,view,P2 315 | 66,2018-03-03,view,P6 316 | 66,2018-03-03,view,P17 317 | 17,2018-03-05,login, 318 | 17,2018-03-05,view,P2 319 | 17,2018-03-05,view,P5 320 | 17,2018-03-05,view,P8 321 | 17,2018-03-05,view,P13 322 | 37,2018-03-05,login, 323 | 37,2018-03-05,view,P4 324 | 37,2018-03-05,view,P6 325 | 37,2018-03-05,view,P13 326 | 37,2018-03-05,view,P19 327 | 58,2018-03-05,login, 328 | 58,2018-03-05,view,P5 329 | 58,2018-03-05,view,P8 330 | 58,2018-03-05,view,P15 331 | 12,2018-03-06,login, 332 | 12,2018-03-06,view,P9 333 | 12,2018-03-06,view,P14 334 | 28,2018-03-09,login, 335 | 28,2018-03-09,view,P4 336 | 28,2018-03-09,view,P8 337 | 28,2018-03-09,view,P16 338 | 28,2018-03-09,view,P20 339 | 35,2018-03-09,login, 340 | 35,2018-03-09,view,P2 341 | 35,2018-03-09,view,P11 342 | 48,2018-03-11,login, 343 | 48,2018-03-11,view,P3 344 | 48,2018-03-11,view,P17 345 | 22,2018-03-12,login, 346 | 22,2018-03-12,view,P4 347 | 22,2018-03-12,view,P5 348 | 22,2018-03-12,view,P8 349 | 22,2018-03-12,view,P14 350 | 72,2018-03-14,login, 351 | 72,2018-03-14,view,P5 352 | 72,2018-03-14,view,P9 353 | 72,2018-03-14,view,P11 354 | 43,2018-03-15,login, 355 | 43,2018-03-15,view,P2 356 | 43,2018-03-15,view,P4 357 | 43,2018-03-15,view,P9 358 | 43,2018-03-15,view,P10 359 | 43,2018-03-15,view,P11 360 | 43,2018-03-15,view,P13 361 | 43,2018-03-15,view,P14 362 | 61,2018-03-15,login, 363 | 61,2018-03-15,view,P9 364 | 61,2018-03-15,view,P12 365 | 61,2018-03-15,view,P18 366 | 7,2018-03-17,login, 367 | 7,2018-03-17,view,P1 368 | 7,2018-03-17,view,P2 369 | 7,2018-03-17,view,P11 370 | 7,2018-03-17,view,P17 371 | 36,2018-03-18,login, 372 | 36,2018-03-18,view,P2 373 | 36,2018-03-18,view,P4 374 | 
36,2018-03-18,view,P10 375 | 36,2018-03-18,view,P14 376 | 5,2018-03-19,login, 377 | 5,2018-03-19,view,P4 378 | 82,2018-03-19,login, 379 | 82,2018-03-19,view,P5 380 | 82,2018-03-19,view,P12 381 | 82,2018-03-19,view,P13 382 | 82,2018-03-19,view,P15 383 | 72,2018-03-20,login, 384 | 72,2018-03-20,view,P3 385 | 72,2018-03-20,view,P8 386 | 72,2018-03-20,view,P11 387 | 48,2018-03-21,login, 388 | 48,2018-03-21,view,P1 389 | 54,2018-03-22,login, 390 | 54,2018-03-22,view,P1 391 | 54,2018-03-22,view,P10 392 | 59,2018-03-23,login, 393 | 59,2018-03-23,view,P4 394 | 59,2018-03-23,view,P5 395 | 59,2018-03-23,view,P11 396 | 59,2018-03-23,view,P15 397 | 59,2018-03-23,view,P16 398 | 43,2018-03-26,login, 399 | 43,2018-03-26,view,P2 400 | 43,2018-03-26,view,P9 401 | 43,2018-03-26,view,P15 402 | 59,2018-03-26,login, 403 | 59,2018-03-26,view,P4 404 | 59,2018-03-26,view,P6 405 | 59,2018-03-26,view,P8 406 | 59,2018-03-26,view,P11 407 | 59,2018-03-26,view,P12 408 | 84,2018-03-27,login, 409 | 84,2018-03-27,view,P2 410 | 84,2018-03-27,view,P11 411 | 2,2018-03-28,login, 412 | 2,2018-03-28,view,P2 413 | 2,2018-03-28,view,P4 414 | 2,2018-03-28,view,P7 415 | 2,2018-03-28,view,P11 416 | 2,2018-03-28,view,P19 417 | 97,2018-03-28,login, 418 | 97,2018-03-28,view,P8 419 | 97,2018-03-28,view,P11 420 | 26,2018-03-29,login, 421 | 26,2018-03-29,view,P7 422 | 26,2018-03-29,view,P13 423 | 26,2018-03-29,view,P14 424 | 40,2018-03-29,login, 425 | 40,2018-03-29,view,P5 426 | 40,2018-03-29,view,P18 427 | 40,2018-03-29,view,P19 428 | 56,2018-03-29,login, 429 | 56,2018-03-29,view,P7 430 | 52,2018-03-30,login, 431 | 52,2018-03-30,view,P3 432 | 52,2018-03-30,view,P10 433 | 52,2018-03-30,view,P11 434 | 11,2018-03-31,login, 435 | 11,2018-03-31,view,P2 436 | 11,2018-03-31,view,P4 437 | 11,2018-03-31,view,P20 438 | 84,2018-03-31,login, 439 | 84,2018-03-31,view,P4 440 | 84,2018-03-31,view,P13 441 | 84,2018-03-31,view,P17 442 | 92,2018-03-31,login, 443 | 92,2018-03-31,view,P1 444 | 92,2018-03-31,view,P14 445 | 92,2018-03-31,view,P19 446 | 92,2018-03-31,view,P20 447 | 76,2018-04-01,login, 448 | 76,2018-04-01,view,P3 449 | 76,2018-04-01,view,P11 450 | 82,2018-04-01,login, 451 | 82,2018-04-01,view,P2 452 | 82,2018-04-01,view,P5 453 | 82,2018-04-01,view,P6 454 | 82,2018-04-01,view,P8 455 | 82,2018-04-01,view,P17 456 | 47,2018-04-02,login, 457 | 47,2018-04-02,view,P4 458 | 47,2018-04-02,view,P9 459 | 47,2018-04-02,view,P11 460 | 47,2018-04-02,view,P19 461 | 10,2018-04-04,login, 462 | 10,2018-04-04,view,P2 463 | 10,2018-04-04,view,P4 464 | 10,2018-04-04,view,P9 465 | 10,2018-04-04,view,P13 466 | 86,2018-04-04,login, 467 | 86,2018-04-04,view,P3 468 | 86,2018-04-04,view,P4 469 | 86,2018-04-04,view,P6 470 | 86,2018-04-04,view,P7 471 | 86,2018-04-04,view,P11 472 | 86,2018-04-04,view,P14 473 | 86,2018-04-04,view,P17 474 | 39,2018-04-05,login, 475 | 39,2018-04-05,view,P4 476 | 39,2018-04-05,view,P9 477 | 39,2018-04-05,view,P12 478 | 39,2018-04-05,view,P18 479 | 13,2018-04-06,login, 480 | 13,2018-04-06,view,P2 481 | 13,2018-04-06,view,P4 482 | 13,2018-04-06,view,P8 483 | 13,2018-04-06,view,P16 484 | 13,2018-04-06,view,P20 485 | 70,2018-04-06,login, 486 | 96,2018-04-06,login, 487 | 96,2018-04-06,view,P1 488 | 96,2018-04-06,view,P3 489 | 96,2018-04-06,view,P5 490 | 96,2018-04-06,view,P9 491 | 96,2018-04-06,view,P10 492 | 96,2018-04-06,view,P11 493 | 96,2018-04-06,view,P12 494 | 11,2018-04-07,login, 495 | 11,2018-04-07,view,P2 496 | 11,2018-04-07,view,P3 497 | 11,2018-04-07,view,P4 498 | 11,2018-04-07,view,P14 499 | 11,2018-04-07,view,P16 500 | 
34,2018-04-09,login, 501 | 34,2018-04-09,view,P1 502 | 34,2018-04-09,view,P16 503 | 34,2018-04-09,view,P18 504 | 34,2018-04-09,view,P20 505 | 72,2018-04-14,login, 506 | 72,2018-04-14,view,P3 507 | 72,2018-04-14,view,P6 508 | 72,2018-04-14,view,P7 509 | 17,2018-04-16,login, 510 | 17,2018-04-16,view,P1 511 | 17,2018-04-16,view,P2 512 | 17,2018-04-16,view,P5 513 | 17,2018-04-16,view,P8 514 | 17,2018-04-16,view,P11 515 | 17,2018-04-16,view,P12 516 | 17,2018-04-16,view,P15 517 | 17,2018-04-16,view,P16 518 | 29,2018-04-16,login, 519 | 29,2018-04-16,view,P12 520 | 100,2018-04-16,login, 521 | 100,2018-04-16,view,P7 522 | 100,2018-04-16,view,P10 523 | 100,2018-04-16,view,P14 524 | 5,2018-04-19,login, 525 | 69,2018-04-22,login, 526 | 69,2018-04-22,view,P1 527 | 69,2018-04-22,view,P2 528 | 69,2018-04-22,view,P10 529 | 69,2018-04-22,view,P11 530 | 69,2018-04-22,view,P17 531 | 69,2018-04-22,view,P19 532 | 93,2018-04-22,login, 533 | 93,2018-04-22,view,P11 534 | 93,2018-04-22,view,P19 535 | 93,2018-04-22,view,P20 536 | 18,2018-04-23,login, 537 | 18,2018-04-23,view,P5 538 | 18,2018-04-23,view,P7 539 | 18,2018-04-23,view,P15 540 | 24,2018-04-23,login, 541 | 24,2018-04-23,view,P2 542 | 24,2018-04-23,view,P4 543 | 24,2018-04-23,view,P9 544 | 24,2018-04-23,view,P13 545 | 24,2018-04-23,view,P14 546 | 24,2018-04-23,view,P15 547 | 57,2018-04-23,login, 548 | 57,2018-04-23,view,P4 549 | 57,2018-04-23,view,P5 550 | 57,2018-04-23,view,P11 551 | 57,2018-04-23,view,P18 552 | 60,2018-04-24,login, 553 | 60,2018-04-24,view,P2 554 | 60,2018-04-24,view,P8 555 | 60,2018-04-24,view,P20 556 | 42,2018-04-26,login, 557 | 42,2018-04-26,view,P2 558 | 42,2018-04-26,view,P4 559 | 42,2018-04-26,view,P5 560 | 42,2018-04-26,view,P7 561 | 22,2018-04-27,login, 562 | 22,2018-04-27,view,P1 563 | 22,2018-04-27,view,P4 564 | 22,2018-04-27,view,P9 565 | 22,2018-04-27,view,P14 566 | 1,2018-04-28,login, 567 | 1,2018-04-28,view,P1 568 | 1,2018-04-28,view,P6 569 | 1,2018-04-28,view,P18 570 | 69,2018-04-28,login, 571 | 69,2018-04-28,view,P2 572 | 69,2018-04-28,view,P3 573 | 69,2018-04-28,view,P4 574 | 69,2018-04-28,view,P13 575 | 69,2018-04-28,view,P14 576 | 69,2018-04-28,view,P17 577 | 81,2018-04-28,login, 578 | 81,2018-04-28,view,P2 579 | 81,2018-04-28,view,P8 580 | 81,2018-04-28,view,P9 581 | 81,2018-04-28,view,P11 582 | 81,2018-04-28,view,P17 583 | 52,2018-04-29,login, 584 | 52,2018-04-29,view,P4 585 | 52,2018-04-29,view,P8 586 | 52,2018-04-29,view,P12 587 | 56,2018-04-29,login, 588 | 56,2018-04-29,view,P6 589 | 56,2018-04-29,view,P8 590 | 56,2018-04-29,view,P12 591 | 56,2018-04-29,view,P14 592 | 69,2018-04-29,login, 593 | 69,2018-04-29,view,P2 594 | 69,2018-04-29,view,P7 595 | 69,2018-04-29,view,P10 596 | 69,2018-04-29,view,P11 597 | 69,2018-04-29,view,P12 598 | 69,2018-04-29,view,P14 599 | 69,2018-04-29,view,P17 600 | 69,2018-04-29,view,P20 601 | 79,2018-04-29,login, 602 | 79,2018-04-29,view,P9 603 | 79,2018-04-29,view,P10 604 | 79,2018-04-29,view,P13 605 | 79,2018-04-29,view,P19 606 | 4,2018-04-30,login, 607 | 4,2018-04-30,view,P4 608 | 4,2018-04-30,view,P15 609 | 97,2018-05-01,login, 610 | 97,2018-05-01,view,P5 611 | 97,2018-05-01,view,P8 612 | 97,2018-05-01,view,P9 613 | 97,2018-05-01,view,P18 614 | 61,2018-05-02,login, 615 | 61,2018-05-02,view,P3 616 | 61,2018-05-02,view,P4 617 | 61,2018-05-02,view,P10 618 | 61,2018-05-02,view,P13 619 | 61,2018-05-02,view,P14 620 | 61,2018-05-02,view,P16 621 | 61,2018-05-02,view,P18 622 | 79,2018-05-02,login, 623 | 99,2018-05-02,login, 624 | 99,2018-05-02,view,P4 625 | 99,2018-05-02,view,P5 626 | 
99,2018-05-02,view,P6 627 | 27,2018-05-03,login, 628 | 27,2018-05-03,view,P7 629 | 27,2018-05-03,view,P14 630 | 27,2018-05-03,view,P16 631 | 52,2018-05-04,login, 632 | 52,2018-05-04,view,P4 633 | 52,2018-05-04,view,P5 634 | 52,2018-05-04,view,P8 635 | 90,2018-05-04,login, 636 | 90,2018-05-04,view,P3 637 | 90,2018-05-04,view,P10 638 | 90,2018-05-04,view,P14 639 | 90,2018-05-04,view,P15 640 | 90,2018-05-04,view,P16 641 | 90,2018-05-04,view,P18 642 | 97,2018-05-04,login, 643 | 97,2018-05-04,view,P11 644 | 97,2018-05-04,view,P20 645 | 54,2018-05-05,login, 646 | 54,2018-05-05,view,P9 647 | 54,2018-05-05,view,P10 648 | 54,2018-05-05,view,P16 649 | 14,2018-05-06,login, 650 | 14,2018-05-06,view,P1 651 | 14,2018-05-06,view,P4 652 | 14,2018-05-06,view,P5 653 | 14,2018-05-06,view,P12 654 | 14,2018-05-06,view,P14 655 | 64,2018-05-06,login, 656 | 64,2018-05-06,view,P16 657 | 90,2018-05-09,login, 658 | 90,2018-05-09,view,P5 659 | 90,2018-05-09,view,P8 660 | 90,2018-05-09,view,P13 661 | 90,2018-05-09,view,P14 662 | 90,2018-05-09,view,P16 663 | 90,2018-05-09,view,P18 664 | 90,2018-05-09,view,P20 665 | 71,2018-05-11,login, 666 | 71,2018-05-11,view,P8 667 | 71,2018-05-11,view,P10 668 | 71,2018-05-11,view,P16 669 | 82,2018-05-11,login, 670 | 82,2018-05-11,view,P2 671 | 82,2018-05-11,view,P5 672 | 82,2018-05-11,view,P9 673 | 82,2018-05-11,view,P12 674 | 82,2018-05-11,view,P13 675 | 82,2018-05-11,view,P18 676 | 98,2018-05-11,login, 677 | 98,2018-05-11,view,P4 678 | 98,2018-05-11,view,P5 679 | 98,2018-05-11,view,P11 680 | 98,2018-05-11,view,P14 681 | 64,2018-05-13,login, 682 | 64,2018-05-13,view,P13 683 | 64,2018-05-13,view,P14 684 | 53,2018-05-14,login, 685 | 53,2018-05-14,view,P8 686 | 79,2018-05-14,login, 687 | 79,2018-05-14,view,P9 688 | 79,2018-05-14,view,P16 689 | 97,2018-05-14,login, 690 | 97,2018-05-14,view,P1 691 | 97,2018-05-14,view,P8 692 | 97,2018-05-14,view,P11 693 | 97,2018-05-14,view,P13 694 | 97,2018-05-14,view,P18 695 | 60,2018-05-16,login, 696 | 60,2018-05-16,view,P9 697 | 60,2018-05-16,view,P14 698 | 60,2018-05-16,view,P17 699 | 60,2018-05-16,view,P20 700 | 37,2018-05-17,login, 701 | 37,2018-05-17,view,P3 702 | 37,2018-05-17,view,P4 703 | 37,2018-05-17,view,P10 704 | 37,2018-05-17,view,P12 705 | 37,2018-05-17,view,P14 706 | 81,2018-05-17,login, 707 | 81,2018-05-17,view,P2 708 | 81,2018-05-17,view,P3 709 | 81,2018-05-17,view,P8 710 | 81,2018-05-17,view,P13 711 | 81,2018-05-17,view,P14 712 | 81,2018-05-17,view,P20 713 | 12,2018-05-19,login, 714 | 12,2018-05-19,view,P1 715 | 12,2018-05-19,view,P8 716 | 12,2018-05-19,view,P15 717 | 12,2018-05-19,view,P18 718 | 48,2018-05-19,login, 719 | 48,2018-05-19,view,P3 720 | 48,2018-05-19,view,P16 721 | 48,2018-05-19,view,P17 722 | 87,2018-05-20,login, 723 | 87,2018-05-20,view,P4 724 | 87,2018-05-20,view,P6 725 | 87,2018-05-20,view,P7 726 | 87,2018-05-20,view,P18 727 | 56,2018-05-21,login, 728 | 56,2018-05-21,view,P3 729 | 56,2018-05-21,view,P10 730 | 56,2018-05-21,view,P12 731 | 56,2018-05-21,view,P18 732 | 28,2018-05-22,login, 733 | 28,2018-05-22,view,P3 734 | 28,2018-05-22,view,P16 735 | 50,2018-05-22,login, 736 | 50,2018-05-22,view,P14 737 | 50,2018-05-22,view,P15 738 | 52,2018-05-23,login, 739 | 52,2018-05-23,view,P1 740 | 52,2018-05-23,view,P3 741 | 52,2018-05-23,view,P9 742 | 52,2018-05-23,view,P11 743 | 52,2018-05-23,view,P12 744 | 8,2018-05-25,login, 745 | 8,2018-05-25,view,P4 746 | 8,2018-05-25,view,P12 747 | 8,2018-05-25,view,P19 748 | 79,2018-05-26,login, 749 | 79,2018-05-26,view,P10 750 | 2,2018-05-27,login, 751 | 2,2018-05-27,view,P2 752 | 
2,2018-05-27,view,P7 753 | 2,2018-05-27,view,P11 754 | 22,2018-05-28,login, 755 | 22,2018-05-28,view,P5 756 | 28,2018-05-29,login, 757 | 28,2018-05-29,view,P2 758 | 28,2018-05-29,view,P11 759 | 39,2018-05-29,login, 760 | 39,2018-05-29,view,P4 761 | 39,2018-05-29,view,P9 762 | 39,2018-05-29,view,P10 763 | 39,2018-05-29,view,P13 764 | 39,2018-05-29,view,P19 765 | 14,2018-05-30,login, 766 | 14,2018-05-30,view,P1 767 | 14,2018-05-30,view,P4 768 | 14,2018-05-30,view,P11 769 | 14,2018-05-30,view,P13 770 | 14,2018-05-30,view,P16 771 | 14,2018-05-30,view,P18 772 | 67,2018-05-30,login, 773 | 67,2018-05-30,view,P18 774 | 93,2018-05-31,login, 775 | 93,2018-05-31,view,P9 776 | 93,2018-05-31,view,P11 777 | 67,2018-06-01,login, 778 | 67,2018-06-01,view,P7 779 | 67,2018-06-01,view,P8 780 | 67,2018-06-01,view,P14 781 | 28,2018-06-04,login, 782 | 28,2018-06-04,view,P2 783 | 33,2018-06-07,login, 784 | 33,2018-06-07,view,P5 785 | 33,2018-06-07,view,P6 786 | 33,2018-06-07,view,P8 787 | 33,2018-06-07,view,P17 788 | 28,2018-06-08,login, 789 | 28,2018-06-08,view,P2 790 | 28,2018-06-08,view,P4 791 | 28,2018-06-08,view,P8 792 | 28,2018-06-08,view,P14 793 | 35,2018-06-08,login, 794 | 35,2018-06-08,view,P2 795 | 35,2018-06-08,view,P13 796 | 44,2018-06-08,login, 797 | 44,2018-06-08,view,P2 798 | 44,2018-06-08,view,P3 799 | 44,2018-06-08,view,P6 800 | 44,2018-06-08,view,P11 801 | 44,2018-06-08,view,P13 802 | 44,2018-06-08,view,P19 803 | 59,2018-06-09,login, 804 | 59,2018-06-09,view,P4 805 | 59,2018-06-09,view,P5 806 | 59,2018-06-09,view,P7 807 | 59,2018-06-09,view,P8 808 | 59,2018-06-09,view,P11 809 | 59,2018-06-09,view,P15 810 | 59,2018-06-09,view,P20 811 | 80,2018-06-10,login, 812 | 80,2018-06-10,view,P1 813 | 80,2018-06-10,view,P2 814 | 80,2018-06-10,view,P4 815 | 80,2018-06-10,view,P11 816 | 80,2018-06-10,view,P19 817 | 6,2018-06-13,login, 818 | 6,2018-06-13,view,P4 819 | 6,2018-06-13,view,P17 820 | 35,2018-06-14,login, 821 | 35,2018-06-14,view,P1 822 | 35,2018-06-14,view,P2 823 | 35,2018-06-14,view,P3 824 | 35,2018-06-14,view,P7 825 | 35,2018-06-14,view,P16 826 | 29,2018-06-17,login, 827 | 44,2018-06-17,login, 828 | 44,2018-06-17,view,P2 829 | 44,2018-06-17,view,P12 830 | 93,2018-06-17,login, 831 | 93,2018-06-17,view,P11 832 | 93,2018-06-17,view,P15 833 | 97,2018-06-17,login, 834 | 97,2018-06-17,view,P13 835 | 97,2018-06-17,view,P15 836 | 66,2018-06-19,login, 837 | 66,2018-06-19,view,P2 838 | 66,2018-06-19,view,P13 839 | 66,2018-06-19,view,P18 840 | 69,2018-06-19,login, 841 | 69,2018-06-19,view,P2 842 | 69,2018-06-19,view,P11 843 | 69,2018-06-19,view,P14 844 | 69,2018-06-19,view,P17 845 | 97,2018-06-20,login, 846 | 97,2018-06-20,view,P7 847 | 97,2018-06-20,view,P8 848 | 28,2018-06-21,login, 849 | 28,2018-06-21,view,P2 850 | 28,2018-06-21,view,P5 851 | 28,2018-06-21,view,P16 852 | 55,2018-06-21,login, 853 | 55,2018-06-21,view,P8 854 | 55,2018-06-21,view,P12 855 | 55,2018-06-21,view,P13 856 | 55,2018-06-21,view,P18 857 | 67,2018-06-21,login, 858 | 67,2018-06-21,view,P14 859 | 22,2018-06-22,login, 860 | 22,2018-06-22,view,P1 861 | 22,2018-06-22,view,P4 862 | 22,2018-06-22,view,P11 863 | 22,2018-06-22,view,P17 864 | 37,2018-06-22,login, 865 | 37,2018-06-22,view,P6 866 | 37,2018-06-22,view,P8 867 | 37,2018-06-22,view,P12 868 | 37,2018-06-22,view,P13 869 | 37,2018-06-22,view,P14 870 | 40,2018-06-22,login, 871 | 40,2018-06-22,view,P5 872 | 40,2018-06-22,view,P6 873 | 40,2018-06-22,view,P13 874 | 72,2018-06-23,login, 875 | 72,2018-06-23,view,P8 876 | 72,2018-06-23,view,P11 877 | 72,2018-06-23,view,P18 878 | 
72,2018-06-23,view,P19 879 | 29,2018-06-24,login, 880 | 29,2018-06-24,view,P7 881 | 29,2018-06-24,view,P8 882 | 29,2018-06-24,view,P13 883 | 29,2018-06-24,view,P18 884 | 59,2018-06-24,login, 885 | 59,2018-06-24,view,P1 886 | 59,2018-06-24,view,P4 887 | 59,2018-06-24,view,P11 888 | 59,2018-06-24,view,P15 889 | 61,2018-06-24,login, 890 | 61,2018-06-24,view,P4 891 | 61,2018-06-24,view,P18 892 | 80,2018-06-24,login, 893 | 80,2018-06-24,view,P1 894 | 80,2018-06-24,view,P2 895 | 80,2018-06-24,view,P4 896 | 80,2018-06-24,view,P7 897 | 80,2018-06-24,view,P12 898 | 27,2018-06-25,login, 899 | 27,2018-06-25,view,P10 900 | 27,2018-06-25,view,P14 901 | 84,2018-06-25,login, 902 | 84,2018-06-25,view,P2 903 | 84,2018-06-25,view,P3 904 | 84,2018-06-25,view,P13 905 | 88,2018-06-25,login, 906 | 88,2018-06-25,view,P6 907 | 88,2018-06-25,view,P17 908 | 36,2018-06-28,login, 909 | 36,2018-06-28,view,P4 910 | 36,2018-06-28,view,P5 911 | 36,2018-06-28,view,P8 912 | 36,2018-06-28,view,P14 913 | 29,2018-06-30,login, 914 | 29,2018-06-30,view,P8 915 | 29,2018-06-30,view,P19 916 | 30,2018-06-30,login, 917 | 30,2018-06-30,view,P1 918 | 30,2018-06-30,view,P2 919 | 30,2018-06-30,view,P5 920 | 30,2018-06-30,view,P17 921 | 30,2018-06-30,view,P19 922 | 90,2018-06-30,login, 923 | 90,2018-06-30,view,P3 924 | 90,2018-06-30,view,P8 925 | 90,2018-06-30,view,P10 926 | 90,2018-06-30,view,P14 927 | -------------------------------------------------------------------------------- /input/ch03/multi_table_train.csv: -------------------------------------------------------------------------------- 1 | user_id,product_id,target 2 | 1,P1,0 3 | 1,P2,0 4 | 1,P3,1 5 | 1,P4,0 6 | 1,P5,0 7 | 1,P6,0 8 | 1,P7,0 9 | 1,P8,0 10 | 1,P9,0 11 | 1,P10,0 12 | 1,P11,0 13 | 1,P12,0 14 | 1,P13,0 15 | 1,P14,0 16 | 1,P15,0 17 | 1,P16,0 18 | 1,P17,1 19 | 1,P18,0 20 | 1,P19,0 21 | 1,P20,0 22 | 2,P1,0 23 | 2,P2,1 24 | 2,P3,0 25 | 2,P4,0 26 | 2,P5,0 27 | 2,P6,0 28 | 2,P7,1 29 | 2,P8,0 30 | 2,P9,1 31 | 2,P10,1 32 | 2,P11,1 33 | 2,P12,1 34 | 2,P13,1 35 | 2,P14,0 36 | 2,P15,0 37 | 2,P16,0 38 | 2,P17,0 39 | 2,P18,0 40 | 2,P19,0 41 | 2,P20,0 42 | 3,P1,1 43 | 3,P2,1 44 | 3,P3,0 45 | 3,P4,0 46 | 3,P5,0 47 | 3,P6,0 48 | 3,P7,1 49 | 3,P8,0 50 | 3,P9,1 51 | 3,P10,0 52 | 3,P11,1 53 | 3,P12,0 54 | 3,P13,0 55 | 3,P14,1 56 | 3,P15,0 57 | 3,P16,0 58 | 3,P17,1 59 | 3,P18,0 60 | 3,P19,0 61 | 3,P20,0 62 | 4,P1,0 63 | 4,P2,0 64 | 4,P3,1 65 | 4,P4,1 66 | 4,P5,0 67 | 4,P6,0 68 | 4,P7,0 69 | 4,P8,0 70 | 4,P9,0 71 | 4,P10,1 72 | 4,P11,0 73 | 4,P12,0 74 | 4,P13,1 75 | 4,P14,0 76 | 4,P15,1 77 | 4,P16,0 78 | 4,P17,0 79 | 4,P18,1 80 | 4,P19,1 81 | 4,P20,1 82 | 5,P1,0 83 | 5,P2,0 84 | 5,P3,1 85 | 5,P4,1 86 | 5,P5,0 87 | 5,P6,0 88 | 5,P7,0 89 | 5,P8,0 90 | 5,P9,0 91 | 5,P10,0 92 | 5,P11,1 93 | 5,P12,1 94 | 5,P13,0 95 | 5,P14,1 96 | 5,P15,1 97 | 5,P16,0 98 | 5,P17,0 99 | 5,P18,1 100 | 5,P19,0 101 | 5,P20,0 102 | 6,P1,0 103 | 6,P2,1 104 | 6,P3,0 105 | 6,P4,1 106 | 6,P5,0 107 | 6,P6,0 108 | 6,P7,0 109 | 6,P8,1 110 | 6,P9,0 111 | 6,P10,0 112 | 6,P11,1 113 | 6,P12,0 114 | 6,P13,1 115 | 6,P14,0 116 | 6,P15,0 117 | 6,P16,0 118 | 6,P17,1 119 | 6,P18,0 120 | 6,P19,0 121 | 6,P20,0 122 | 7,P1,1 123 | 7,P2,1 124 | 7,P3,0 125 | 7,P4,0 126 | 7,P5,1 127 | 7,P6,0 128 | 7,P7,0 129 | 7,P8,0 130 | 7,P9,1 131 | 7,P10,0 132 | 7,P11,1 133 | 7,P12,1 134 | 7,P13,1 135 | 7,P14,1 136 | 7,P15,0 137 | 7,P16,0 138 | 7,P17,1 139 | 7,P18,0 140 | 7,P19,0 141 | 7,P20,0 142 | 8,P1,0 143 | 8,P2,0 144 | 8,P3,1 145 | 8,P4,1 146 | 8,P5,0 147 | 8,P6,1 148 | 8,P7,0 149 | 8,P8,0 150 | 8,P9,0 151 | 8,P10,1 152 | 8,P11,1 153 | 8,P12,1 
154 | 8,P13,0 155 | 8,P14,0 156 | 8,P15,0 157 | 8,P16,1 158 | 8,P17,0 159 | 8,P18,0 160 | 8,P19,1 161 | 8,P20,1 162 | 9,P1,0 163 | 9,P2,0 164 | 9,P3,0 165 | 9,P4,0 166 | 9,P5,0 167 | 9,P6,0 168 | 9,P7,0 169 | 9,P8,1 170 | 9,P9,0 171 | 9,P10,0 172 | 9,P11,1 173 | 9,P12,0 174 | 9,P13,0 175 | 9,P14,1 176 | 9,P15,0 177 | 9,P16,1 178 | 9,P17,0 179 | 9,P18,1 180 | 9,P19,1 181 | 9,P20,0 182 | 10,P1,1 183 | 10,P2,1 184 | 10,P3,0 185 | 10,P4,1 186 | 10,P5,0 187 | 10,P6,0 188 | 10,P7,1 189 | 10,P8,1 190 | 10,P9,0 191 | 10,P10,0 192 | 10,P11,1 193 | 10,P12,0 194 | 10,P13,1 195 | 10,P14,1 196 | 10,P15,1 197 | 10,P16,1 198 | 10,P17,0 199 | 10,P18,0 200 | 10,P19,0 201 | 10,P20,0 202 | 11,P1,0 203 | 11,P2,1 204 | 11,P3,1 205 | 11,P4,1 206 | 11,P5,0 207 | 11,P6,0 208 | 11,P7,0 209 | 11,P8,0 210 | 11,P9,0 211 | 11,P10,0 212 | 11,P11,1 213 | 11,P12,0 214 | 11,P13,0 215 | 11,P14,1 216 | 11,P15,0 217 | 11,P16,0 218 | 11,P17,0 219 | 11,P18,0 220 | 11,P19,0 221 | 11,P20,0 222 | 12,P1,1 223 | 12,P2,0 224 | 12,P3,0 225 | 12,P4,1 226 | 12,P5,0 227 | 12,P6,0 228 | 12,P7,0 229 | 12,P8,0 230 | 12,P9,0 231 | 12,P10,0 232 | 12,P11,0 233 | 12,P12,1 234 | 12,P13,0 235 | 12,P14,1 236 | 12,P15,1 237 | 12,P16,1 238 | 12,P17,1 239 | 12,P18,1 240 | 12,P19,0 241 | 12,P20,0 242 | 13,P1,1 243 | 13,P2,1 244 | 13,P3,0 245 | 13,P4,0 246 | 13,P5,1 247 | 13,P6,0 248 | 13,P7,0 249 | 13,P8,1 250 | 13,P9,1 251 | 13,P10,0 252 | 13,P11,0 253 | 13,P12,1 254 | 13,P13,1 255 | 13,P14,0 256 | 13,P15,0 257 | 13,P16,0 258 | 13,P17,0 259 | 13,P18,1 260 | 13,P19,1 261 | 13,P20,1 262 | 14,P1,1 263 | 14,P2,0 264 | 14,P3,1 265 | 14,P4,1 266 | 14,P5,0 267 | 14,P6,1 268 | 14,P7,1 269 | 14,P8,0 270 | 14,P9,0 271 | 14,P10,0 272 | 14,P11,0 273 | 14,P12,0 274 | 14,P13,1 275 | 14,P14,0 276 | 14,P15,1 277 | 14,P16,1 278 | 14,P17,0 279 | 14,P18,0 280 | 14,P19,1 281 | 14,P20,0 282 | 15,P1,0 283 | 15,P2,1 284 | 15,P3,0 285 | 15,P4,0 286 | 15,P5,1 287 | 15,P6,1 288 | 15,P7,1 289 | 15,P8,0 290 | 15,P9,0 291 | 15,P10,0 292 | 15,P11,0 293 | 15,P12,0 294 | 15,P13,0 295 | 15,P14,1 296 | 15,P15,0 297 | 15,P16,1 298 | 15,P17,0 299 | 15,P18,1 300 | 15,P19,0 301 | 15,P20,0 302 | 16,P1,0 303 | 16,P2,1 304 | 16,P3,0 305 | 16,P4,0 306 | 16,P5,0 307 | 16,P6,1 308 | 16,P7,1 309 | 16,P8,1 310 | 16,P9,0 311 | 16,P10,0 312 | 16,P11,0 313 | 16,P12,1 314 | 16,P13,0 315 | 16,P14,0 316 | 16,P15,0 317 | 16,P16,0 318 | 16,P17,0 319 | 16,P18,0 320 | 16,P19,1 321 | 16,P20,0 322 | 17,P1,1 323 | 17,P2,1 324 | 17,P3,1 325 | 17,P4,0 326 | 17,P5,1 327 | 17,P6,0 328 | 17,P7,0 329 | 17,P8,1 330 | 17,P9,0 331 | 17,P10,0 332 | 17,P11,1 333 | 17,P12,0 334 | 17,P13,1 335 | 17,P14,0 336 | 17,P15,0 337 | 17,P16,0 338 | 17,P17,1 339 | 17,P18,0 340 | 17,P19,1 341 | 17,P20,1 342 | 18,P1,1 343 | 18,P2,1 344 | 18,P3,0 345 | 18,P4,0 346 | 18,P5,0 347 | 18,P6,1 348 | 18,P7,0 349 | 18,P8,0 350 | 18,P9,0 351 | 18,P10,0 352 | 18,P11,0 353 | 18,P12,1 354 | 18,P13,0 355 | 18,P14,0 356 | 18,P15,0 357 | 18,P16,0 358 | 18,P17,1 359 | 18,P18,0 360 | 18,P19,0 361 | 18,P20,0 362 | 19,P1,1 363 | 19,P2,1 364 | 19,P3,0 365 | 19,P4,0 366 | 19,P5,0 367 | 19,P6,0 368 | 19,P7,0 369 | 19,P8,0 370 | 19,P9,0 371 | 19,P10,0 372 | 19,P11,0 373 | 19,P12,1 374 | 19,P13,1 375 | 19,P14,0 376 | 19,P15,0 377 | 19,P16,1 378 | 19,P17,1 379 | 19,P18,0 380 | 19,P19,0 381 | 19,P20,0 382 | 20,P1,0 383 | 20,P2,1 384 | 20,P3,0 385 | 20,P4,1 386 | 20,P5,0 387 | 20,P6,0 388 | 20,P7,0 389 | 20,P8,0 390 | 20,P9,1 391 | 20,P10,0 392 | 20,P11,0 393 | 20,P12,1 394 | 20,P13,1 395 | 20,P14,1 396 | 20,P15,0 397 | 20,P16,0 398 | 20,P17,0 399 | 20,P18,0 
400 | 20,P19,0 401 | 20,P20,1 402 | 21,P1,0 403 | 21,P2,0 404 | 21,P3,0 405 | 21,P4,0 406 | 21,P5,1 407 | 21,P6,0 408 | 21,P7,1 409 | 21,P8,1 410 | 21,P9,1 411 | 21,P10,0 412 | 21,P11,1 413 | 21,P12,0 414 | 21,P13,0 415 | 21,P14,1 416 | 21,P15,1 417 | 21,P16,0 418 | 21,P17,1 419 | 21,P18,0 420 | 21,P19,1 421 | 21,P20,0 422 | 22,P1,1 423 | 22,P2,0 424 | 22,P3,0 425 | 22,P4,1 426 | 22,P5,0 427 | 22,P6,0 428 | 22,P7,0 429 | 22,P8,0 430 | 22,P9,1 431 | 22,P10,1 432 | 22,P11,1 433 | 22,P12,0 434 | 22,P13,0 435 | 22,P14,0 436 | 22,P15,1 437 | 22,P16,0 438 | 22,P17,0 439 | 22,P18,0 440 | 22,P19,0 441 | 22,P20,1 442 | 23,P1,1 443 | 23,P2,0 444 | 23,P3,1 445 | 23,P4,0 446 | 23,P5,0 447 | 23,P6,0 448 | 23,P7,0 449 | 23,P8,0 450 | 23,P9,0 451 | 23,P10,0 452 | 23,P11,1 453 | 23,P12,0 454 | 23,P13,1 455 | 23,P14,0 456 | 23,P15,0 457 | 23,P16,0 458 | 23,P17,0 459 | 23,P18,0 460 | 23,P19,0 461 | 23,P20,1 462 | 24,P1,1 463 | 24,P2,1 464 | 24,P3,0 465 | 24,P4,1 466 | 24,P5,0 467 | 24,P6,1 468 | 24,P7,0 469 | 24,P8,1 470 | 24,P9,1 471 | 24,P10,0 472 | 24,P11,1 473 | 24,P12,0 474 | 24,P13,0 475 | 24,P14,1 476 | 24,P15,1 477 | 24,P16,0 478 | 24,P17,1 479 | 24,P18,0 480 | 24,P19,0 481 | 24,P20,0 482 | 25,P1,0 483 | 25,P2,1 484 | 25,P3,0 485 | 25,P4,1 486 | 25,P5,0 487 | 25,P6,0 488 | 25,P7,0 489 | 25,P8,1 490 | 25,P9,0 491 | 25,P10,0 492 | 25,P11,0 493 | 25,P12,0 494 | 25,P13,1 495 | 25,P14,1 496 | 25,P15,0 497 | 25,P16,0 498 | 25,P17,0 499 | 25,P18,0 500 | 25,P19,0 501 | 25,P20,0 502 | 26,P1,0 503 | 26,P2,0 504 | 26,P3,0 505 | 26,P4,1 506 | 26,P5,0 507 | 26,P6,0 508 | 26,P7,0 509 | 26,P8,0 510 | 26,P9,0 511 | 26,P10,1 512 | 26,P11,1 513 | 26,P12,0 514 | 26,P13,1 515 | 26,P14,1 516 | 26,P15,1 517 | 26,P16,1 518 | 26,P17,0 519 | 26,P18,0 520 | 26,P19,0 521 | 26,P20,0 522 | 27,P1,0 523 | 27,P2,1 524 | 27,P3,0 525 | 27,P4,0 526 | 27,P5,0 527 | 27,P6,0 528 | 27,P7,0 529 | 27,P8,0 530 | 27,P9,0 531 | 27,P10,0 532 | 27,P11,0 533 | 27,P12,0 534 | 27,P13,1 535 | 27,P14,1 536 | 27,P15,1 537 | 27,P16,0 538 | 27,P17,0 539 | 27,P18,0 540 | 27,P19,0 541 | 27,P20,0 542 | 28,P1,0 543 | 28,P2,1 544 | 28,P3,0 545 | 28,P4,0 546 | 28,P5,0 547 | 28,P6,0 548 | 28,P7,1 549 | 28,P8,1 550 | 28,P9,1 551 | 28,P10,0 552 | 28,P11,1 553 | 28,P12,0 554 | 28,P13,0 555 | 28,P14,1 556 | 28,P15,0 557 | 28,P16,0 558 | 28,P17,0 559 | 28,P18,0 560 | 28,P19,1 561 | 28,P20,0 562 | 29,P1,0 563 | 29,P2,0 564 | 29,P3,0 565 | 29,P4,0 566 | 29,P5,0 567 | 29,P6,1 568 | 29,P7,1 569 | 29,P8,0 570 | 29,P9,0 571 | 29,P10,0 572 | 29,P11,0 573 | 29,P12,0 574 | 29,P13,0 575 | 29,P14,0 576 | 29,P15,1 577 | 29,P16,1 578 | 29,P17,0 579 | 29,P18,0 580 | 29,P19,0 581 | 29,P20,0 582 | 30,P1,0 583 | 30,P2,1 584 | 30,P3,1 585 | 30,P4,0 586 | 30,P5,1 587 | 30,P6,0 588 | 30,P7,0 589 | 30,P8,1 590 | 30,P9,0 591 | 30,P10,1 592 | 30,P11,0 593 | 30,P12,0 594 | 30,P13,0 595 | 30,P14,0 596 | 30,P15,0 597 | 30,P16,0 598 | 30,P17,0 599 | 30,P18,0 600 | 30,P19,0 601 | 30,P20,0 602 | 31,P1,1 603 | 31,P2,1 604 | 31,P3,0 605 | 31,P4,1 606 | 31,P5,1 607 | 31,P6,0 608 | 31,P7,0 609 | 31,P8,1 610 | 31,P9,1 611 | 31,P10,1 612 | 31,P11,1 613 | 31,P12,0 614 | 31,P13,0 615 | 31,P14,1 616 | 31,P15,1 617 | 31,P16,0 618 | 31,P17,0 619 | 31,P18,0 620 | 31,P19,0 621 | 31,P20,0 622 | 32,P1,0 623 | 32,P2,0 624 | 32,P3,0 625 | 32,P4,1 626 | 32,P5,1 627 | 32,P6,1 628 | 32,P7,0 629 | 32,P8,0 630 | 32,P9,0 631 | 32,P10,1 632 | 32,P11,0 633 | 32,P12,1 634 | 32,P13,1 635 | 32,P14,0 636 | 32,P15,0 637 | 32,P16,0 638 | 32,P17,1 639 | 32,P18,0 640 | 32,P19,1 641 | 32,P20,1 642 | 33,P1,0 643 | 33,P2,0 644 
| 33,P3,0 645 | 33,P4,0 646 | 33,P5,0 647 | 33,P6,0 648 | 33,P7,1 649 | 33,P8,1 650 | 33,P9,0 651 | 33,P10,1 652 | 33,P11,0 653 | 33,P12,0 654 | 33,P13,0 655 | 33,P14,0 656 | 33,P15,0 657 | 33,P16,0 658 | 33,P17,0 659 | 33,P18,0 660 | 33,P19,0 661 | 33,P20,0 662 | 34,P1,1 663 | 34,P2,1 664 | 34,P3,1 665 | 34,P4,0 666 | 34,P5,1 667 | 34,P6,1 668 | 34,P7,1 669 | 34,P8,0 670 | 34,P9,1 671 | 34,P10,0 672 | 34,P11,1 673 | 34,P12,0 674 | 34,P13,1 675 | 34,P14,0 676 | 34,P15,0 677 | 34,P16,0 678 | 34,P17,0 679 | 34,P18,1 680 | 34,P19,0 681 | 34,P20,0 682 | 35,P1,1 683 | 35,P2,1 684 | 35,P3,0 685 | 35,P4,0 686 | 35,P5,0 687 | 35,P6,0 688 | 35,P7,1 689 | 35,P8,1 690 | 35,P9,0 691 | 35,P10,0 692 | 35,P11,1 693 | 35,P12,0 694 | 35,P13,0 695 | 35,P14,0 696 | 35,P15,0 697 | 35,P16,0 698 | 35,P17,0 699 | 35,P18,1 700 | 35,P19,0 701 | 35,P20,0 702 | 36,P1,0 703 | 36,P2,1 704 | 36,P3,0 705 | 36,P4,1 706 | 36,P5,0 707 | 36,P6,0 708 | 36,P7,0 709 | 36,P8,0 710 | 36,P9,1 711 | 36,P10,0 712 | 36,P11,1 713 | 36,P12,0 714 | 36,P13,0 715 | 36,P14,1 716 | 36,P15,0 717 | 36,P16,0 718 | 36,P17,0 719 | 36,P18,0 720 | 36,P19,0 721 | 36,P20,1 722 | 37,P1,1 723 | 37,P2,1 724 | 37,P3,0 725 | 37,P4,1 726 | 37,P5,0 727 | 37,P6,0 728 | 37,P7,0 729 | 37,P8,1 730 | 37,P9,1 731 | 37,P10,1 732 | 37,P11,0 733 | 37,P12,1 734 | 37,P13,1 735 | 37,P14,0 736 | 37,P15,0 737 | 37,P16,0 738 | 37,P17,0 739 | 37,P18,0 740 | 37,P19,0 741 | 37,P20,1 742 | 38,P1,1 743 | 38,P2,1 744 | 38,P3,0 745 | 38,P4,1 746 | 38,P5,0 747 | 38,P6,0 748 | 38,P7,1 749 | 38,P8,0 750 | 38,P9,0 751 | 38,P10,0 752 | 38,P11,0 753 | 38,P12,1 754 | 38,P13,0 755 | 38,P14,0 756 | 38,P15,0 757 | 38,P16,0 758 | 38,P17,1 759 | 38,P18,0 760 | 38,P19,0 761 | 38,P20,0 762 | 39,P1,0 763 | 39,P2,0 764 | 39,P3,0 765 | 39,P4,1 766 | 39,P5,0 767 | 39,P6,0 768 | 39,P7,0 769 | 39,P8,0 770 | 39,P9,1 771 | 39,P10,0 772 | 39,P11,0 773 | 39,P12,1 774 | 39,P13,1 775 | 39,P14,0 776 | 39,P15,0 777 | 39,P16,1 778 | 39,P17,0 779 | 39,P18,0 780 | 39,P19,0 781 | 39,P20,1 782 | 40,P1,0 783 | 40,P2,1 784 | 40,P3,0 785 | 40,P4,0 786 | 40,P5,1 787 | 40,P6,0 788 | 40,P7,0 789 | 40,P8,0 790 | 40,P9,0 791 | 40,P10,1 792 | 40,P11,0 793 | 40,P12,0 794 | 40,P13,1 795 | 40,P14,0 796 | 40,P15,1 797 | 40,P16,0 798 | 40,P17,0 799 | 40,P18,1 800 | 40,P19,1 801 | 40,P20,0 802 | 41,P1,1 803 | 41,P2,0 804 | 41,P3,0 805 | 41,P4,0 806 | 41,P5,0 807 | 41,P6,1 808 | 41,P7,1 809 | 41,P8,1 810 | 41,P9,0 811 | 41,P10,0 812 | 41,P11,1 813 | 41,P12,1 814 | 41,P13,1 815 | 41,P14,0 816 | 41,P15,0 817 | 41,P16,0 818 | 41,P17,0 819 | 41,P18,0 820 | 41,P19,0 821 | 41,P20,0 822 | 42,P1,0 823 | 42,P2,1 824 | 42,P3,0 825 | 42,P4,1 826 | 42,P5,0 827 | 42,P6,0 828 | 42,P7,0 829 | 42,P8,0 830 | 42,P9,1 831 | 42,P10,0 832 | 42,P11,1 833 | 42,P12,1 834 | 42,P13,0 835 | 42,P14,0 836 | 42,P15,0 837 | 42,P16,0 838 | 42,P17,0 839 | 42,P18,0 840 | 42,P19,0 841 | 42,P20,0 842 | 43,P1,1 843 | 43,P2,1 844 | 43,P3,1 845 | 43,P4,1 846 | 43,P5,1 847 | 43,P6,0 848 | 43,P7,1 849 | 43,P8,0 850 | 43,P9,1 851 | 43,P10,0 852 | 43,P11,1 853 | 43,P12,1 854 | 43,P13,1 855 | 43,P14,1 856 | 43,P15,0 857 | 43,P16,0 858 | 43,P17,1 859 | 43,P18,1 860 | 43,P19,1 861 | 43,P20,0 862 | 44,P1,0 863 | 44,P2,1 864 | 44,P3,1 865 | 44,P4,0 866 | 44,P5,0 867 | 44,P6,0 868 | 44,P7,0 869 | 44,P8,1 870 | 44,P9,0 871 | 44,P10,0 872 | 44,P11,1 873 | 44,P12,0 874 | 44,P13,1 875 | 44,P14,0 876 | 44,P15,0 877 | 44,P16,0 878 | 44,P17,1 879 | 44,P18,0 880 | 44,P19,1 881 | 44,P20,1 882 | 45,P1,0 883 | 45,P2,0 884 | 45,P3,1 885 | 45,P4,0 886 | 45,P5,0 887 | 45,P6,0 888 | 
45,P7,1 889 | 45,P8,0 890 | 45,P9,0 891 | 45,P10,0 892 | 45,P11,1 893 | 45,P12,0 894 | 45,P13,0 895 | 45,P14,1 896 | 45,P15,0 897 | 45,P16,1 898 | 45,P17,1 899 | 45,P18,0 900 | 45,P19,0 901 | 45,P20,0 902 | 46,P1,0 903 | 46,P2,0 904 | 46,P3,0 905 | 46,P4,0 906 | 46,P5,1 907 | 46,P6,0 908 | 46,P7,1 909 | 46,P8,0 910 | 46,P9,1 911 | 46,P10,0 912 | 46,P11,0 913 | 46,P12,0 914 | 46,P13,0 915 | 46,P14,0 916 | 46,P15,1 917 | 46,P16,0 918 | 46,P17,0 919 | 46,P18,0 920 | 46,P19,0 921 | 46,P20,0 922 | 47,P1,0 923 | 47,P2,0 924 | 47,P3,0 925 | 47,P4,1 926 | 47,P5,0 927 | 47,P6,1 928 | 47,P7,0 929 | 47,P8,0 930 | 47,P9,0 931 | 47,P10,1 932 | 47,P11,0 933 | 47,P12,0 934 | 47,P13,0 935 | 47,P14,0 936 | 47,P15,0 937 | 47,P16,0 938 | 47,P17,0 939 | 47,P18,0 940 | 47,P19,0 941 | 47,P20,0 942 | 48,P1,1 943 | 48,P2,0 944 | 48,P3,1 945 | 48,P4,0 946 | 48,P5,1 947 | 48,P6,0 948 | 48,P7,1 949 | 48,P8,0 950 | 48,P9,1 951 | 48,P10,0 952 | 48,P11,0 953 | 48,P12,0 954 | 48,P13,1 955 | 48,P14,1 956 | 48,P15,0 957 | 48,P16,0 958 | 48,P17,1 959 | 48,P18,0 960 | 48,P19,1 961 | 48,P20,0 962 | 49,P1,0 963 | 49,P2,0 964 | 49,P3,0 965 | 49,P4,0 966 | 49,P5,1 967 | 49,P6,0 968 | 49,P7,0 969 | 49,P8,0 970 | 49,P9,0 971 | 49,P10,1 972 | 49,P11,0 973 | 49,P12,0 974 | 49,P13,0 975 | 49,P14,0 976 | 49,P15,0 977 | 49,P16,1 978 | 49,P17,0 979 | 49,P18,0 980 | 49,P19,1 981 | 49,P20,0 982 | 50,P1,0 983 | 50,P2,0 984 | 50,P3,0 985 | 50,P4,0 986 | 50,P5,0 987 | 50,P6,0 988 | 50,P7,0 989 | 50,P8,0 990 | 50,P9,0 991 | 50,P10,0 992 | 50,P11,0 993 | 50,P12,0 994 | 50,P13,1 995 | 50,P14,1 996 | 50,P15,0 997 | 50,P16,1 998 | 50,P17,1 999 | 50,P18,0 1000 | 50,P19,1 1001 | 50,P20,1 1002 | 51,P1,0 1003 | 51,P2,0 1004 | 51,P3,1 1005 | 51,P4,1 1006 | 51,P5,0 1007 | 51,P6,0 1008 | 51,P7,0 1009 | 51,P8,0 1010 | 51,P9,0 1011 | 51,P10,0 1012 | 51,P11,1 1013 | 51,P12,1 1014 | 51,P13,0 1015 | 51,P14,1 1016 | 51,P15,0 1017 | 51,P16,1 1018 | 51,P17,0 1019 | 51,P18,0 1020 | 51,P19,0 1021 | 51,P20,0 1022 | 52,P1,0 1023 | 52,P2,0 1024 | 52,P3,0 1025 | 52,P4,1 1026 | 52,P5,0 1027 | 52,P6,0 1028 | 52,P7,0 1029 | 52,P8,1 1030 | 52,P9,0 1031 | 52,P10,1 1032 | 52,P11,1 1033 | 52,P12,0 1034 | 52,P13,0 1035 | 52,P14,0 1036 | 52,P15,0 1037 | 52,P16,0 1038 | 52,P17,1 1039 | 52,P18,0 1040 | 52,P19,0 1041 | 52,P20,0 1042 | 53,P1,0 1043 | 53,P2,0 1044 | 53,P3,0 1045 | 53,P4,0 1046 | 53,P5,0 1047 | 53,P6,1 1048 | 53,P7,0 1049 | 53,P8,0 1050 | 53,P9,0 1051 | 53,P10,0 1052 | 53,P11,1 1053 | 53,P12,1 1054 | 53,P13,0 1055 | 53,P14,0 1056 | 53,P15,0 1057 | 53,P16,1 1058 | 53,P17,0 1059 | 53,P18,0 1060 | 53,P19,0 1061 | 53,P20,0 1062 | 54,P1,1 1063 | 54,P2,1 1064 | 54,P3,1 1065 | 54,P4,0 1066 | 54,P5,0 1067 | 54,P6,0 1068 | 54,P7,0 1069 | 54,P8,0 1070 | 54,P9,1 1071 | 54,P10,1 1072 | 54,P11,0 1073 | 54,P12,0 1074 | 54,P13,0 1075 | 54,P14,0 1076 | 54,P15,1 1077 | 54,P16,1 1078 | 54,P17,0 1079 | 54,P18,0 1080 | 54,P19,0 1081 | 54,P20,0 1082 | 55,P1,1 1083 | 55,P2,0 1084 | 55,P3,0 1085 | 55,P4,0 1086 | 55,P5,0 1087 | 55,P6,0 1088 | 55,P7,0 1089 | 55,P8,1 1090 | 55,P9,0 1091 | 55,P10,0 1092 | 55,P11,1 1093 | 55,P12,1 1094 | 55,P13,1 1095 | 55,P14,0 1096 | 55,P15,0 1097 | 55,P16,0 1098 | 55,P17,1 1099 | 55,P18,0 1100 | 55,P19,0 1101 | 55,P20,0 1102 | 56,P1,1 1103 | 56,P2,0 1104 | 56,P3,0 1105 | 56,P4,0 1106 | 56,P5,0 1107 | 56,P6,0 1108 | 56,P7,0 1109 | 56,P8,0 1110 | 56,P9,1 1111 | 56,P10,1 1112 | 56,P11,0 1113 | 56,P12,1 1114 | 56,P13,0 1115 | 56,P14,1 1116 | 56,P15,0 1117 | 56,P16,0 1118 | 56,P17,0 1119 | 56,P18,0 1120 | 56,P19,0 1121 | 56,P20,1 1122 | 57,P1,0 1123 | 57,P2,0 
1124 | 57,P3,0 1125 | 57,P4,1 1126 | 57,P5,0 1127 | 57,P6,0 1128 | 57,P7,1 1129 | 57,P8,1 1130 | 57,P9,0 1131 | 57,P10,0 1132 | 57,P11,0 1133 | 57,P12,1 1134 | 57,P13,0 1135 | 57,P14,0 1136 | 57,P15,0 1137 | 57,P16,0 1138 | 57,P17,0 1139 | 57,P18,1 1140 | 57,P19,1 1141 | 57,P20,1 1142 | 58,P1,1 1143 | 58,P2,1 1144 | 58,P3,0 1145 | 58,P4,0 1146 | 58,P5,1 1147 | 58,P6,0 1148 | 58,P7,1 1149 | 58,P8,1 1150 | 58,P9,0 1151 | 58,P10,0 1152 | 58,P11,0 1153 | 58,P12,0 1154 | 58,P13,0 1155 | 58,P14,0 1156 | 58,P15,0 1157 | 58,P16,1 1158 | 58,P17,0 1159 | 58,P18,1 1160 | 58,P19,0 1161 | 58,P20,0 1162 | 59,P1,0 1163 | 59,P2,0 1164 | 59,P3,0 1165 | 59,P4,1 1166 | 59,P5,1 1167 | 59,P6,1 1168 | 59,P7,0 1169 | 59,P8,1 1170 | 59,P9,0 1171 | 59,P10,0 1172 | 59,P11,1 1173 | 59,P12,0 1174 | 59,P13,0 1175 | 59,P14,1 1176 | 59,P15,0 1177 | 59,P16,1 1178 | 59,P17,0 1179 | 59,P18,0 1180 | 59,P19,0 1181 | 59,P20,0 1182 | 60,P1,0 1183 | 60,P2,1 1184 | 60,P3,0 1185 | 60,P4,1 1186 | 60,P5,0 1187 | 60,P6,1 1188 | 60,P7,0 1189 | 60,P8,1 1190 | 60,P9,0 1191 | 60,P10,1 1192 | 60,P11,0 1193 | 60,P12,0 1194 | 60,P13,0 1195 | 60,P14,0 1196 | 60,P15,0 1197 | 60,P16,1 1198 | 60,P17,0 1199 | 60,P18,1 1200 | 60,P19,0 1201 | 60,P20,0 1202 | 61,P1,0 1203 | 61,P2,0 1204 | 61,P3,0 1205 | 61,P4,1 1206 | 61,P5,0 1207 | 61,P6,0 1208 | 61,P7,0 1209 | 61,P8,1 1210 | 61,P9,1 1211 | 61,P10,0 1212 | 61,P11,1 1213 | 61,P12,0 1214 | 61,P13,0 1215 | 61,P14,1 1216 | 61,P15,1 1217 | 61,P16,0 1218 | 61,P17,0 1219 | 61,P18,1 1220 | 61,P19,0 1221 | 61,P20,0 1222 | 62,P1,1 1223 | 62,P2,1 1224 | 62,P3,0 1225 | 62,P4,1 1226 | 62,P5,1 1227 | 62,P6,1 1228 | 62,P7,0 1229 | 62,P8,0 1230 | 62,P9,0 1231 | 62,P10,0 1232 | 62,P11,1 1233 | 62,P12,0 1234 | 62,P13,0 1235 | 62,P14,0 1236 | 62,P15,0 1237 | 62,P16,0 1238 | 62,P17,1 1239 | 62,P18,0 1240 | 62,P19,1 1241 | 62,P20,0 1242 | 63,P1,1 1243 | 63,P2,1 1244 | 63,P3,1 1245 | 63,P4,1 1246 | 63,P5,0 1247 | 63,P6,0 1248 | 63,P7,1 1249 | 63,P8,1 1250 | 63,P9,1 1251 | 63,P10,0 1252 | 63,P11,1 1253 | 63,P12,1 1254 | 63,P13,0 1255 | 63,P14,1 1256 | 63,P15,0 1257 | 63,P16,0 1258 | 63,P17,0 1259 | 63,P18,0 1260 | 63,P19,0 1261 | 63,P20,0 1262 | 64,P1,1 1263 | 64,P2,0 1264 | 64,P3,0 1265 | 64,P4,1 1266 | 64,P5,0 1267 | 64,P6,0 1268 | 64,P7,0 1269 | 64,P8,0 1270 | 64,P9,1 1271 | 64,P10,0 1272 | 64,P11,1 1273 | 64,P12,0 1274 | 64,P13,1 1275 | 64,P14,1 1276 | 64,P15,0 1277 | 64,P16,1 1278 | 64,P17,1 1279 | 64,P18,0 1280 | 64,P19,1 1281 | 64,P20,0 1282 | 65,P1,0 1283 | 65,P2,1 1284 | 65,P3,1 1285 | 65,P4,1 1286 | 65,P5,0 1287 | 65,P6,1 1288 | 65,P7,0 1289 | 65,P8,0 1290 | 65,P9,0 1291 | 65,P10,1 1292 | 65,P11,0 1293 | 65,P12,0 1294 | 65,P13,0 1295 | 65,P14,0 1296 | 65,P15,0 1297 | 65,P16,1 1298 | 65,P17,0 1299 | 65,P18,0 1300 | 65,P19,0 1301 | 65,P20,0 1302 | 66,P1,1 1303 | 66,P2,1 1304 | 66,P3,1 1305 | 66,P4,0 1306 | 66,P5,0 1307 | 66,P6,1 1308 | 66,P7,0 1309 | 66,P8,1 1310 | 66,P9,1 1311 | 66,P10,0 1312 | 66,P11,0 1313 | 66,P12,0 1314 | 66,P13,0 1315 | 66,P14,0 1316 | 66,P15,0 1317 | 66,P16,1 1318 | 66,P17,1 1319 | 66,P18,0 1320 | 66,P19,0 1321 | 66,P20,0 1322 | 67,P1,0 1323 | 67,P2,0 1324 | 67,P3,0 1325 | 67,P4,0 1326 | 67,P5,1 1327 | 67,P6,0 1328 | 67,P7,0 1329 | 67,P8,1 1330 | 67,P9,0 1331 | 67,P10,0 1332 | 67,P11,1 1333 | 67,P12,1 1334 | 67,P13,0 1335 | 67,P14,1 1336 | 67,P15,0 1337 | 67,P16,0 1338 | 67,P17,0 1339 | 67,P18,0 1340 | 67,P19,1 1341 | 67,P20,0 1342 | 68,P1,1 1343 | 68,P2,0 1344 | 68,P3,0 1345 | 68,P4,0 1346 | 68,P5,1 1347 | 68,P6,0 1348 | 68,P7,0 1349 | 68,P8,0 1350 | 68,P9,1 1351 | 68,P10,0 1352 | 
68,P11,1 1353 | 68,P12,0 1354 | 68,P13,0 1355 | 68,P14,1 1356 | 68,P15,0 1357 | 68,P16,0 1358 | 68,P17,0 1359 | 68,P18,0 1360 | 68,P19,1 1361 | 68,P20,0 1362 | 69,P1,0 1363 | 69,P2,1 1364 | 69,P3,1 1365 | 69,P4,1 1366 | 69,P5,0 1367 | 69,P6,0 1368 | 69,P7,1 1369 | 69,P8,0 1370 | 69,P9,0 1371 | 69,P10,0 1372 | 69,P11,1 1373 | 69,P12,0 1374 | 69,P13,0 1375 | 69,P14,1 1376 | 69,P15,0 1377 | 69,P16,0 1378 | 69,P17,1 1379 | 69,P18,0 1380 | 69,P19,0 1381 | 69,P20,1 1382 | 70,P1,1 1383 | 70,P2,0 1384 | 70,P3,1 1385 | 70,P4,0 1386 | 70,P5,0 1387 | 70,P6,0 1388 | 70,P7,1 1389 | 70,P8,0 1390 | 70,P9,1 1391 | 70,P10,0 1392 | 70,P11,0 1393 | 70,P12,0 1394 | 70,P13,0 1395 | 70,P14,0 1396 | 70,P15,0 1397 | 70,P16,1 1398 | 70,P17,0 1399 | 70,P18,1 1400 | 70,P19,0 1401 | 70,P20,0 1402 | 71,P1,1 1403 | 71,P2,1 1404 | 71,P3,1 1405 | 71,P4,0 1406 | 71,P5,0 1407 | 71,P6,0 1408 | 71,P7,1 1409 | 71,P8,1 1410 | 71,P9,0 1411 | 71,P10,1 1412 | 71,P11,0 1413 | 71,P12,1 1414 | 71,P13,0 1415 | 71,P14,1 1416 | 71,P15,1 1417 | 71,P16,0 1418 | 71,P17,0 1419 | 71,P18,0 1420 | 71,P19,0 1421 | 71,P20,0 1422 | 72,P1,0 1423 | 72,P2,0 1424 | 72,P3,0 1425 | 72,P4,0 1426 | 72,P5,0 1427 | 72,P6,0 1428 | 72,P7,1 1429 | 72,P8,1 1430 | 72,P9,0 1431 | 72,P10,0 1432 | 72,P11,1 1433 | 72,P12,0 1434 | 72,P13,0 1435 | 72,P14,1 1436 | 72,P15,1 1437 | 72,P16,0 1438 | 72,P17,0 1439 | 72,P18,0 1440 | 72,P19,0 1441 | 72,P20,1 1442 | 73,P1,0 1443 | 73,P2,1 1444 | 73,P3,0 1445 | 73,P4,1 1446 | 73,P5,0 1447 | 73,P6,0 1448 | 73,P7,0 1449 | 73,P8,0 1450 | 73,P9,0 1451 | 73,P10,1 1452 | 73,P11,0 1453 | 73,P12,0 1454 | 73,P13,1 1455 | 73,P14,1 1456 | 73,P15,0 1457 | 73,P16,1 1458 | 73,P17,1 1459 | 73,P18,0 1460 | 73,P19,0 1461 | 73,P20,0 1462 | 74,P1,1 1463 | 74,P2,0 1464 | 74,P3,0 1465 | 74,P4,1 1466 | 74,P5,1 1467 | 74,P6,0 1468 | 74,P7,0 1469 | 74,P8,0 1470 | 74,P9,0 1471 | 74,P10,0 1472 | 74,P11,1 1473 | 74,P12,0 1474 | 74,P13,1 1475 | 74,P14,1 1476 | 74,P15,0 1477 | 74,P16,0 1478 | 74,P17,0 1479 | 74,P18,0 1480 | 74,P19,0 1481 | 74,P20,0 1482 | 75,P1,0 1483 | 75,P2,0 1484 | 75,P3,1 1485 | 75,P4,1 1486 | 75,P5,0 1487 | 75,P6,1 1488 | 75,P7,1 1489 | 75,P8,0 1490 | 75,P9,1 1491 | 75,P10,1 1492 | 75,P11,0 1493 | 75,P12,1 1494 | 75,P13,0 1495 | 75,P14,1 1496 | 75,P15,0 1497 | 75,P16,0 1498 | 75,P17,1 1499 | 75,P18,0 1500 | 75,P19,0 1501 | 75,P20,0 1502 | 76,P1,0 1503 | 76,P2,1 1504 | 76,P3,0 1505 | 76,P4,0 1506 | 76,P5,0 1507 | 76,P6,0 1508 | 76,P7,0 1509 | 76,P8,0 1510 | 76,P9,1 1511 | 76,P10,0 1512 | 76,P11,1 1513 | 76,P12,0 1514 | 76,P13,0 1515 | 76,P14,1 1516 | 76,P15,0 1517 | 76,P16,1 1518 | 76,P17,0 1519 | 76,P18,1 1520 | 76,P19,0 1521 | 76,P20,0 1522 | 77,P1,1 1523 | 77,P2,1 1524 | 77,P3,1 1525 | 77,P4,1 1526 | 77,P5,0 1527 | 77,P6,0 1528 | 77,P7,0 1529 | 77,P8,0 1530 | 77,P9,0 1531 | 77,P10,0 1532 | 77,P11,1 1533 | 77,P12,1 1534 | 77,P13,0 1535 | 77,P14,0 1536 | 77,P15,0 1537 | 77,P16,0 1538 | 77,P17,0 1539 | 77,P18,0 1540 | 77,P19,1 1541 | 77,P20,1 1542 | 78,P1,0 1543 | 78,P2,0 1544 | 78,P3,0 1545 | 78,P4,0 1546 | 78,P5,1 1547 | 78,P6,0 1548 | 78,P7,0 1549 | 78,P8,1 1550 | 78,P9,1 1551 | 78,P10,0 1552 | 78,P11,0 1553 | 78,P12,0 1554 | 78,P13,1 1555 | 78,P14,0 1556 | 78,P15,0 1557 | 78,P16,0 1558 | 78,P17,1 1559 | 78,P18,0 1560 | 78,P19,0 1561 | 78,P20,0 1562 | 79,P1,0 1563 | 79,P2,0 1564 | 79,P3,0 1565 | 79,P4,1 1566 | 79,P5,0 1567 | 79,P6,0 1568 | 79,P7,0 1569 | 79,P8,0 1570 | 79,P9,1 1571 | 79,P10,0 1572 | 79,P11,1 1573 | 79,P12,0 1574 | 79,P13,0 1575 | 79,P14,0 1576 | 79,P15,0 1577 | 79,P16,0 1578 | 79,P17,0 1579 | 79,P18,0 1580 | 
79,P19,1 1581 | 79,P20,0 1582 | 80,P1,1 1583 | 80,P2,1 1584 | 80,P3,0 1585 | 80,P4,1 1586 | 80,P5,0 1587 | 80,P6,0 1588 | 80,P7,0 1589 | 80,P8,0 1590 | 80,P9,0 1591 | 80,P10,0 1592 | 80,P11,0 1593 | 80,P12,0 1594 | 80,P13,0 1595 | 80,P14,0 1596 | 80,P15,1 1597 | 80,P16,0 1598 | 80,P17,1 1599 | 80,P18,0 1600 | 80,P19,0 1601 | 80,P20,0 1602 | 81,P1,0 1603 | 81,P2,1 1604 | 81,P3,1 1605 | 81,P4,0 1606 | 81,P5,0 1607 | 81,P6,0 1608 | 81,P7,0 1609 | 81,P8,1 1610 | 81,P9,0 1611 | 81,P10,1 1612 | 81,P11,1 1613 | 81,P12,0 1614 | 81,P13,1 1615 | 81,P14,1 1616 | 81,P15,0 1617 | 81,P16,0 1618 | 81,P17,0 1619 | 81,P18,0 1620 | 81,P19,0 1621 | 81,P20,1 1622 | 82,P1,0 1623 | 82,P2,1 1624 | 82,P3,0 1625 | 82,P4,0 1626 | 82,P5,1 1627 | 82,P6,0 1628 | 82,P7,0 1629 | 82,P8,0 1630 | 82,P9,0 1631 | 82,P10,1 1632 | 82,P11,0 1633 | 82,P12,0 1634 | 82,P13,1 1635 | 82,P14,1 1636 | 82,P15,0 1637 | 82,P16,0 1638 | 82,P17,1 1639 | 82,P18,1 1640 | 82,P19,0 1641 | 82,P20,0 1642 | 83,P1,1 1643 | 83,P2,1 1644 | 83,P3,0 1645 | 83,P4,0 1646 | 83,P5,0 1647 | 83,P6,0 1648 | 83,P7,0 1649 | 83,P8,0 1650 | 83,P9,0 1651 | 83,P10,0 1652 | 83,P11,0 1653 | 83,P12,1 1654 | 83,P13,1 1655 | 83,P14,0 1656 | 83,P15,1 1657 | 83,P16,0 1658 | 83,P17,1 1659 | 83,P18,1 1660 | 83,P19,0 1661 | 83,P20,0 1662 | 84,P1,0 1663 | 84,P2,1 1664 | 84,P3,0 1665 | 84,P4,0 1666 | 84,P5,0 1667 | 84,P6,0 1668 | 84,P7,0 1669 | 84,P8,0 1670 | 84,P9,0 1671 | 84,P10,0 1672 | 84,P11,1 1673 | 84,P12,1 1674 | 84,P13,1 1675 | 84,P14,1 1676 | 84,P15,0 1677 | 84,P16,0 1678 | 84,P17,0 1679 | 84,P18,0 1680 | 84,P19,0 1681 | 84,P20,0 1682 | 85,P1,0 1683 | 85,P2,1 1684 | 85,P3,0 1685 | 85,P4,1 1686 | 85,P5,0 1687 | 85,P6,0 1688 | 85,P7,1 1689 | 85,P8,0 1690 | 85,P9,1 1691 | 85,P10,0 1692 | 85,P11,1 1693 | 85,P12,0 1694 | 85,P13,0 1695 | 85,P14,1 1696 | 85,P15,0 1697 | 85,P16,0 1698 | 85,P17,0 1699 | 85,P18,1 1700 | 85,P19,0 1701 | 85,P20,0 1702 | 86,P1,0 1703 | 86,P2,0 1704 | 86,P3,1 1705 | 86,P4,1 1706 | 86,P5,1 1707 | 86,P6,1 1708 | 86,P7,1 1709 | 86,P8,0 1710 | 86,P9,0 1711 | 86,P10,0 1712 | 86,P11,1 1713 | 86,P12,0 1714 | 86,P13,0 1715 | 86,P14,1 1716 | 86,P15,0 1717 | 86,P16,0 1718 | 86,P17,1 1719 | 86,P18,0 1720 | 86,P19,1 1721 | 86,P20,0 1722 | 87,P1,0 1723 | 87,P2,1 1724 | 87,P3,1 1725 | 87,P4,1 1726 | 87,P5,0 1727 | 87,P6,0 1728 | 87,P7,0 1729 | 87,P8,0 1730 | 87,P9,0 1731 | 87,P10,0 1732 | 87,P11,0 1733 | 87,P12,0 1734 | 87,P13,1 1735 | 87,P14,0 1736 | 87,P15,0 1737 | 87,P16,0 1738 | 87,P17,1 1739 | 87,P18,0 1740 | 87,P19,0 1741 | 87,P20,0 1742 | 88,P1,1 1743 | 88,P2,0 1744 | 88,P3,0 1745 | 88,P4,0 1746 | 88,P5,1 1747 | 88,P6,1 1748 | 88,P7,0 1749 | 88,P8,0 1750 | 88,P9,1 1751 | 88,P10,0 1752 | 88,P11,1 1753 | 88,P12,0 1754 | 88,P13,1 1755 | 88,P14,0 1756 | 88,P15,0 1757 | 88,P16,1 1758 | 88,P17,1 1759 | 88,P18,0 1760 | 88,P19,0 1761 | 88,P20,0 1762 | 89,P1,0 1763 | 89,P2,0 1764 | 89,P3,1 1765 | 89,P4,0 1766 | 89,P5,0 1767 | 89,P6,1 1768 | 89,P7,1 1769 | 89,P8,0 1770 | 89,P9,0 1771 | 89,P10,0 1772 | 89,P11,0 1773 | 89,P12,1 1774 | 89,P13,0 1775 | 89,P14,1 1776 | 89,P15,0 1777 | 89,P16,0 1778 | 89,P17,0 1779 | 89,P18,0 1780 | 89,P19,0 1781 | 89,P20,0 1782 | 90,P1,0 1783 | 90,P2,0 1784 | 90,P3,1 1785 | 90,P4,0 1786 | 90,P5,0 1787 | 90,P6,0 1788 | 90,P7,0 1789 | 90,P8,1 1790 | 90,P9,1 1791 | 90,P10,1 1792 | 90,P11,1 1793 | 90,P12,0 1794 | 90,P13,1 1795 | 90,P14,1 1796 | 90,P15,1 1797 | 90,P16,1 1798 | 90,P17,0 1799 | 90,P18,1 1800 | 90,P19,0 1801 | 90,P20,0 1802 | 91,P1,1 1803 | 91,P2,0 1804 | 91,P3,0 1805 | 91,P4,0 1806 | 91,P5,0 1807 | 91,P6,1 1808 | 91,P7,0 
1809 | 91,P8,0 1810 | 91,P9,1 1811 | 91,P10,0 1812 | 91,P11,0 1813 | 91,P12,1 1814 | 91,P13,1 1815 | 91,P14,0 1816 | 91,P15,0 1817 | 91,P16,1 1818 | 91,P17,1 1819 | 91,P18,1 1820 | 91,P19,1 1821 | 91,P20,0 1822 | 92,P1,1 1823 | 92,P2,1 1824 | 92,P3,1 1825 | 92,P4,0 1826 | 92,P5,0 1827 | 92,P6,0 1828 | 92,P7,0 1829 | 92,P8,1 1830 | 92,P9,1 1831 | 92,P10,1 1832 | 92,P11,0 1833 | 92,P12,0 1834 | 92,P13,1 1835 | 92,P14,0 1836 | 92,P15,0 1837 | 92,P16,0 1838 | 92,P17,0 1839 | 92,P18,1 1840 | 92,P19,0 1841 | 92,P20,1 1842 | 93,P1,0 1843 | 93,P2,0 1844 | 93,P3,0 1845 | 93,P4,0 1846 | 93,P5,0 1847 | 93,P6,0 1848 | 93,P7,0 1849 | 93,P8,0 1850 | 93,P9,0 1851 | 93,P10,0 1852 | 93,P11,1 1853 | 93,P12,0 1854 | 93,P13,1 1855 | 93,P14,0 1856 | 93,P15,1 1857 | 93,P16,0 1858 | 93,P17,0 1859 | 93,P18,0 1860 | 93,P19,0 1861 | 93,P20,0 1862 | 94,P1,1 1863 | 94,P2,0 1864 | 94,P3,0 1865 | 94,P4,1 1866 | 94,P5,0 1867 | 94,P6,0 1868 | 94,P7,1 1869 | 94,P8,1 1870 | 94,P9,0 1871 | 94,P10,0 1872 | 94,P11,0 1873 | 94,P12,0 1874 | 94,P13,1 1875 | 94,P14,0 1876 | 94,P15,0 1877 | 94,P16,0 1878 | 94,P17,0 1879 | 94,P18,0 1880 | 94,P19,0 1881 | 94,P20,0 1882 | 95,P1,1 1883 | 95,P2,0 1884 | 95,P3,0 1885 | 95,P4,0 1886 | 95,P5,0 1887 | 95,P6,0 1888 | 95,P7,0 1889 | 95,P8,0 1890 | 95,P9,1 1891 | 95,P10,0 1892 | 95,P11,1 1893 | 95,P12,1 1894 | 95,P13,1 1895 | 95,P14,1 1896 | 95,P15,1 1897 | 95,P16,1 1898 | 95,P17,0 1899 | 95,P18,0 1900 | 95,P19,0 1901 | 95,P20,0 1902 | 96,P1,1 1903 | 96,P2,0 1904 | 96,P3,0 1905 | 96,P4,0 1906 | 96,P5,0 1907 | 96,P6,0 1908 | 96,P7,0 1909 | 96,P8,0 1910 | 96,P9,0 1911 | 96,P10,0 1912 | 96,P11,1 1913 | 96,P12,1 1914 | 96,P13,1 1915 | 96,P14,0 1916 | 96,P15,0 1917 | 96,P16,0 1918 | 96,P17,1 1919 | 96,P18,0 1920 | 96,P19,0 1921 | 96,P20,0 1922 | 97,P1,0 1923 | 97,P2,0 1924 | 97,P3,0 1925 | 97,P4,0 1926 | 97,P5,1 1927 | 97,P6,1 1928 | 97,P7,1 1929 | 97,P8,0 1930 | 97,P9,0 1931 | 97,P10,0 1932 | 97,P11,1 1933 | 97,P12,0 1934 | 97,P13,1 1935 | 97,P14,0 1936 | 97,P15,0 1937 | 97,P16,0 1938 | 97,P17,0 1939 | 97,P18,1 1940 | 97,P19,0 1941 | 97,P20,0 1942 | 98,P1,0 1943 | 98,P2,0 1944 | 98,P3,0 1945 | 98,P4,1 1946 | 98,P5,0 1947 | 98,P6,0 1948 | 98,P7,1 1949 | 98,P8,0 1950 | 98,P9,0 1951 | 98,P10,1 1952 | 98,P11,1 1953 | 98,P12,0 1954 | 98,P13,0 1955 | 98,P14,1 1956 | 98,P15,0 1957 | 98,P16,0 1958 | 98,P17,0 1959 | 98,P18,0 1960 | 98,P19,1 1961 | 98,P20,0 1962 | 99,P1,0 1963 | 99,P2,0 1964 | 99,P3,0 1965 | 99,P4,0 1966 | 99,P5,0 1967 | 99,P6,0 1968 | 99,P7,1 1969 | 99,P8,0 1970 | 99,P9,1 1971 | 99,P10,0 1972 | 99,P11,0 1973 | 99,P12,0 1974 | 99,P13,0 1975 | 99,P14,0 1976 | 99,P15,0 1977 | 99,P16,0 1978 | 99,P17,1 1979 | 99,P18,1 1980 | 99,P19,0 1981 | 99,P20,0 1982 | 100,P1,1 1983 | 100,P2,0 1984 | 100,P3,0 1985 | 100,P4,0 1986 | 100,P5,0 1987 | 100,P6,0 1988 | 100,P7,1 1989 | 100,P8,0 1990 | 100,P9,1 1991 | 100,P10,1 1992 | 100,P11,0 1993 | 100,P12,0 1994 | 100,P13,0 1995 | 100,P14,1 1996 | 100,P15,0 1997 | 100,P16,0 1998 | 100,P17,0 1999 | 100,P18,0 2000 | 100,P19,0 2001 | 100,P20,0 2002 | --------------------------------------------------------------------------------
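
Note (not part of the repository files): the three ch03 CSVs above, multi_table_product.csv, multi_table_log.csv, and multi_table_train.csv, form a small relational sample dataset. Below is a minimal sketch of how they might be joined for feature engineering, assuming pandas, the relative paths used in the dump, and the log column names implied by its rows (user_id, date, event, product_id); the repository's own ch03-03-multi_tables.py remains the authoritative version.

# illustrative sketch only; paths and the aggregation choice are assumptions
import pandas as pd

# load the three sample tables shown above
train = pd.read_csv('input/ch03/multi_table_train.csv')      # user_id, product_id, target
product = pd.read_csv('input/ch03/multi_table_product.csv')  # product_id, product_category, price
log = pd.read_csv('input/ch03/multi_table_log.csv')          # assumed: user_id, date, event, product_id

# attach product attributes to each training row
df = train.merge(product, on='product_id', how='left')

# example aggregate feature from the log: number of 'view' events per (user_id, product_id)
views = (log[log['event'] == 'view']
         .groupby(['user_id', 'product_id'])
         .size()
         .rename('view_count')
         .reset_index())
df = df.merge(views, on=['user_id', 'product_id'], how='left')
df['view_count'] = df['view_count'].fillna(0).astype(int)

print(df.head())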