├── src
│   ├── engine.py
│   ├── loss.py
│   ├── utils.py
│   ├── __init__.py
│   ├── dataset.py
│   ├── feature_generator.py
│   ├── dispatcher.py
│   ├── create_folds.py
│   ├── predict.py
│   ├── metrics.py
│   ├── train.py
│   ├── cross_validation.py
│   └── categorical.py
├── run.sh
├── environment.yml
├── .gitignore
├── LICENSE
└── notebooks
    └── Categorical_Features_1.ipynb
/src/engine.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/loss.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/dataset.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/feature_generator.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/dispatcher.py:
--------------------------------------------------------------------------------
1 | from sklearn import ensemble
2 | # 0.75091
3 | MODELS = {
4 |     "randomforest": ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, verbose=2),
5 |     "extratrees": ensemble.ExtraTreesClassifier(n_estimators=200, n_jobs=-1, verbose=2),
6 | }
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | export TRAINING_DATA=input/train_folds.csv
2 | export TEST_DATA=input/test_cat.csv
3 |
4 | export MODEL=$1
5 |
6 | #FOLD=0 python -m src.train
7 | #FOLD=1 python -m src.train
8 | #FOLD=2 python -m src.train
9 | #FOLD=3 python -m src.train
10 | #FOLD=4 python -m src.train
11 | python -m src.predict
--------------------------------------------------------------------------------
/src/create_folds.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn import model_selection
3 |
4 | if __name__ == "__main__":
5 |     df = pd.read_csv("../input/train_cat.csv")
6 |     df["kfold"] = -1
7 |
8 |     df = df.sample(frac=1, random_state=42).reset_index(drop=True)
9 |
10 |     kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)
11 |
12 |     for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.target.values)):
13 |         print(len(train_idx), len(val_idx))
14 |         df.loc[val_idx, 'kfold'] = fold
15 |
16 |     df.to_csv("../input/train_folds.csv", index=False)
17 |
--------------------------------------------------------------------------------
/src/predict.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from sklearn import ensemble
4 | from sklearn import preprocessing
5 | from sklearn import metrics
6 | import joblib
7 | import numpy as np
8 |
9 | from .
import dispatcher 10 | 11 | 12 | def predict(test_data_path, model_type, model_path): 13 | df = pd.read_csv(test_data_path) 14 | test_idx = df["id"].values 15 | predictions = None 16 | 17 | for FOLD in range(5): 18 | df = pd.read_csv(test_data_path) 19 | encoders = joblib.load(os.path.join(model_path, f"{model_type}_{FOLD}_label_encoder.pkl")) 20 | cols = joblib.load(os.path.join(model_path, f"{model_type}_{FOLD}_columns.pkl")) 21 | for c in encoders: 22 | lbl = encoders[c] 23 | df.loc[:, c] = df.loc[:, c].astype(str).fillna("NONE") 24 | df.loc[:, c] = lbl.transform(df[c].values.tolist()) 25 | 26 | clf = joblib.load(os.path.join(model_path, f"{model_type}_{FOLD}.pkl")) 27 | 28 | df = df[cols] 29 | preds = clf.predict_proba(df)[:, 1] 30 | 31 | if FOLD == 0: 32 | predictions = preds 33 | else: 34 | predictions += preds 35 | 36 | predictions /= 5 37 | 38 | sub = pd.DataFrame(np.column_stack((test_idx, predictions)), columns=["id", "target"]) 39 | return sub 40 | 41 | 42 | if __name__ == "__main__": 43 | submission = predict(test_data_path="input/test_cat.csv", 44 | model_type="randomforest", 45 | model_path="models/") 46 | submission.loc[:, "id"] = submission.loc[:, "id"].astype(int) 47 | submission.to_csv(f"models/rf_submission.csv", index=False) 48 | -------------------------------------------------------------------------------- /src/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics as skmetrics 2 | 3 | class ClassificationMetrics: 4 | def __init__(self): 5 | self.metrics = { 6 | "accuracy": self._accuracy, 7 | "f1": self._f1, 8 | "precision": self._precision, 9 | "recall": self._recall, 10 | "auc": self._auc, 11 | "logloss": self._logloss 12 | } 13 | 14 | def __call__(self, metric, y_true, y_pred, y_proba=None): 15 | if metric not in self.metrics: 16 | raise Exception("Metric not implemented") 17 | if metric == "auc": 18 | if y_proba is not None: 19 | return self._auc(y_true=y_true, y_pred=y_proba) 20 | else: 21 | raise Exception("y_proba cannot be None for AUC") 22 | elif metric == "logloss": 23 | if y_proba is not None: 24 | return self._logloss(y_true=y_true, y_pred=y_proba) 25 | else: 26 | raise Exception("y_proba cannot be None for logloss") 27 | else: 28 | return self.metrics[metric](y_true=y_true, y_pred=y_pred) 29 | 30 | @staticmethod 31 | def _auc(y_true, y_pred): 32 | return skmetrics.roc_auc_score(y_true=y_true, y_score=y_pred) 33 | 34 | @staticmethod 35 | def _accuracy(y_true, y_pred): 36 | return skmetrics.accuracy_score(y_true=y_true, y_pred=y_pred) 37 | 38 | @staticmethod 39 | def _f1(y_true, y_pred): 40 | return skmetrics.f1_score(y_true=y_true, y_pred=y_pred) 41 | 42 | @staticmethod 43 | def _recall(y_true, y_pred): 44 | return skmetrics.recall_score(y_true=y_true, y_pred=y_pred) 45 | 46 | @staticmethod 47 | def _precision(y_true, y_pred): 48 | return skmetrics.precision_score(y_true=y_true, y_pred=y_pred) 49 | 50 | @staticmethod 51 | def _logloss(y_true, y_pred): 52 | return skmetrics.log_loss(y_true=y_true, y_pred=y_pred) -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from sklearn import ensemble 4 | from sklearn import preprocessing 5 | from sklearn import metrics 6 | import joblib 7 | 8 | from . 
import dispatcher 9 | 10 | TRAINING_DATA = os.environ.get("TRAINING_DATA") 11 | TEST_DATA = os.environ.get("TEST_DATA") 12 | FOLD = int(os.environ.get("FOLD")) 13 | MODEL = os.environ.get("MODEL") 14 | 15 | FOLD_MAPPPING = { 16 | 0: [1, 2, 3, 4], 17 | 1: [0, 2, 3, 4], 18 | 2: [0, 1, 3, 4], 19 | 3: [0, 1, 2, 4], 20 | 4: [0, 1, 2, 3] 21 | } 22 | 23 | if __name__ == "__main__": 24 | df = pd.read_csv(TRAINING_DATA) 25 | df_test = pd.read_csv(TEST_DATA) 26 | train_df = df[df.kfold.isin(FOLD_MAPPPING.get(FOLD))].reset_index(drop=True) 27 | valid_df = df[df.kfold==FOLD].reset_index(drop=True) 28 | 29 | ytrain = train_df.target.values 30 | yvalid = valid_df.target.values 31 | 32 | train_df = train_df.drop(["id", "target", "kfold"], axis=1) 33 | valid_df = valid_df.drop(["id", "target", "kfold"], axis=1) 34 | 35 | valid_df = valid_df[train_df.columns] 36 | 37 | label_encoders = {} 38 | for c in train_df.columns: 39 | lbl = preprocessing.LabelEncoder() 40 | train_df.loc[:, c] = train_df.loc[:, c].astype(str).fillna("NONE") 41 | valid_df.loc[:, c] = valid_df.loc[:, c].astype(str).fillna("NONE") 42 | df_test.loc[:, c] = df_test.loc[:, c].astype(str).fillna("NONE") 43 | lbl.fit(train_df[c].values.tolist() + 44 | valid_df[c].values.tolist() + 45 | df_test[c].values.tolist()) 46 | train_df.loc[:, c] = lbl.transform(train_df[c].values.tolist()) 47 | valid_df.loc[:, c] = lbl.transform(valid_df[c].values.tolist()) 48 | label_encoders[c] = lbl 49 | 50 | # data is ready to train 51 | clf = dispatcher.MODELS[MODEL] 52 | clf.fit(train_df, ytrain) 53 | preds = clf.predict_proba(valid_df)[:, 1] 54 | print(metrics.roc_auc_score(yvalid, preds)) 55 | 56 | joblib.dump(label_encoders, f"models/{MODEL}_{FOLD}_label_encoder.pkl") 57 | joblib.dump(clf, f"models/{MODEL}_{FOLD}.pkl") 58 | joblib.dump(train_df.columns, f"models/{MODEL}_{FOLD}_columns.pkl") 59 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ml 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - blas=1.0=mkl 8 | - ca-certificates=2019.11.27=0 9 | - certifi=2019.11.28=py37_0 10 | - cffi=1.13.2=py37h2e261b9_0 11 | - cudatoolkit=10.1.243=h6bb024c_0 12 | - freetype=2.9.1=h8a8886c_1 13 | - intel-openmp=2019.4=243 14 | - joblib=0.14.1=py_0 15 | - jpeg=9b=h024ee3a_2 16 | - ld_impl_linux-64=2.33.1=h53a641e_7 17 | - libedit=3.1.20181209=hc058e9b_0 18 | - libffi=3.2.1=hd88cf55_4 19 | - libgcc-ng=9.1.0=hdf63c60_0 20 | - libgfortran-ng=7.3.0=hdf63c60_0 21 | - libpng=1.6.37=hbc83047_0 22 | - libstdcxx-ng=9.1.0=hdf63c60_0 23 | - libtiff=4.1.0=h2733197_0 24 | - mkl=2019.4=243 25 | - mkl-service=2.3.0=py37he904b0f_0 26 | - mkl_fft=1.0.15=py37ha843d7b_0 27 | - mkl_random=1.1.0=py37hd6b4f25_0 28 | - ncurses=6.1=he6710b0_1 29 | - ninja=1.9.0=py37hfd86e86_0 30 | - numpy=1.17.4=py37hc1035e2_0 31 | - numpy-base=1.17.4=py37hde5b4d6_0 32 | - olefile=0.46=py37_0 33 | - openssl=1.1.1d=h7b6447c_3 34 | - pandas=0.25.3=py37he6710b0_0 35 | - pillow=7.0.0=py37hb39fc2d_0 36 | - pip=19.3.1=py37_0 37 | - pycparser=2.19=py37_0 38 | - python=3.7.6=h0371630_1 39 | - python-dateutil=2.8.1=py_0 40 | - pytorch=1.3.1=py3.7_cuda10.1.243_cudnn7.6.3_0 41 | - pytz=2019.3=py_0 42 | - readline=7.0=h7b6447c_5 43 | - setuptools=44.0.0=py37_0 44 | - six=1.13.0=py37_0 45 | - sqlite=3.30.1=h7b6447c_0 46 | - tk=8.6.8=hbc83047_0 47 | - torchvision=0.4.2=py37_cu101 48 | - wheel=0.33.6=py37_0 49 | - xz=5.2.4=h14c3975_4 50 | - 
zlib=1.2.11=h7b6447c_3 51 | - zstd=1.3.7=h0b5b093_0 52 | - pip: 53 | - attrs==19.3.0 54 | - backcall==0.1.0 55 | - bleach==3.1.0 56 | - catboost==0.20.2 57 | - chardet==3.0.4 58 | - cycler==0.10.0 59 | - decorator==4.4.1 60 | - defusedxml==0.6.0 61 | - entrypoints==0.3 62 | - idna==2.8 63 | - importlib-metadata==1.3.0 64 | - ipykernel==5.1.3 65 | - ipython==7.11.1 66 | - ipython-genutils==0.2.0 67 | - ipywidgets==7.5.1 68 | - jedi==0.15.2 69 | - jinja2==2.10.3 70 | - json5==0.8.5 71 | - jsonschema==3.2.0 72 | - jupyter==1.0.0 73 | - jupyter-client==5.3.4 74 | - jupyter-console==6.0.0 75 | - jupyter-core==4.6.1 76 | - jupyterlab==1.2.4 77 | - jupyterlab-server==1.0.6 78 | - kaggle==1.5.6 79 | - kiwisolver==1.1.0 80 | - lightgbm==2.3.1 81 | - markupsafe==1.1.1 82 | - matplotlib==3.1.2 83 | - mistune==0.8.4 84 | - more-itertools==8.0.2 85 | - nbconvert==5.6.1 86 | - nbformat==5.0.3 87 | - notebook==6.0.2 88 | - pandocfilters==1.4.2 89 | - parso==0.5.2 90 | - pexpect==4.7.0 91 | - pickleshare==0.7.5 92 | - plotly==4.4.1 93 | - prometheus-client==0.7.1 94 | - prompt-toolkit==2.0.10 95 | - ptyprocess==0.6.0 96 | - pygments==2.5.2 97 | - pyparsing==2.4.6 98 | - pyrsistent==0.15.7 99 | - python-graphviz==0.13.2 100 | - python-slugify==4.0.0 101 | - pyzmq==18.1.1 102 | - qtconsole==4.6.0 103 | - requests==2.22.0 104 | - retrying==1.3.3 105 | - scikit-learn==0.22.1 106 | - scipy==1.4.1 107 | - send2trash==1.5.0 108 | - terminado==0.8.3 109 | - testpath==0.4.4 110 | - text-unidecode==1.3 111 | - tornado==6.0.3 112 | - tqdm==4.41.1 113 | - traitlets==4.3.3 114 | - urllib3==1.24.3 115 | - wcwidth==0.1.8 116 | - webencodings==0.5.1 117 | - widgetsnbextension==3.5.1 118 | - xgboost==0.90 119 | - zipp==0.6.0 120 | 121 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 |
132 | # input data and models
133 | input/
134 | models/
135 |
136 |
137 | # data files
138 | *.csv
139 | *.h5
140 | *.pkl
141 | *.pth
142 |
--------------------------------------------------------------------------------
/src/cross_validation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn import model_selection
3 |
4 | """
5 | - -- binary classification
6 | - -- multi class classification
7 | - -- multi label classification
8 | - -- single column regression
9 | - -- multi column regression
10 | - -- holdout
11 | """
12 |
13 |
14 | class CrossValidation:
15 |     def __init__(
16 |         self,
17 |         df,
18 |         target_cols,
19 |         shuffle,
20 |         problem_type="binary_classification",
21 |         multilabel_delimiter=",",
22 |         num_folds=5,
23 |         random_state=42
24 |     ):
25 |         self.dataframe = df
26 |         self.target_cols = target_cols
27 |         self.num_targets = len(target_cols)
28 |         self.problem_type = problem_type
29 |         self.num_folds = num_folds
30 |         self.shuffle = shuffle
31 |         self.random_state = random_state
32 |         self.multilabel_delimiter = multilabel_delimiter
33 |
34 |         if self.shuffle is True:
35 |             self.dataframe = self.dataframe.sample(frac=1).reset_index(drop=True)
36
| 37 | self.dataframe["kfold"] = -1 38 | 39 | def split(self): 40 | if self.problem_type in ("binary_classification", "multiclass_classification"): 41 | if self.num_targets != 1: 42 | raise Exception("Invalid number of targets for this problem type") 43 | target = self.target_cols[0] 44 | unique_values = self.dataframe[target].nunique() 45 | if unique_values == 1: 46 | raise Exception("Only one unique value found!") 47 | elif unique_values > 1: 48 | kf = model_selection.StratifiedKFold(n_splits=self.num_folds, 49 | shuffle=False) 50 | 51 | for fold, (train_idx, val_idx) in enumerate(kf.split(X=self.dataframe, y=self.dataframe[target].values)): 52 | self.dataframe.loc[val_idx, 'kfold'] = fold 53 | 54 | elif self.problem_type in ("single_col_regression", "multi_col_regression"): 55 | if self.num_targets != 1 and self.problem_type == "single_col_regression": 56 | raise Exception("Invalid number of targets for this problem type") 57 | if self.num_targets < 2 and self.problem_type == "multi_col_regression": 58 | raise Exception("Invalid number of targets for this problem type") 59 | kf = model_selection.KFold(n_splits=self.num_folds) 60 | for fold, (train_idx, val_idx) in enumerate(kf.split(X=self.dataframe)): 61 | self.dataframe.loc[val_idx, 'kfold'] = fold 62 | 63 | elif self.problem_type.startswith("holdout_"): 64 | holdout_percentage = int(self.problem_type.split("_")[1]) 65 | num_holdout_samples = int(len(self.dataframe) * holdout_percentage / 100) 66 | self.dataframe.loc[:len(self.dataframe) - num_holdout_samples, "kfold"] = 0 67 | self.dataframe.loc[len(self.dataframe) - num_holdout_samples:, "kfold"] = 1 68 | 69 | elif self.problem_type == "multilabel_classification": 70 | if self.num_targets != 1: 71 | raise Exception("Invalid number of targets for this problem type") 72 | targets = self.dataframe[self.target_cols[0]].apply(lambda x: len(str(x).split(self.multilabel_delimiter))) 73 | kf = model_selection.StratifiedKFold(n_splits=self.num_folds) 74 | for fold, (train_idx, val_idx) in enumerate(kf.split(X=self.dataframe, y=targets)): 75 | self.dataframe.loc[val_idx, 'kfold'] = fold 76 | 77 | else: 78 | raise Exception("Problem type not understood!") 79 | 80 | return self.dataframe 81 | 82 | 83 | if __name__ == "__main__": 84 | df = pd.read_csv("../input/train_multilabel.csv") 85 | cv = CrossValidation(df, shuffle=True, target_cols=["attribute_ids"], 86 | problem_type="multilabel_classification", multilabel_delimiter=" ") 87 | df_split = cv.split() 88 | print(df_split.head()) 89 | print(df_split.kfold.value_counts()) 90 | -------------------------------------------------------------------------------- /src/categorical.py: -------------------------------------------------------------------------------- 1 | from sklearn import preprocessing 2 | 3 | 4 | class CategoricalFeatures: 5 | def __init__(self, df, categorical_features, encoding_type, handle_na=False): 6 | """ 7 | df: pandas dataframe 8 | categorical_features: list of column names, e.g. ["ord_1", "nom_0"......] 
9 |         encoding_type: label, binary, ohe
10 |         handle_na: True/False
11 |         """
12 |         self.df = df
13 |         self.cat_feats = categorical_features
14 |         self.enc_type = encoding_type
15 |         self.handle_na = handle_na
16 |         self.label_encoders = dict()
17 |         self.binary_encoders = dict()
18 |         self.ohe = None
19 |
20 |         if self.handle_na:
21 |             for c in self.cat_feats:
22 |                 self.df.loc[:, c] = self.df.loc[:, c].fillna("-9999999").astype(str)
23 |         self.output_df = self.df.copy(deep=True)
24 |
25 |     def _label_encoding(self):
26 |         for c in self.cat_feats:
27 |             lbl = preprocessing.LabelEncoder()
28 |             lbl.fit(self.df[c].values)
29 |             self.output_df.loc[:, c] = lbl.transform(self.df[c].values)
30 |             self.label_encoders[c] = lbl
31 |         return self.output_df
32 |
33 |     def _label_binarization(self):
34 |         for c in self.cat_feats:
35 |             lbl = preprocessing.LabelBinarizer()
36 |             lbl.fit(self.df[c].values)
37 |             val = lbl.transform(self.df[c].values)
38 |             self.output_df = self.output_df.drop(c, axis=1)
39 |             for j in range(val.shape[1]):
40 |                 new_col_name = c + f"__bin_{j}"
41 |                 self.output_df[new_col_name] = val[:, j]
42 |             self.binary_encoders[c] = lbl
43 |         return self.output_df
44 |
45 |     def _one_hot(self):
46 |         self.ohe = preprocessing.OneHotEncoder()
47 |         self.ohe.fit(self.df[self.cat_feats].values)
48 |         return self.ohe.transform(self.df[self.cat_feats].values)
49 |
50 |     def fit_transform(self):
51 |         if self.enc_type == "label":
52 |             return self._label_encoding()
53 |         elif self.enc_type == "binary":
54 |             return self._label_binarization()
55 |         elif self.enc_type == "ohe":
56 |             return self._one_hot()
57 |         else:
58 |             raise Exception("Encoding type not understood")
59 |
60 |     def transform(self, dataframe):
61 |         if self.handle_na:
62 |             for c in self.cat_feats:
63 |                 dataframe.loc[:, c] = dataframe.loc[:, c].fillna("-9999999").astype(str)
64 |
65 |         if self.enc_type == "label":
66 |             for c, lbl in self.label_encoders.items():
67 |                 dataframe.loc[:, c] = lbl.transform(dataframe[c].values)
68 |             return dataframe
69 |
70 |         elif self.enc_type == "binary":
71 |             for c, lbl in self.binary_encoders.items():
72 |                 val = lbl.transform(dataframe[c].values)
73 |                 dataframe = dataframe.drop(c, axis=1)
74 |
75 |                 for j in range(val.shape[1]):
76 |                     new_col_name = c + f"__bin_{j}"
77 |                     dataframe[new_col_name] = val[:, j]
78 |             return dataframe
79 |
80 |         elif self.enc_type == "ohe":
81 |             return self.ohe.transform(dataframe[self.cat_feats].values)
82 |
83 |         else:
84 |             raise Exception("Encoding type not understood")
85 |
86 |
87 | if __name__ == "__main__":
88 |     import pandas as pd
89 |     from sklearn import linear_model
90 |     df = pd.read_csv("../input/train_cat.csv")
91 |     df_test = pd.read_csv("../input/test_cat.csv")
92 |     sample = pd.read_csv("../input/sample_submission.csv")
93 |
94 |     train_len = len(df)
95 |
96 |     df_test["target"] = -1
97 |     full_data = pd.concat([df, df_test])
98 |
99 |     cols = [c for c in df.columns if c not in ["id", "target"]]
100 |     cat_feats = CategoricalFeatures(full_data,
101 |                                     categorical_features=cols,
102 |                                     encoding_type="ohe",
103 |                                     handle_na=True)
104 |     full_data_transformed = cat_feats.fit_transform()
105 |
106 |     X = full_data_transformed[:train_len, :]
107 |     X_test = full_data_transformed[train_len:, :]
108 |
109 |     clf = linear_model.LogisticRegression()
110 |     clf.fit(X, df.target.values)
111 |     preds = clf.predict_proba(X_test)[:, 1]
112 |
113 |     sample.loc[:, "target"] = preds
114 |     sample.to_csv("submission.csv", index=False)
115 |
--------------------------------------------------------------------------------
/LICENSE:
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /notebooks/Categorical_Features_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn import preprocessing" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "df = pd.read_csv(\"../input/train_cat.csv\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
Output — first five rows of df (5 rows × 25 columns):

|   | id | bin_0 | bin_1 | bin_2 | bin_3 | bin_4 | nom_0 | nom_1 | nom_2 | nom_3 | ... | nom_9 | ord_0 | ord_1 | ord_2 | ord_3 | ord_4 | ord_5 | day | month | target |
|---|----|-------|-------|-------|-------|-------|-------|-------|-------|-------|-----|-------|-------|-------|-------|-------|-------|-------|-----|-------|--------|
| 0 | 0 | 0.0 | 0.0 | 0.0 | F | N | Red | Trapezoid | Hamster | Russia | ... | 02e7c8990 | 3.0 | Contributor | Hot | c | U | Pw | 6.0 | 3.0 | 0 |
| 1 | 1 | 1.0 | 1.0 | 0.0 | F | Y | Red | Star | Axolotl | NaN | ... | f37df64af | 3.0 | Grandmaster | Warm | e | X | pE | 7.0 | 7.0 | 0 |
| 2 | 2 | 0.0 | 1.0 | 0.0 | F | N | Red | NaN | Hamster | Canada | ... | NaN | 3.0 | NaN | Freezing | n | P | eN | 5.0 | 9.0 | 0 |
| 3 | 3 | NaN | 0.0 | 0.0 | F | N | Red | Circle | Hamster | Finland | ... | f9d456e57 | 1.0 | Novice | Lava Hot | a | C | NaN | 3.0 | 3.0 | 0 |
| 4 | 4 | 0.0 | NaN | 0.0 | T | N | Red | Triangle | Hamster | Costa Rica | ... | c5361037c | 3.0 | Grandmaster | Cold | h | C | OZ | 5.0 | 12.0 | 0 |