├── .gitignore
├── LICENSE
├── README.md
├── config
│   ├── optimization
│   │   ├── cat.yaml
│   │   ├── ensemble.yaml
│   │   └── xgb.yaml
│   └── train
│       ├── ensemble.yaml
│       ├── lgbm.yaml
│       └── xgb.yaml
├── log
│   └── log
├── notebook
│   └── eda.py
├── setup.cfg
└── src
    └── ai-hackaton
        ├── data
        │   ├── dataset.py
        │   └── features.py
        ├── ensemble.py
        ├── predict.py
        ├── trainer
        │   ├── boosting_tree.py
        │   └── gbdt.py
        ├── tuning
        │   └── bayesian.py
        ├── utils
        │   └── utils.py
        ├── xgb_train.py
        └── xgb_tuning.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
.vscode/
submit/
input/
models/
outputs/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Model Architecture
![Slide1](https://user-images.githubusercontent.com/46340424/136568847-347196a4-e53e-4e04-9674-4cd187ceb155.PNG)

## Benchmark
|Model|CV|Public|Private|
|-----|--|------|-------|
|LGBM (10-fold)|0.6784|0.6247|-|
|CatBoost (10-fold)|0.5701|0.5777|-|
|LightAutoML (10-fold)|0.5610|0.5398|-|
|Stacking-XGB (10-fold)|0.4808|0.4417|0.42097|
|Stacking-XGB (10-fold) + MEM x 4|-|0.4117|0.38599|

## Requirements
+ numpy
+ pandas
+ lightgbm
+ xgboost
+ catboost
+ optuna
+ hydra
+ neptune-ai

## Score
1st place on the public leaderboard, 2nd place on the private leaderboard.

--------------------------------------------------------------------------------
/config/optimization/cat.yaml:
--------------------------------------------------------------------------------
dataset:
  path: ../../input/ai-hackaton
  train: train_preprocess.csv

optimization:
  trials: 100
  params: best_cat.yaml

model:
  fold: 5

--------------------------------------------------------------------------------
/config/optimization/ensemble.yaml:
--------------------------------------------------------------------------------
dataset:
  path: ../../input/ai-hackaton

optimization:
  trials: 100
  params: ensemble.yaml

submit:
  path: ../../submit
--------------------------------------------------------------------------------
/config/optimization/xgb.yaml:
--------------------------------------------------------------------------------
dataset:
  path: ../../input/ai-hackaton
  train: train_preprocess.csv

optimization:
  trials: 100
  params: best_xgb.yaml

model:
  fold: 5

submit:
  path: ../../submit
--------------------------------------------------------------------------------
/config/train/ensemble.yaml:
--------------------------------------------------------------------------------
dataset:
  path: ../../input/ai-hackaton
  train: final_train.csv
  test: final_test.csv

weight:
  w1: 0.5
  w2: 0.5
  w3: 0
  w4: 0

submit:
  path: ../../submit
  name: cnn_cat_lightgbm_ensemble.csv
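
# Note on the weights (verifiable from predict.py): the final blend is
# w1 * mem_preds + w2 * stacking_preds; w3 and w4 are defined here but
# currently unused, and the active weights should sum to 1 so that every
# output row remains a valid probability distribution.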

--------------------------------------------------------------------------------
/config/train/lgbm.yaml:
--------------------------------------------------------------------------------
dataset:
  path: ../../input/ai-hackaton
  train: final_train.csv
  test: final_test.csv

model:
  fold: 5
  verbose: 100

params:
  reg_alpha: 1.0997191680377813e-05
  reg_lambda: 0.04104630401883339
  max_depth: 15
  num_leaves: 191
  colsample_bytree: 0.5198042692950159
  subsample: 0.6599641068895281
  subsample_freq: 9
  min_child_samples: 7
  min_child_weight: 0.1
  max_bin: 334
  n_jobs: -1
  n_estimators: 20000
  learning_rate: 0.05
  boosting_type: gbdt
  objective: multiclass
  eval_metric: multi_logloss
  random_state: 42

submit:
  path: ../../submit
  name: feature_xgboost.csv

--------------------------------------------------------------------------------
/config/train/xgb.yaml:
--------------------------------------------------------------------------------
dataset:
  path: ../../input/ai-hackaton
  train: final_train.csv
  test: final_test.csv

model:
  fold: 5
  verbose: 100

params:
  n_estimators: 20000
  max_depth: 8
  min_child_weight: 3
  gamma: 0.1257072555186611
  subsample: 0.5904938061995494
  colsample_bytree: 0.40553660142667064
  learning_rate: 0.029471899938397533
  n_jobs: -1
  reg_alpha: 0.000021790982753738414
  reg_lambda: 0.08978089573944274
  objective: multi:softprob
  eval_metric: mlogloss
  random_state: 42

submit:
  path: ../../submit
  name: feature_xgboost.csv

--------------------------------------------------------------------------------
/log/log:
--------------------------------------------------------------------------------
2021-10-13 22:53:47,808 [trainer.boosting_tree|INFO] fold 0: 0.5140553490034933
2021-10-13 22:54:39,255 [trainer.boosting_tree|INFO] fold 1: 0.5164974102880935
2021-10-13 22:55:30,989 [trainer.boosting_tree|INFO] fold 2: 0.5242156441473969
2021-10-13 22:56:21,319 [trainer.boosting_tree|INFO] fold 3: 0.5102627515797235
2021-10-13 22:57:01,388 [trainer.boosting_tree|INFO] fold 4: 0.49124442848873695
2021-10-13 22:57:01,493 [trainer.boosting_tree|INFO] oof score: 0.5112551167014888

--------------------------------------------------------------------------------
/notebook/eda.py:
--------------------------------------------------------------------------------
# %%
import numpy as np
import pandas as pd
# %%
mem1 = pd.read_csv("../input/ai-hackaton/mem-128-2-model1.csv")
mem1.head()
# %%
mem1.sort_values(by="id")
# %%
cnn_oof = np.load("../submit/total_cat_oof.npy")
cnn_preds = np.load("../submit/total_cat_pred.npy")

train_labels = pd.read_csv("../input/ai-hackaton/train_labels.csv")
cnn_oof = pd.DataFrame(cnn_oof, columns=[i for i in range(61)])

cnn_oof = pd.concat([train_labels.id, cnn_oof], axis=1)
cnn_oof.to_csv("total_cat_oof.csv", index=False)

submission = pd.read_csv("../input/ai-hackaton/sample_submission.csv")
submission.iloc[:, 1:] = np.load("../submit/total_cat_pred.npy")
submission.to_csv("total_cat_preds.csv", index=False)
# %%
train_y = pd.read_csv("../input/ai-hackaton/train_labels.csv")
train_y.label
# %%
from sklearn.metrics import log_loss

# drop the id column that was concatenated above before scoring
log_loss(train_y.label, cnn_oof.iloc[:, 1:])
# %%
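# Quick sanity check (an illustrative addition): every row of the generated
# submission should be a probability distribution over the 61 classes.
assert np.allclose(submission.iloc[:, 1:].sum(axis=1), 1.0, atol=1e-6)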

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[flake8]
max-line-length = 88
extend-ignore = E203, W503, E501

[isort]
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
ensure_newline_before_comments = True
line_length = 88

--------------------------------------------------------------------------------
/src/ai-hackaton/data/dataset.py:
--------------------------------------------------------------------------------
import math
import warnings
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew
from sklearn.cluster import KMeans

pi = math.pi
pd.options.display.max_columns = 500
warnings.filterwarnings("ignore")


def range_func(x: List[Union[int, float]]) -> float:
    max_val = np.max(x)
    min_val = np.min(x)
    range_val = max_val - min_val
    return range_val


def iqr_func2(x: List[Union[int, float]]) -> float:
    # inter-percentile range between the 20th and 80th percentiles
    q1, q3 = np.percentile(x, [20, 80])
    iqr = q3 - q1
    return iqr


def iqr_func3(x: List[Union[int, float]]) -> float:
    # inter-percentile range between the 40th and 60th percentiles
    q1, q3 = np.percentile(x, [40, 60])
    iqr = q3 - q1
    return iqr


def iqr_func4(x: List[Union[int, float]]) -> float:
    # inter-percentile range between the 15th and 95th percentiles
    q1, q3 = np.percentile(x, [15, 95])
    iqr = q3 - q1
    return iqr


def premad(x: List[Union[int, float]]) -> float:
    # median absolute deviation
    return np.median(np.absolute(x - np.median(x, axis=0)), axis=0)


def preskew(x: List[Union[int, float]]) -> float:
    return skew(x)


def prekurt(x: List[Union[int, float]]) -> float:
    return kurtosis(x, fisher=True)


def load_dataset(path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    train = pd.read_csv(path + "train_features.csv")
    test = pd.read_csv(path + "test_features.csv")
    change_train = train.drop("time", axis=1)
    train_change = change_train.groupby("id").diff().reset_index().fillna(0)
    change_test = test.drop("time", axis=1)
    test_change = change_test.groupby("id").diff().reset_index().fillna(0)
    train_change.rename(columns={"index": "id"}, inplace=True)
    test_change.rename(columns={"index": "id"}, inplace=True)
    train["acc_vector"] = np.sqrt(
        (train["acc_x"] ** 2) + (train["acc_y"] ** 2) + (train["acc_z"] ** 2)
    )
    train["gy_vector"] = np.sqrt(
        (train["gy_x"] ** 2) + (train["gy_y"] ** 2) + (train["gy_z"] ** 2)
    )

    test["acc_vector"] = np.sqrt(
        (test["acc_x"] ** 2) + (test["acc_y"] ** 2) + (test["acc_z"] ** 2)
    )
    test["gy_vector"] = np.sqrt(
        (test["gy_x"] ** 2) + (test["gy_y"] ** 2) + (test["gy_z"] ** 2)
    )
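
    # The *_vector columns above are Euclidean magnitudes, e.g.
    # |acc| = sqrt(acc_x^2 + acc_y^2 + acc_z^2), which are insensitive to
    # sensor orientation; the pairwise YZ/XY/XZ variants below restrict
    # the same idea to a single body plane.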
test["gy_YZvector"] = np.sqrt((test["gy_y"] ** 2) + (test["gy_z"] ** 2)) 87 | 88 | test["acc_XYvector"] = np.sqrt((test["acc_x"] ** 2) + (test["acc_y"] ** 2)) 89 | test["gy_XYvector"] = np.sqrt((test["gy_x"] ** 2) + (test["gy_y"] ** 2)) 90 | 91 | test["acc_XZvector"] = np.sqrt((test["acc_x"] ** 2) + (test["acc_z"] ** 2)) 92 | test["gy_XZvector"] = np.sqrt((test["gy_x"] ** 2) + (test["gy_z"] ** 2)) 93 | 94 | # 자이로스코프 무게중심 95 | train["gy_Centerofgravity"] = (train["gy_x"] + train["gy_y"] + train["gy_z"]) / 3 96 | test["gy_Centerofgravity"] = (test["gy_x"] + test["gy_y"] + test["gy_z"]) / 3 97 | # roll & pitch 98 | train["roll"] = np.arctan( 99 | train["acc_y"] / np.sqrt(train["acc_x"] ** 2 + train["acc_z"] ** 2) 100 | ) 101 | test["roll"] = np.arctan( 102 | test["acc_y"] / np.sqrt(test["acc_x"] ** 2 + test["acc_z"] ** 2) 103 | ) 104 | 105 | train["pitch"] = np.arctan( 106 | train["acc_x"] / np.sqrt(train["acc_y"] ** 2 + train["acc_z"] ** 2) 107 | ) 108 | test["pitch"] = np.arctan( 109 | test["acc_x"] / np.sqrt(test["acc_y"] ** 2 + test["acc_z"] ** 2) 110 | ) 111 | 112 | train["math_roll"] = np.arctan( 113 | -train["acc_x"] / np.sqrt(train["acc_y"] ** 2 + train["acc_z"] ** 2) 114 | ) * (180 / pi) 115 | train["math_pitch"] = np.arctan( 116 | train["acc_y"] / np.sqrt(train["acc_x"] ** 2 + train["acc_z"] ** 2) 117 | ) * (180 / pi) 118 | 119 | test["math_roll"] = np.arctan( 120 | -test["acc_x"] / np.sqrt(test["acc_y"] ** 2 + test["acc_z"] ** 2) 121 | ) * (180 / pi) 122 | test["math_pitch"] = np.arctan( 123 | test["acc_y"] / np.sqrt(test["acc_x"] ** 2 + test["acc_z"] ** 2) 124 | ) * (180 / pi) 125 | 126 | train["gy_roll"] = np.arctan( 127 | train["gy_y"] / np.sqrt(train["gy_x"] ** 2 + train["gy_z"] ** 2) 128 | ) 129 | test["gy_roll"] = np.arctan( 130 | test["gy_y"] / np.sqrt(test["gy_x"] ** 2 + test["gy_z"] ** 2) 131 | ) 132 | 133 | train["gy_pitch"] = np.arctan( 134 | train["gy_x"] / np.sqrt(train["gy_y"] ** 2 + train["gy_z"] ** 2) 135 | ) 136 | test["gy_pitch"] = np.arctan( 137 | test["gy_x"] / np.sqrt(test["gy_y"] ** 2 + test["gy_z"] ** 2) 138 | ) 139 | 140 | train["gy_math_roll"] = np.arctan( 141 | -train["gy_x"] / np.sqrt(train["gy_y"] ** 2 + train["gy_z"] ** 2) 142 | ) * (180 / pi) 143 | train["gy_math_pitch"] = np.arctan( 144 | train["gy_y"] / np.sqrt(train["gy_x"] ** 2 + train["gy_z"] ** 2) 145 | ) * (180 / pi) 146 | 147 | test["gy_math_roll"] = np.arctan( 148 | -test["gy_x"] / np.sqrt(test["gy_y"] ** 2 + test["gy_z"] ** 2) 149 | ) * (180 / pi) 150 | test["gy_math_pitch"] = np.arctan( 151 | test["gy_y"] / np.sqrt(test["gy_x"] ** 2 + test["gy_z"] ** 2) 152 | ) * (180 / pi) 153 | 154 | features = [ 155 | "id", 156 | "acc_x", 157 | "acc_y", 158 | "acc_z", 159 | "gy_x", 160 | "gy_y", 161 | "gy_z", 162 | "acc_vector", 163 | "gy_vector", 164 | "acc_YZvector", 165 | "gy_YZvector", 166 | "acc_XYvector", 167 | "gy_XYvector", 168 | "acc_XZvector", 169 | "gy_XZvector", 170 | "gy_Centerofgravity", 171 | ] 172 | features2 = [ 173 | "id", 174 | "roll", 175 | "pitch", 176 | "math_roll", 177 | "math_pitch", 178 | "gy_roll", 179 | "gy_pitch", 180 | "gy_math_roll", 181 | "gy_math_pitch", 182 | ] 183 | 184 | train_preprocess = ( 185 | train[features] 186 | .groupby("id") 187 | .agg( 188 | [ 189 | "max", 190 | "min", 191 | "mean", 192 | "std", 193 | "median", 194 | range_func, 195 | iqr_func2, 196 | iqr_func3, 197 | iqr_func4, 198 | premad, 199 | preskew, 200 | prekurt, 201 | ] 202 | ) 203 | ) 204 | temp_train_preprocess = ( 205 | train[features2] 206 | .groupby("id") 207 | .agg([range_func, iqr_func2, 
    train_preprocess["acc_std_mean"] = (
        train_preprocess["acc_x_std"]
        + train_preprocess["acc_y_std"]
        + train_preprocess["acc_z_std"]
    ) / 3
    train_preprocess["gy_std_mean"] = (
        train_preprocess["gy_x_std"]
        + train_preprocess["gy_y_std"]
        + train_preprocess["gy_z_std"]
    ) / 3

    test_preprocess["acc_std_mean"] = (
        test_preprocess["acc_x_std"]
        + test_preprocess["acc_y_std"]
        + test_preprocess["acc_z_std"]
    ) / 3
    test_preprocess["gy_std_mean"] = (
        test_preprocess["gy_x_std"]
        + test_preprocess["gy_y_std"]
        + test_preprocess["gy_z_std"]
    ) / 3

    # first accelerometer reading per id
    train_acc_head1 = (
        train.groupby(["id"])[["acc_x", "acc_y", "acc_z"]].first().reset_index()
    )
    train_acc_head1.columns = ["id", "first_acc_x", "first_acc_y", "first_acc_z"]
    train_acc_head1.set_index("id", inplace=True)

    test_acc_head1 = (
        test.groupby(["id"])[["acc_x", "acc_y", "acc_z"]].first().reset_index()
    )
    test_acc_head1.columns = ["id", "first_acc_x", "first_acc_y", "first_acc_z"]
    test_acc_head1.set_index("id", inplace=True)

    # accelerometer mean over the first 3 seconds (time < 150)
    train_acc_head = (
        train.loc[:, ["id", "time", "acc_x", "acc_y", "acc_z"]][train["time"] < 150]
        .drop("time", axis=1)
        .groupby("id")
        .mean()
        .reset_index()
    )
    train_acc_head.columns = ["id", "head_acc_x", "head_acc_y", "head_acc_z"]
    train_acc_head = train_acc_head.groupby("id").mean()

    test_acc_head = (
        test.loc[:, ["id", "time", "acc_x", "acc_y", "acc_z"]][test["time"] < 150]
        .drop("time", axis=1)
        .groupby("id")
        .mean()
        .reset_index()
    )
    test_acc_head.columns = ["id", "head_acc_x", "head_acc_y", "head_acc_z"]
    test_acc_head = test_acc_head.groupby("id").mean()

    train_preprocess = pd.concat(
        [train_preprocess, train_acc_head, train_acc_head1], axis=1
    )
    test_preprocess = pd.concat(
        [test_preprocess, test_acc_head, test_acc_head1], axis=1
    )

    # gyroscope mean over the first 3 seconds (time < 150)
    train_gy_head = (
        train.loc[:, ["id", "time", "gy_x", "gy_y", "gy_z"]][train["time"] < 150]
        .drop("time", axis=1)
        .groupby("id")
        .mean()
        .reset_index()
    )
    train_gy_head.columns = ["id", "head_gy_x", "head_gy_y", "head_gy_z"]
    train_gy_head = train_gy_head.groupby("id").mean()

    test_gy_head = (
        test.loc[:, ["id", "time", "gy_x", "gy_y", "gy_z"]][test["time"] < 150]
        .drop("time", axis=1)
        .groupby("id")
        .mean()
        .reset_index()
    )
    test_gy_head.columns = ["id", "head_gy_x", "head_gy_y", "head_gy_z"]
    test_gy_head = test_gy_head.groupby("id").mean()

    train_preprocess = pd.concat([train_preprocess, train_gy_head], axis=1)
    test_preprocess = pd.concat([test_preprocess, test_gy_head], axis=1)
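
    # A KMeans cluster id (fit on the train aggregates only) is appended as
    # one extra feature; the same fitted model labels the test rows, so no
    # test information leaks into the clustering.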
    model = KMeans(n_clusters=5, random_state=20)
    model.fit(train_preprocess)
    train_predict = model.predict(train_preprocess)
    train_preprocess["cluster"] = train_predict

    test_predict = model.predict(test_preprocess)
    test_preprocess["cluster"] = test_predict

    column_name = train_preprocess.iloc[:, :247].columns.tolist()
    column_name.extend(
        [i[0] + "-" + i[1] for i in train_preprocess.iloc[:, 247:-1].columns]
    )
    column_name.extend(list(train_preprocess.iloc[:, -1:].columns))

    train_preprocess.columns = column_name
    test_preprocess.columns = column_name

    return train_preprocess, test_preprocess


def make_oof_preds(
    path: str, oof_path: str, preds_path: str
) -> Tuple[np.ndarray, np.ndarray]:
    # oof load
    model_oof = pd.read_csv(path + oof_path)
    model_oof = model_oof.sort_values(by="id")
    model_oof = model_oof.drop("id", axis=1).values

    # preds load
    model_preds = pd.read_csv(path + preds_path)
    model_preds = model_preds.drop("id", axis=1).values
    return model_oof, model_preds

--------------------------------------------------------------------------------
/src/ai-hackaton/data/features.py:
--------------------------------------------------------------------------------
from typing import Tuple

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from shap import TreeExplainer


def select_features(
    train: pd.DataFrame, label: pd.Series, test: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:

    model = LGBMClassifier(random_state=42)
    print(f"{model.__class__.__name__} Train Start!")
    model.fit(train, label)
    explainer = TreeExplainer(model)

    shap_values = explainer.shap_values(test)
    shap_sum = np.abs(shap_values).mean(axis=1).sum(axis=0)

    importance_df = pd.DataFrame([test.columns.tolist(), shap_sum.tolist()]).T
    importance_df.columns = ["column_name", "shap_importance"]

    importance_df = importance_df.sort_values("shap_importance", ascending=False)
    importance_df = importance_df.query("shap_importance != 0")
    boosting_shap_col = importance_df.column_name.values.tolist()
    print(f"Selected {len(boosting_shap_col)} of {len(train.columns)} features")

    shap_train = train.loc[:, boosting_shap_col]
    shap_test = test.loc[:, boosting_shap_col]

    return shap_train, shap_test
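

# Usage sketch (illustrative): train_df/test_df share the same feature
# columns and label_series is the target; only columns with non-zero SHAP
# importance survive the selection.
#   shap_train, shap_test = select_features(train_df, label_series, test_df)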

--------------------------------------------------------------------------------
/src/ai-hackaton/ensemble.py:
--------------------------------------------------------------------------------
from functools import partial

import hydra
import numpy as np
import pandas as pd
from data.dataset import make_oof_preds
from hydra.utils import to_absolute_path
from omegaconf import DictConfig
from tuning.bayesian import BayesianOptimizer, oof_objective


@hydra.main(config_path="../../config/optimization/", config_name="ensemble.yaml")
def _main(cfg: DictConfig):
    path = to_absolute_path(cfg.dataset.path) + "/"
    submit_path = to_absolute_path(cfg.submit.path) + "/"
    train_y = pd.read_csv(path + "train_labels.csv")
    cnn_oof = np.load(submit_path + "10fold_cnn_oof.npy")
    cat_oof = np.load(submit_path + "10fold_cat_oof.npy")
    auto_oof = np.load(submit_path + "10fold_automl_oof.npy")

    model1_oof, model1_preds = make_oof_preds(
        path, "mem-128-2-model1.csv", "mem-128-2-pred1.csv"
    )

    model2_oof, model2_preds = make_oof_preds(
        path, "mem-128-2-model2.csv", "mem-128-2-pred2.csv"
    )

    model3_oof, model3_preds = make_oof_preds(
        path, "mem-128-2-model3.csv", "mem-128-2-pred3.csv"
    )

    model4_oof, model4_preds = make_oof_preds(
        path, "mem-128-2-model4.csv", "mem-128-2-pred4.csv"
    )

    features = [
        f"{c}_prob_{i}"
        for c in ["model1", "model2", "model3", "model4", "cnn", "cat", "automl"]
        for i in range(61)
    ]
    X = np.concatenate(
        [model1_oof, model2_oof, model3_oof, model4_oof, cnn_oof, cat_oof, auto_oof],
        axis=1,
    )
    X = pd.DataFrame(X, columns=features)
    y = train_y["label"]

    # oof_objective only accepts the four MEM model OOF matrices
    objective = partial(
        oof_objective,
        model1_oof=model1_oof,
        model2_oof=model2_oof,
        model3_oof=model3_oof,
        model4_oof=model4_oof,
        y=y,
    )
    bayesian_optim = BayesianOptimizer(objective)
    study = bayesian_optim.build_study(trials=cfg.optimization.trials)
    bayesian_optim.display_study_statistics(study)


if __name__ == "__main__":
    _main()

--------------------------------------------------------------------------------
/src/ai-hackaton/predict.py:
--------------------------------------------------------------------------------
import hydra
import pandas as pd
from hydra.utils import to_absolute_path
from omegaconf import DictConfig


@hydra.main(config_path="../../config/train/", config_name="ensemble.yaml")
def _main(cfg: DictConfig):
    path = to_absolute_path(cfg.dataset.path) + "/"
    submit_path = to_absolute_path(cfg.submit.path) + "/"
    submission = pd.read_csv(path + "sample_submission.csv")

    mem_preds = pd.read_csv(submit_path + "ensembled-submission.csv")
    stacking_preds = pd.read_csv(submit_path + "final_xgb_stacking_10fold.csv")

    submission.iloc[:, 1:] = (
        cfg.weight.w1 * mem_preds.iloc[:, 1:]
        + cfg.weight.w2 * stacking_preds.iloc[:, 1:]
    )
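    # Note: the blended rows above stay valid class probabilities only when
    # the weights form a convex combination (w1 + w2 == 1 in the config).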

    submission.to_csv(submit_path + cfg.submit.name, index=False)


if __name__ == "__main__":
    _main()

--------------------------------------------------------------------------------
/src/ai-hackaton/trainer/boosting_tree.py:
--------------------------------------------------------------------------------
import gc
import warnings
from typing import Any, Callable, Dict, NamedTuple, Optional, Union

import neptune.new as neptune
import numpy as np
import pandas as pd
from neptune.new.integrations.xgboost import NeptuneCallback
from sklearn.model_selection import StratifiedKFold
from utils.utils import LoggerFactory
from xgboost import XGBClassifier

logger = LoggerFactory().getLogger(__name__)
warnings.filterwarnings("ignore")


class ModelResult(NamedTuple):
    oof_preds: np.ndarray
    preds: Optional[np.ndarray]
    models: Dict[str, Any]
    scores: Dict[str, float]


class XGBTrainer:
    def __init__(self, n_fold: int, metric: Callable):
        self.metric = metric
        self.n_fold = n_fold
        self.result = None

    def train(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        params: Optional[Dict[str, Any]] = None,
        verbose: Union[int, bool] = False,
    ) -> bool:
        models = dict()
        scores = dict()

        kf = StratifiedKFold(n_splits=self.n_fold, random_state=42, shuffle=True)
        splits = kf.split(X, y)
        xgb_oof = np.zeros((X.shape[0], 61))
        run = neptune.init(
            project="ds-wook/ai-hackaton", tags=["XGBoost", "Stratified KFold"]
        )
        for fold, (train_idx, valid_idx) in enumerate(splits):
            print("Fold :", fold)
            neptune_callback = NeptuneCallback(
                run=run,
                base_namespace=f"fold_{fold}",
                log_tree=[0, 1, 2, 3],
                max_num_features=10,
            )
            # create dataset
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
            # model
            model = XGBClassifier(**params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                early_stopping_rounds=50,
                callbacks=[neptune_callback],
                verbose=verbose,
            )
            # validation
            xgb_oof[valid_idx, :] = model.predict_proba(X_valid)

            models[f"fold_{fold}"] = model
            score = self.metric(y_valid, xgb_oof[valid_idx, :])
            scores[f"fold_{fold}"] = score
            logger.info(f"fold {fold}: {score}")

            gc.collect()

        # close the Neptune run once all folds are logged
        run.stop()

        oof_score = self.metric(y.values, xgb_oof)
        logger.info(f"oof score: {oof_score}")

        self.result = ModelResult(
            oof_preds=xgb_oof,
            models=models,
            preds=None,
            scores={
                "oof_score": oof_score,
                "KFoldsScores": scores,
            },
        )
        return True

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        folds = self.n_fold
        xgb_preds = np.zeros((X_test.shape[0], 61))

        for fold in range(folds):
            model = self.result.models[f"fold_{fold}"]
            xgb_preds += model.predict_proba(X_test) / self.n_fold

        return xgb_preds
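

# Usage sketch (illustrative, mirrors how xgb_train.py drives this class):
#   from sklearn.metrics import log_loss
#   trainer = XGBTrainer(n_fold=10, metric=log_loss)
#   trainer.train(X, y, params=params, verbose=100)
#   test_preds = trainer.predict(X_test)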

--------------------------------------------------------------------------------
/src/ai-hackaton/trainer/gbdt.py:
--------------------------------------------------------------------------------
import warnings
from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")


def train_kfold_cat(
    n_fold: int,
    X: pd.DataFrame,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
    params: Optional[Dict[str, Any]] = None,
    verbose: Union[int, bool] = False,
) -> Tuple[np.ndarray, np.ndarray]:
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
    splits = folds.split(X, y)
    cb_oof = np.zeros((X.shape[0], 61))
    cb_preds = np.zeros((X_test.shape[0], 61))

    for fold, (train_idx, valid_idx) in enumerate(splits, 1):
        if verbose:
            print(f"\tFold {fold}\n")
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        train_data = Pool(data=X_train, label=y_train)
        valid_data = Pool(data=X_valid, label=y_valid)

        model = CatBoostClassifier(**params)

        model.fit(
            train_data,
            eval_set=valid_data,
            early_stopping_rounds=50,
            use_best_model=True,
            verbose=verbose,
        )

        cb_oof[valid_idx] = model.predict_proba(X_valid)
        cb_preds += model.predict_proba(X_test) / n_fold

    log_score = log_loss(y, cb_oof)
    print(f"Log Loss Score: {log_score:.5f}\n")
    return cb_oof, cb_preds

--------------------------------------------------------------------------------
/src/ai-hackaton/tuning/bayesian.py:
--------------------------------------------------------------------------------
import warnings
from typing import Callable, Sequence, Union

import neptune.new as neptune
import neptune.new.integrations.optuna as optuna_utils
import numpy as np
import optuna
import pandas as pd
import yaml
from catboost import CatBoostClassifier, Pool
from hydra.utils import to_absolute_path
from neptune.new.exceptions import NeptuneMissingApiTokenException
from optuna.integration import XGBoostPruningCallback
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.study import Study
from optuna.trial import Trial
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")


class BayesianOptimizer:
    def __init__(
        self, objective_function: Callable[[Trial], Union[float, Sequence[float]]]
    ):
        self.objective_function = objective_function

    def build_study(self, trials: int, verbose: bool = False) -> Study:
        try:
            run = neptune.init(project="ds-wook/ai-hackaton", tags="optimization")

            neptune_callback = optuna_utils.NeptuneCallback(
                run, plots_update_freq=1, log_plot_slice=False, log_plot_contour=False
            )
            sampler = TPESampler(seed=42)
            study = optuna.create_study(
                study_name="TPE Optimization",
                direction="minimize",
                sampler=sampler,
                pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=5),
            )
            study.optimize(
                self.objective_function, n_trials=trials, callbacks=[neptune_callback]
            )
            run.stop()

        except NeptuneMissingApiTokenException:
            sampler = TPESampler(seed=42)
            study = optuna.create_study(
                study_name="optimization", direction="minimize", sampler=sampler
            )
            study.optimize(self.objective_function, n_trials=trials)
        if verbose:
            self.display_study_statistics(study)

        return study
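
    # Design note: build_study degrades gracefully -- if no Neptune API token
    # is configured, the NeptuneMissingApiTokenException branch falls back to
    # a plain local Optuna study, so tuning still runs without tracking.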
params["eval_metric"] = "MultiClass" 92 | with open(to_absolute_path("../../config/train/cat.yaml")) as f: 93 | train_dict = yaml.load(f, Loader=yaml.FullLoader) 94 | train_dict["params"] = params 95 | 96 | with open(to_absolute_path("../../config/train/" + params_name), "w") as p: 97 | yaml.dump(train_dict, p) 98 | 99 | 100 | def xgb_objective( 101 | trial: FrozenTrial, 102 | X: pd.DataFrame, 103 | y: pd.Series, 104 | n_fold: int, 105 | ) -> Callable[[Trial], float]: 106 | params = { 107 | "random_state": 42, 108 | "n_estimators": 10000, 109 | "objective": "multi:softmax", 110 | "eval_metric": "mlogloss", 111 | "n_jobs": -1, 112 | "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05), 113 | "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1e-2), 114 | "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1e-1), 115 | "max_depth": trial.suggest_int("max_depth", 2, 10), 116 | "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0), 117 | "min_child_weight": trial.suggest_int("min_child_weight", 3, 10), 118 | "subsample": trial.suggest_float("subsample", 0.3, 1.0), 119 | "gamma": trial.suggest_float("gamma", 0.1, 1), 120 | } 121 | 122 | kf = StratifiedKFold(n_splits=n_fold, random_state=42, shuffle=True) 123 | splits = kf.split(X, y) 124 | xgb_oof = np.zeros((X.shape[0], 61)) 125 | pruning_callback = XGBoostPruningCallback(trial, "validation_1-mlogloss") 126 | 127 | for fold, (train_idx, valid_idx) in enumerate(splits, 1): 128 | # create dataset 129 | X_train, y_train = X.iloc[train_idx], y.iloc[train_idx] 130 | X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx] 131 | 132 | # model 133 | model = XGBClassifier(**params) 134 | model.fit( 135 | X_train, 136 | y_train, 137 | eval_set=[(X_train, y_train), (X_valid, y_valid)], 138 | early_stopping_rounds=50, 139 | verbose=100, 140 | callbacks=[pruning_callback], 141 | ) 142 | # validation 143 | xgb_oof[valid_idx] = model.predict_proba(X_valid) 144 | 145 | score = log_loss(y, xgb_oof) 146 | return score 147 | 148 | 149 | def cat_objective( 150 | trial: FrozenTrial, 151 | X: pd.DataFrame, 152 | y: pd.Series, 153 | n_fold: int, 154 | ) -> Callable[[Trial], float]: 155 | params = { 156 | "loss_function": "MultiClass", 157 | "eval_metric": "MultiClass", 158 | "od_type": "Iter", 159 | "od_wait": 500, 160 | "random_seed": 42, 161 | "iterations": 26000, 162 | "learning_rate": trial.suggest_uniform("learning_rate", 1e-3, 1e-2), 163 | "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-1, 1.0, log=True), 164 | "max_depth": trial.suggest_int("max_depth", 3, 10), 165 | "bagging_temperature": trial.suggest_int("bagging_temperature", 1, 10), 166 | } 167 | X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2) 168 | # create dataset 169 | train_data = Pool(data=X_train, label=y_train) 170 | valid_data = Pool(data=X_valid, label=y_valid) 171 | 172 | # model 173 | model = CatBoostClassifier(**params) 174 | model.fit( 175 | train_data, 176 | eval_set=valid_data, 177 | early_stopping_rounds=100, 178 | use_best_model=True, 179 | verbose=1000, 180 | ) 181 | # validation 182 | cat_oof = model.predict_proba(X_valid) 183 | 184 | log_score = log_loss(y, cat_oof) 185 | return log_score 186 | 187 | 188 | def oof_objective( 189 | trial: FrozenTrial, 190 | model1_oof: np.ndarray, 191 | model2_oof: np.ndarray, 192 | model3_oof: np.ndarray, 193 | model4_oof: np.ndarray, 194 | y: pd.Series, 195 | ) -> Callable[[Trial], float]: 196 | 197 | params = { 198 | "alpha": trial.suggest_float("alpha", 0, 1), 199 | "beta": 
trial.suggest_float("beta", 0, 1), 200 | "gamma": trial.suggest_float("gamma", 0, 1), 201 | } 202 | 203 | def ensemble_oof(alpha: float, beta: float, gamma: float): 204 | delta = 1 - (alpha + beta + gamma) 205 | ensemble_model = ( 206 | alpha * model1_oof 207 | + beta * model2_oof 208 | + gamma * model3_oof 209 | + delta * model4_oof 210 | ) 211 | 212 | score = log_loss(y, ensemble_model) 213 | return score 214 | 215 | score = ensemble_oof(**params) 216 | 217 | return score 218 | -------------------------------------------------------------------------------- /src/ai-hackaton/utils/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.handlers 3 | import os 4 | import random 5 | import time 6 | from contextlib import contextmanager 7 | from pathlib import Path 8 | 9 | import numpy as np 10 | 11 | 12 | def seed_everything(seed=42): 13 | random.seed(seed) 14 | os.environ["PYTHONHASHSEED"] = str(seed) 15 | np.random.seed(seed) 16 | 17 | 18 | class Singleton(type): 19 | _instances = {} 20 | 21 | def __call__(cls, *args, **kwargs): 22 | if cls not in cls._instances: 23 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 24 | return cls._instances[cls] 25 | 26 | 27 | class LoggerFactory(metaclass=Singleton): 28 | def __init__(self, log_path: str = None, loglevel=logging.INFO): 29 | self.loglevel = loglevel 30 | if log_path is None: 31 | self.log_path = Path("../../log/log") 32 | else: 33 | self.log_path = Path(log_path) 34 | self.log_path.parent.mkdir(parents=True, exist_ok=True) 35 | 36 | def getLogger(self, log_name: str) -> logging.getLogger: 37 | fmt = "%(asctime)s [%(name)s|%(levelname)s] %(message)s" 38 | formatter = logging.Formatter(fmt) 39 | logger = logging.getLogger(log_name) 40 | 41 | # add stream Handler 42 | handler = logging.StreamHandler() 43 | handler.setFormatter(formatter) 44 | logger.addHandler(handler) 45 | 46 | # add file Handler 47 | handler = logging.handlers.RotatingFileHandler( 48 | filename=self.log_path, maxBytes=2 * 1024 * 1024 * 1024, backupCount=10 49 | ) 50 | handler.setFormatter(formatter) 51 | logger.addHandler(handler) 52 | 53 | logger.setLevel(self.loglevel) 54 | 55 | return logger 56 | 57 | 58 | @contextmanager 59 | def timer(name, logger: logging.getLogger): 60 | t0 = time.time() 61 | logger.debug(f"[{name}] start") 62 | yield 63 | logger.debug(f"[{name}] done in {time.time() - t0:.0f} s") 64 | -------------------------------------------------------------------------------- /src/ai-hackaton/xgb_train.py: -------------------------------------------------------------------------------- 1 | import hydra 2 | import numpy as np 3 | import pandas as pd 4 | from hydra.utils import to_absolute_path 5 | from omegaconf import DictConfig 6 | from sklearn.metrics import log_loss 7 | from trainer.boosting_tree import XGBTrainer 8 | 9 | 10 | @hydra.main(config_path="../../config/train/", config_name="xgb.yaml") 11 | def _main(cfg: DictConfig): 12 | path = to_absolute_path(cfg.dataset.path) + "/" 13 | submit_path = to_absolute_path(cfg.submit.path) + "/" 14 | train_y = pd.read_csv(path + "train_labels.csv") 15 | 16 | cnn_oof = np.load(submit_path + "10fold_cnn_oof.npy") 17 | cnn_preds = np.load(submit_path + "10fold_cnn_preds.npy") 18 | cat_oof = np.load(submit_path + "10fold_cat_oof.npy") 19 | cat_preds = np.load(submit_path + "10fold_cat_preds.npy") 20 | auto_oof = np.load(submit_path + "10fold_automl_oof.npy") 21 | auto_preds = np.load(submit_path + "10fold_automl_preds.npy") 

--------------------------------------------------------------------------------
/src/ai-hackaton/xgb_train.py:
--------------------------------------------------------------------------------
import hydra
import numpy as np
import pandas as pd
from hydra.utils import to_absolute_path
from omegaconf import DictConfig
from sklearn.metrics import log_loss
from trainer.boosting_tree import XGBTrainer


@hydra.main(config_path="../../config/train/", config_name="xgb.yaml")
def _main(cfg: DictConfig):
    path = to_absolute_path(cfg.dataset.path) + "/"
    submit_path = to_absolute_path(cfg.submit.path) + "/"
    train_y = pd.read_csv(path + "train_labels.csv")

    cnn_oof = np.load(submit_path + "10fold_cnn_oof.npy")
    cnn_preds = np.load(submit_path + "10fold_cnn_preds.npy")
    cat_oof = np.load(submit_path + "10fold_cat_oof.npy")
    cat_preds = np.load(submit_path + "10fold_cat_preds.npy")
    auto_oof = np.load(submit_path + "10fold_automl_oof.npy")
    auto_preds = np.load(submit_path + "10fold_automl_preds.npy")

    features = [f"{c}_prob_{i}" for c in ["cnn", "cat", "automl"] for i in range(61)]
    X = np.concatenate([cnn_oof, cat_oof, auto_oof], axis=1)
    X = pd.DataFrame(X, columns=features)

    X_test = np.concatenate([cnn_preds, cat_preds, auto_preds], axis=1)
    X_test = pd.DataFrame(X_test, columns=features)
    y = train_y["label"]

    # model
    xgb_trainer = XGBTrainer(cfg.model.fold, log_loss)
    xgb_trainer.train(X, y, cfg.params, cfg.model.verbose)
    xgb_preds = xgb_trainer.predict(X_test)

    submission = pd.read_csv(path + "sample_submission.csv")
    submission.iloc[:, 1:] = xgb_preds
    submission.to_csv(submit_path + cfg.submit.name, index=False)


if __name__ == "__main__":
    _main()

--------------------------------------------------------------------------------
/src/ai-hackaton/xgb_tuning.py:
--------------------------------------------------------------------------------
from functools import partial

import hydra
import numpy as np
import pandas as pd
from hydra.utils import to_absolute_path
from omegaconf import DictConfig
from tuning.bayesian import BayesianOptimizer, xgb_objective


@hydra.main(config_path="../../config/optimization/", config_name="xgb.yaml")
def _main(cfg: DictConfig):
    path = to_absolute_path(cfg.dataset.path) + "/"
    submit_path = to_absolute_path(cfg.submit.path) + "/"
    train_y = pd.read_csv(path + "train_labels.csv")
    cnn_oof = np.load(submit_path + "10fold_cnn_oof.npy")
    cat_oof = np.load(submit_path + "10fold_cat_oof.npy")
    auto_oof = np.load(submit_path + "10fold_automl_oof.npy")

    features = [f"{c}_prob_{i}" for c in ["cnn", "cat", "automl"] for i in range(61)]
    X = np.concatenate([cnn_oof, cat_oof, auto_oof], axis=1)
    X = pd.DataFrame(X, columns=features)
    y = train_y["label"]

    objective = partial(xgb_objective, X=X, y=y, n_fold=cfg.model.fold)
    bayesian_optim = BayesianOptimizer(objective)
    study = bayesian_optim.build_study(trials=cfg.optimization.trials)
    bayesian_optim.xgb_save_params(study, cfg.optimization.params)


if __name__ == "__main__":
    _main()

--------------------------------------------------------------------------------