├── ch01
│   ├── .gitignore
│   ├── requirements.txt
│   └── README.md
├── ch02
│   ├── .gitignore
│   ├── requirements.txt
│   ├── config.py
│   ├── dataloader_test.py
│   ├── README.md
│   ├── ipynb
│   │   ├── voting.ipynb
│   │   ├── split_file.ipynb
│   │   └── MAE.ipynb
│   ├── test.py
│   ├── src
│   │   ├── utils.py
│   │   ├── model.py
│   │   └── get_score.py
│   └── train.py
├── ch05
│   ├── requirements.txt
│   └── README.md
├── ch03
│   ├── weather.csv
│   ├── df_location.csv
│   ├── life_location.csv
│   ├── .gitignore
│   ├── requirements.txt
│   ├── README.md
│   ├── rain.csv
│   └── submission.ipynb
├── .gitignore
├── requirements.txt
├── README.md
└── ch04
    ├── environment.yml
    ├── README.md
    └── problem.ipynb

/ch01/.gitignore:
--------------------------------------------------------------------------------
1 | dataset/
--------------------------------------------------------------------------------
/ch02/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | bin/
--------------------------------------------------------------------------------
/ch05/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas
3 | statsmodels
--------------------------------------------------------------------------------
/ch03/weather.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/dacon/master/ch03/weather.csv
--------------------------------------------------------------------------------
/ch01/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas<=1.1
3 | scikit-learn
4 | xgboost
5 |
--------------------------------------------------------------------------------
/ch03/df_location.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/dacon/master/ch03/df_location.csv
--------------------------------------------------------------------------------
/ch03/life_location.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/dacon/master/ch03/life_location.csv
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *.xlsx
3 | *.bak
4 |
5 | .ipynb_checkpoints/
6 | __pycache__/
7 |
8 | .vscode/
--------------------------------------------------------------------------------
/ch03/.gitignore:
--------------------------------------------------------------------------------
1 | !weather.csv
2 | !rain.csv
3 | !df_location.csv
4 | !life_location.csv
5 |
6 | submission/
7 |
--------------------------------------------------------------------------------
/ch03/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas
3 | geopy
4 | scikit-learn
5 | xgboost
6 | lightgbm
7 | tqdm
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | notebook
3 | numpy==1.19.3  # Workaround for issue https://tinyurl.com/y3dm3h86
4 | seaborn<=0.11
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DACON Competition 1st-Place Solutions
2 |
3 | * [Chapter 1: Batter OPS Modeling](ch01)
4 | * [Chapter 2: Semiconductor Thin-Film Thickness Analysis](ch02)
5 | * [Chapter 3: Bus Ridership Prediction](ch03)
6 | * [Chapter 4: Store Sales Prediction](ch04)
7 | * [Chapter 5: Pitcher Scouting Optimization](ch05)
8 |
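9 | Each chapter folder carries its own `requirements.txt`, which pulls in this root file
10 | via `-r ../requirements.txt`, so packages are installed per chapter (for example,
11 | `pip install -r ch03/requirements.txt` from the repository root); chapter 4 instead
12 | pins a conda environment in [environment.yml](ch04/environment.yml).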
--------------------------------------------------------------------------------
/ch02/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas
3 | scikit-learn
4 | xgboost
5 | tqdm
6 |
7 | --find-links https://download.pytorch.org/whl/torch_stable.html
8 | torch>=1.4.0
9 |
--------------------------------------------------------------------------------
/ch01/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 1: KBO Batter OPS Prediction
2 |
3 | * Packages needed for the exercises: [requirements.txt](requirements.txt)
4 |
5 | * Example code: [batter_OPS_prediction.ipynb](batter_OPS_prediction.ipynb)
6 |
7 | * Data download: [https://dacon.io/competitions/official/62540/data/](https://dacon.io/competitions/official/62540/data/) (available after signing up on the DACON site and joining the competition)
8 |
--------------------------------------------------------------------------------
/ch04/environment.yml:
--------------------------------------------------------------------------------
1 | name: store_amount_prediction
2 | dependencies:
3 |   - python=3.7
4 |   - r::rpy2==2.9.4
5 |   - jupyter
6 |   - notebook
7 |   - pip
8 |   - pip:
9 |     - tzlocal
10 |     - numpy==1.19.1
11 |     - pandas==0.25.1
12 |     - pmdarima==1.5.3
13 |     - statsmodels==0.11.1
14 |     - seaborn==0.11.0
15 |     - tqdm==4.51.0
--------------------------------------------------------------------------------
/ch02/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Manages the paths and the configuration needed for training. The paths assume a Linux
3 | # environment, but training on Windows also works as long as Python and PyTorch (CPU version) are installed.
4 | TRAIN_PATH = 'data/train_splited.csv'
5 | VAL_PATH = 'data/val.csv'
6 |
7 | LR = 1e-03
8 | ADAM_EPSILON = 1e-06
9 | EPOCHS = 100
10 | BATCH_SIZE = 2048
11 | WARMUP_STEPS = 2000
12 |
--------------------------------------------------------------------------------
/ch05/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 5: KBO Foreign Pitcher Scouting Optimization Competition
2 |
3 | * Packages needed for the exercises: [requirements.txt](requirements.txt)
4 | * Example code
5 |   + [eda.ipynb](eda.ipynb)
6 |     - 6.2. Exploratory Data Analysis
7 |   + [evaluation.ipynb](evaluation.ipynb)
8 |     - 6.3. Data Preprocessing
9 |     - 6.4. Model Building and Validation
10 |     - 6.5. Methods for Improving Performance
11 | * Data download: https://dacon.io/competitions/official/68346/data/
12 |
--------------------------------------------------------------------------------
/ch04/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 4: Store Credit Card Sales Prediction
2 |
3 | * Environment for the exercises: [environment.yml](environment.yml)
4 | * Example code
5 |   + [problem.ipynb](problem.ipynb)
6 |     - 4.1 Problem Definition
7 |   + [store_amount_prediction.ipynb](store_amount_prediction.ipynb)
8 |     - 4.2 Data Preprocessing
9 |     - 4.3 Exploratory Data Analysis
10 |     - 4.4 Model Building and Validation
11 |     - 4.5 Methods for Improving Performance
12 | * Data download: https://dacon.io/competitions/official/140472/data/
13 |
--------------------------------------------------------------------------------
/ch02/dataloader_test.py:
--------------------------------------------------------------------------------
1 | import config
2 |
3 | from torch.utils.data import DataLoader
4 | from src.utils import PandasDataset
5 |
6 | # The batch size is a hyperparameter that users can set themselves.
7 | batch_size=32
8 |
9 | # Set the paths to the training CSV and the validation CSV.
10 | train_path = config.TRAIN_PATH #'data/train_splited.csv'
11 | val_path = config.VAL_PATH #'data/val.csv'
12 |
13 | # The loaders return the data in batches of batch_size.
14 | train_dataset = PandasDataset(train_path)
15 | train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0)
16 |
17 | val_dataset = PandasDataset(val_path)
18 | val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)
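19 |
20 | # A minimal smoke test -- an illustrative addition, assuming data/train_splited.csv
21 | # has already been produced by ipynb/split_file.ipynb. PandasDataset.__getitem__
22 | # returns a dict with keys 'X' (226 spectrum columns) and 'Y' (4 thickness targets),
23 | # so a full training batch should have shapes [32, 226] and [32, 4].
24 | if __name__ == '__main__':
25 |     batch = next(iter(train_loader))
26 |     print(batch['X'].shape)  # torch.Size([32, 226])
27 |     print(batch['Y'].shape)  # torch.Size([32, 4])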
--------------------------------------------------------------------------------
/ch03/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 3: Evening Rush-Hour Bus Ridership Prediction
2 |
3 | * Packages needed for the exercises: [requirements.txt](requirements.txt)
4 |
5 | * Example code
6 |   + [eda.ipynb](eda.ipynb): 3.2 Exploratory Data Analysis
7 |   + [prediction.ipynb](prediction.ipynb)
8 |     - 3.3 Data Preprocessing
9 |     - 3.4 Model Building and Validation
10 |   + [jeju_life_location.ipynb](jeju_life_location.ipynb)
11 |     - Extracting administrative district names
12 |   + [submission.ipynb](submission.ipynb)
13 |     - 3.5 Methods for Improving Performance
14 |
15 | * Data
16 |   + DACON data downloads
17 |     - Evening rush-hour bus ridership prediction competition: https://dacon.io/competitions/official/229255/data/
18 |     - KCB financial style visualization competition: https://dacon.io/competitions/official/82407/data/
19 |   + External data
20 |     - Weather data: [weather.csv](weather.csv), [rain.csv](rain.csv)
21 |     - Location data: [df_location.csv](df_location.csv), [life_location.csv](life_location.csv)
22 |
--------------------------------------------------------------------------------
/ch02/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 2: Semiconductor Thin-Film Thickness Analysis
2 |
3 | * Packages needed for the exercises: [requirements.txt](requirements.txt)
4 | * Example code
5 |   - [2장 코드 통합.ipynb](https://github.com/wikibook/dacon/blob/master/ch02/2%EC%9E%A5%20%EC%BD%94%EB%93%9C%20%ED%86%B5%ED%95%A9.ipynb) (all of the chapter 2 code in one notebook)
6 |   - [config.py](config.py): shared file
7 |   - [dataloader_test.py](dataloader_test.py): 2.3.3 Custom dataset class
8 |   - [train.py](train.py): 2.5.2.1 Tuning the optimizer and scheduler
9 |   - [test.py](test.py): 2.5.2.2 Hyperparameters
10 |   - ipynb/
11 |     - [eda.ipynb](ipynb/eda.ipynb): 2.2 Exploratory Data Analysis
12 |     - [split_file.ipynb](ipynb/split_file.ipynb): 2.3 Data Preprocessing
13 |     - [voting.ipynb](ipynb/voting.ipynb): 2.5.3 Ensemble
14 |   + src/
15 |     - [model.py](src/model.py): 2.5 Methods for Improving Performance
16 |     - [utils.py](src/utils.py): 2.3.3 Custom dataset class
17 |
18 | * Data download: https://dacon.io/competitions/official/235554/data/
19 |
--------------------------------------------------------------------------------
/ch02/ipynb/voting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [
8 |     {
9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "[2, 1, 0]\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "output_1 = [0.7, 0.1, 0.2]\n",
18 |     "output_2 = [0.5, 0.2, 0.3]\n",
19 |     "output_3 = [0.3, 0.4, 0.3]\n",
20 |     "\n",
21 |     "\n",
22 |     "def hard_voting(output_1, output_2, output_3):\n",
23 |     "    \n",
24 |     "    result = [0,0,0]\n",
25 |     "    \n",
26 |     "    # Find the index of the largest value in each output.\n",
27 |     "    output_1_max_value = max(output_1)\n",
28 |     "    output_1_max_index = output_1.index(output_1_max_value)\n",
29 |     "    result[output_1_max_index] += 1\n",
30 |     "\n",
31 |     "    output_2_max_value = max(output_2)\n",
32 |     "    output_2_max_index = output_2.index(output_2_max_value)\n",
33 |     "    result[output_2_max_index] += 1\n",
34 |     "\n",
35 |     "    output_3_max_value = max(output_3)\n",
36 |     "    output_3_max_index = output_3.index(output_3_max_value)\n",
37 |     "    result[output_3_max_index] += 1\n",
38 |     "\n",
39 |     "    return result\n",
40 |     "\n",
41 |     "\n",
42 |     "result = hard_voting(output_1, output_2, output_3)\n",
43 |     "print(result)"
44 |    ]
45 |   },
46 |   {
47 |    "cell_type": "code",
48 |    "execution_count": 2,
49 |    "metadata": {},
50 |    "outputs": [
51 |     {
52 |      "name": "stdout",
53 |      "output_type": "stream",
54 |      "text": [
55 |       "[0.5, 0.23333333333333336, 0.26666666666666666]\n"
56 |      ]
57 |     }
58 |    ],
59 |    "source": [
60 |     "output_1 = [0.7, 0.1, 0.2]\n",
61 |     "output_2 = [0.5, 0.2, 0.3]\n",
62 |     "output_3 = [0.3, 0.4, 0.3]\n",
63 |     "\n",
64 |     "\n",
65 |     "def soft_voting(output_1, output_2, output_3):\n",
66 |     "    \n",
67 |     "    result = [0,0,0]\n",
68 |     "    \n",
69 |     "    # Average the softmax probabilities of each output.\n",
70 |     "    result = [(x+y+z)/3 for x,y,z in zip(output_1, output_2, output_3)]\n",
71 |     "    \n",
72 |     "    return result\n",
73 |     "\n",
74 |     "\n",
75 |     "result = soft_voting(output_1, output_2, output_3)\n",
76 |     "print(result)"
77 |    ]
78 |   },
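79 |   {
80 |    "cell_type": "code",
81 |    "execution_count": null,
82 |    "metadata": {},
83 |    "outputs": [],
84 |    "source": [
85 |     "# Illustrative addition (not in the book's original notebook): the final\n",
86 |     "# prediction is the index with the most votes (hard voting) or with the\n",
87 |     "# largest averaged probability (soft voting).\n",
88 |     "hard_result = hard_voting(output_1, output_2, output_3)\n",
89 |     "soft_result = soft_voting(output_1, output_2, output_3)\n",
90 |     "print(hard_result.index(max(hard_result)))  # 0\n",
91 |     "print(soft_result.index(max(soft_result)))  # 0\n"
92 |    ]
93 |   }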
94 |  ],
95 |  "metadata": {
96 |   "kernelspec": {
97 |    "display_name": "Python 3",
98 |    "language": "python",
99 |    "name": "python3"
100 |   },
101 |   "language_info": {
102 |    "codemirror_mode": {
103 |     "name": "ipython",
104 |     "version": 3
105 |    },
106 |    "file_extension": ".py",
107 |    "mimetype": "text/x-python",
108 |    "name": "python",
109 |    "nbconvert_exporter": "python",
110 |    "pygments_lexer": "ipython3",
111 |    "version": "3.8.5"
112 |   }
113 |  },
114 |  "nbformat": 4,
115 |  "nbformat_minor": 4
116 | }
--------------------------------------------------------------------------------
/ch02/ipynb/split_file.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [
8 |     {
9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "(810000, 230)\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "from itertools import chain\n",
18 |     "import numpy as np\n",
19 |     "import pandas as pd\n",
20 |     "\n",
21 |     "\"\"\"\n",
22 |     "Splits a share of the originally provided training set off as a validation set and saves both.\n",
23 |     "The data in train.csv is divided so that the training portion is saved to train_splited.csv\n",
24 |     "and the validation portion is saved to val.csv.\n",
25 |     "\"\"\"\n",
26 |     "path_train = '../data/train.csv'\n",
27 |     "\n",
28 |     "# Prepare the column labels used to rebuild the index when the shuffled data is saved again.\n",
29 |     "layers = [['layer_1','layer_2','layer_3','layer_4'], \\\n",
30 |     "          [str(i) for i in np.arange(0,226).tolist()]]\n",
31 |     "layers = list(chain(*layers))\n",
32 |     "\n",
33 |     "# Shuffle the rows of train randomly.\n",
34 |     "train = pd.read_csv(path_train)\n",
35 |     "print(train.shape)\n",
36 |     "train = train.sample(frac=1)\n",
37 |     "rows, cols = train.shape"
38 |    ]
39 |   },
40 |   {
41 |    "cell_type": "code",
42 |    "execution_count": 2,
43 |    "metadata": {},
44 |    "outputs": [
45 |     {
46 |      "name": "stdout",
47 |      "output_type": "stream",
48 |      "text": [
49 |       "train file saved....\n"
50 |      ]
51 |     }
52 |    ],
53 |    "source": [
54 |     "# Cut off roughly 10% of the training data (80,000 of 810,000 rows) as the validation set and save the rest.\n",
55 |     "train1 = train.iloc[:rows - 80000,:]\n",
56 |     "train1 = train1.values\n",
57 |     "train1 = pd.DataFrame(data=train1,columns=layers)\n",
58 |     "\n",
59 |     "# Save the training data to a CSV file with pandas' to_csv() function.\n",
60 |     "train1.to_csv('../data/train_splited.csv', index_label='id')\n",
61 |     "print(\"train file saved....\")"
62 |    ]
63 |   },
64 |   {
65 |    "cell_type": "code",
66 |    "execution_count": 3,
67 |    "metadata": {},
68 |    "outputs": [
69 |     {
70 |      "name": "stdout",
71 |      "output_type": "stream",
72 |      "text": [
73 |       "validation file saved....\n"
74 |      ]
75 |     }
76 |    ],
77 |    "source": [
78 |     "# Likewise, save the remaining rows as the validation data CSV file.\n",
79 |     "val = train.iloc[rows - 80000:,:]\n",
80 |     "val = val.values\n",
81 |     "val = pd.DataFrame(data=val,columns=layers)\n",
82 |     "val.to_csv('../data/val.csv', index_label='id')\n",
83 |     "print(\"validation file saved....\")"
84 |    ]
85 |   },
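86 |   {
87 |    "cell_type": "code",
88 |    "execution_count": null,
89 |    "metadata": {},
90 |    "outputs": [],
91 |    "source": [
92 |     "# Illustrative addition: read both files back to confirm the split.\n",
93 |     "# 810,000 - 80,000 = 730,000 training rows (roughly a 9:1 split), and\n",
94 |     "# index_label='id' adds one column, giving 231 columns on disk.\n",
95 |     "print(pd.read_csv('../data/train_splited.csv').shape)  # (730000, 231)\n",
96 |     "print(pd.read_csv('../data/val.csv').shape)            # (80000, 231)\n"
97 |    ]
98 |   }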
"output_type": "stream", 72 | "text": [ 73 | "validation file saved....\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "# 마찬가지로 나머지 부분은 검증 데이터를 CSV 파일로 저장합니다.\n", 79 | "val = train.iloc[rows - 80000:,:]\n", 80 | "val = val.values\n", 81 | "val = pd.DataFrame(data=val,columns=layers)\n", 82 | "val.to_csv('../data/val.csv', index_label='id')\n", 83 | "print(\"validation file saved....\")" 84 | ] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.8.5" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /ch02/ipynb/MAE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## evaluate test data \n", 8 | "\n", 9 | "- 측정 : base.csv ~ *.csv\n", 10 | "- overfitting 지표로는 사용." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "0.45674784462451934" 22 | ] 23 | }, 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "\n", 33 | "best_path = 'test_(20, 57)_0.001_100.csv'\n", 34 | "my_path = 'test_(9, 45)_0.001_150.csv'\n", 35 | "\n", 36 | "def mae(best_path, my_path):\n", 37 | " best = pd.read_csv(best_path)\n", 38 | " best_value = best.iloc[:,1:].values\n", 39 | "\n", 40 | " value = pd.read_csv(my_path)\n", 41 | " my_value = value.iloc[:,1:].values\n", 42 | "\n", 43 | " abs_value = abs(best_value - my_value)\n", 44 | " size = abs_value.shape\n", 45 | " return sum(sum(abs_value)) / (size[0]*size[1])\n", 46 | "\n", 47 | "mae(best_path, my_path)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "from torch.optim import lr_scheduler" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 12, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "0.38016" 68 | ] 69 | }, 70 | "execution_count": 12, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "528*180 / 250000" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 15, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "0.002112" 88 | ] 89 | }, 90 | "execution_count": 15, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "528/250000" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 
99 |  ],
100 |  "metadata": {
101 |   "kernelspec": {
102 |    "display_name": "Python 3",
103 |    "language": "python",
104 |    "name": "python3"
105 |   },
106 |   "language_info": {
107 |    "codemirror_mode": {
108 |     "name": "ipython",
109 |     "version": 3
110 |    },
111 |    "file_extension": ".py",
112 |    "mimetype": "text/x-python",
113 |    "name": "python",
114 |    "nbconvert_exporter": "python",
115 |    "pygments_lexer": "ipython3",
116 |    "version": "3.8.5"
117 |   }
118 |  },
119 |  "nbformat": 4,
120 |  "nbformat_minor": 2
121 | }
--------------------------------------------------------------------------------
/ch02/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import torch
4 | from src.model import SkipConnectionModel
5 | from src.utils import TestDataset
6 | from torch.utils.data import DataLoader
7 |
8 | # Use a GPU for model evaluation when one is available.
9 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
10 |
11 | # Location of the test data
12 | path_test = 'data/test.csv'
13 |
14 | # pth file (single-model example)
15 | # Points to the pth file saved during training.
16 | pth_bin = 'bin/test.pth' # The model must already have been trained and saved.
17 |
18 | # Create the directory where the CSVs will be saved, in advance.
19 | if not os.path.exists('test'): # 'test' can be renamed to suit the user.
20 |     os.mkdir('test')
21 |
22 | ########################################
23 | ######### Test for a single model ######
24 | ########################################
25 |
26 | # Test Model
27 | # Re-create the model in order to test it.
28 | test_model = SkipConnectionModel(fn_in=226, fn_out=4)
29 | test_model = test_model.to(device)
30 |
31 | # Load the trained weights from pth_bin and switch to evaluation mode; without
32 | # this, the freshly initialized model would produce meaningless predictions.
33 | test_model.load_state_dict(torch.load(pth_bin, map_location=device))
34 | test_model.eval()
35 |
36 | # Load the test dataset.
37 | test_data = TestDataset(path_test)
38 | test_loader = DataLoader(test_data, batch_size=10000, num_workers=0)
39 |
40 | # Read the test data, predict with the model, and write the results to a file.
41 | with torch.no_grad():
42 |     for data in test_loader:
43 |         data = data.to(device)
44 |         outputs = test_model(data.float())
45 |         pred_test = outputs
46 |
47 | sample_sub = pd.read_csv('data/sample_submission.csv', index_col=0)
48 | layers = ['layer_1','layer_2','layer_3','layer_4'] # Define the column names of the data.
49 | submission = sample_sub.values + pred_test.cpu().numpy() # Move to the CPU when writing the file.
50 |
51 | submission = pd.DataFrame(data=submission,columns=layers)
52 | submission.to_csv('./test/submission.csv', index_label='id')
53 |
54 |
55 | #######################################################################
56 | ### The version below is example code for ensembling.
57 | #######################################################################
58 | # test file path
59 | # path_test = 'data/test.csv'
60 |
61 | # pth_list = os.listdir('bin') # directory where the saved pth files live
62 | # print(pth_list) # check the list of pth files
63 |
64 | # # Define the models in a dictionary so they can be used directly.
65 | # models = {
66 | #     'model':TestModel(),
67 | #     'model1': TestModel1(),
68 | #     'model2': TestModel2(),
69 | #     'model3': TestModel3(),
70 | #     'model4': TestModel4(),
71 | #     'model5': TestModel5(),
72 | #     'model6': TestModel6()
73 | # }
74 |
75 |
76 | # Load the trained weights onto the model.
77 | # USER_BIN = 'bin/model.pth'
78 | # weights = torch.load(USER_BIN, map_location='cuda:1')
79 | # test_model.load_state_dict(weights)
80 | # test_model = test_model.to(device)
81 | # test_model.eval()
82 |
83 |
84 | # Write a file for each model to be ensembled.
85 | # for pth in sorted(pth_list):
86 | #     if pth[-3:] != 'pth':
87 | #         pass
88 | #     else:
89 | #         if int(pth[0]) == 0:
90 | #             test_model(pth, test_loader, model_type='model')
91 | #         elif int(pth[0]) == 1:
92 | #             test_model(pth, test_loader, model_type='model1')
93 | #         elif int(pth[0]) == 2:
94 | #             #test_model(pth, test_loader, model_type='model2')
95 | #             pass
96 | #         elif int(pth[0]) == 3:
97 | #             test_model(pth, test_loader, model_type='model4')
98 | #         elif int(pth[0]) > 3 and int(pth[0]) <7:
99 | #             test_model(pth, test_loader, model_type='model5')
100 | #         elif int(pth[0])>= 7:
101 | #             test_model(pth, test_loader, model_type='model6')
102 |
103 | # Load the pth weights, test the model, and write the resulting csv file.
104 | # def test_model(path_pth, test_loader, model_type:str):
105 | #     model = models[model_type]
106 | #     ws = torch.load(f'./outputs/{path_pth}', map_location='cpu') # load
107 | #     model.load_state_dict(ws)
108 | #     model.eval()
109 |
110 | #     with torch.no_grad():
111 | #         for data in test_loader:
112 | #             outputs = model(data.float()) # run the model on the test data
113 | #             pred_test = outputs
114 |
115 | #     sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
116 | #     layers = ['layer_1','layer_2','layer_3','layer_4']
117 | #     submission = sample_sub.values + pred_test.numpy()
118 |
119 | #     submission = pd.DataFrame(data=submission,columns=layers)
120 | #     submission.to_csv(f'./test/{path_pth[:-4]}.csv', index_label='id') # save the csv under the test directory
121 |
122 | # def check_state(model):
123 | #     for val in model.state_dict().keys():
124 | #         if val[-4:] =='bias':
125 | #             pass
126 | #         else:
127 | #             print(f'{val} : {model.state_dict()[val].shape}')
128 |
--------------------------------------------------------------------------------
/ch02/src/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import pandas as pd
3 | import torch
4 | from torch.optim.lr_scheduler import LambdaLR
5 | from torch.utils.data import Dataset, DataLoader
6 |
7 | class PandasDataset(Dataset):
8 |     """ Loads the training dataset and returns it as tensors that a torch model can train on."""
9 |     def __init__(self, path):
10 |         super(PandasDataset, self).__init__()
11 |         train = pd.read_csv(path).iloc[:,1:]
12 |         self.train_X, self.train_Y = train.iloc[:,4:], train.iloc[:,0:4]
13 |         self.tmp_x , self.tmp_y = self.train_X.values, self.train_Y.values
14 |
15 |     def __len__(self):
16 |         return len(self.train_X)
17 |
18 |     def __getitem__(self, idx):
19 |         return {
20 |             'X':torch.from_numpy(self.tmp_x)[idx],
21 |             'Y':torch.from_numpy(self.tmp_y)[idx]
22 |         }
23 |
24 | class TestDataset(Dataset):
25 |     def __init__(self, path_test):
26 |         super(TestDataset, self).__init__()
27 |         test = pd.read_csv(path_test)
28 |         self.test_X = test.iloc[:,1:]
29 |         self.tmp_x = self.test_X.values
30 |
31 |     def __len__(self):
32 |         return len(self.test_X)
33 |
34 |     def __getitem__(self, idx):
35 |         return torch.from_numpy(self.tmp_x)[idx]
36 |
37 | """
38 | Schedulers are used to optimize training.
39 | Based on the PyTorch and transformers schedulers; see
40 | https://github.com/huggingface/transformers/blob/master/src/transformers/optimization.py
41 | """
42 | def get_constant_schedule(optimizer, last_epoch=-1):
43 |     """ Create a schedule with a constant learning rate.
44 | """ 45 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) 46 | 47 | 48 | def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): 49 | """ Create a schedule with a constant learning rate preceded by a warmup 50 | period during which the learning rate increases linearly between 0 and 1. 51 | """ 52 | 53 | def lr_lambda(current_step): 54 | if current_step < num_warmup_steps: 55 | return float(current_step) / float(max(1.0, num_warmup_steps)) 56 | return 1.0 57 | 58 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) 59 | 60 | 61 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 62 | """ Create a schedule with a learning rate that decreases linearly after 63 | linearly increasing during a warmup period. 64 | """ 65 | 66 | def lr_lambda(current_step): 67 | if current_step < num_warmup_steps: 68 | return float(current_step) / float(max(1, num_warmup_steps)) 69 | return max( 70 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) 71 | ) 72 | 73 | return LambdaLR(optimizer, lr_lambda, last_epoch) 74 | 75 | 76 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): 77 | """ Create a schedule with a learning rate that decreases following the 78 | values of the cosine function between 0 and `pi * cycles` after a warmup 79 | period during which it increases linearly between 0 and 1. 80 | """ 81 | 82 | def lr_lambda(current_step): 83 | if current_step < num_warmup_steps: 84 | return float(current_step) / float(max(1, num_warmup_steps)) 85 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) 86 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) 87 | 88 | return LambdaLR(optimizer, lr_lambda, last_epoch) 89 | 90 | 91 | def get_cosine_with_hard_restarts_schedule_with_warmup( 92 | optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 93 | ): 94 | """ 학습률이 웜업 기간 이후 몇 번의 하드 리스타트를 하는 코사인 함수 값에 따라 감소하는 95 | 스케줄러를 만듭니다. 웜업 기간에는 학습률이 0과 1 사이에서 선형으로 증가합니다. 96 | """ 97 | 98 | def lr_lambda(current_step): 99 | if current_step < num_warmup_steps: 100 | return float(current_step) / float(max(1, num_warmup_steps)) 101 | progress = float(current_step - num_warmup_steps) / \ 102 | float(max(1, num_training_steps - num_warmup_steps)) 103 | if progress >= 1.0: 104 | return 0.0 105 | return max(0.0, \ 106 | 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) 107 | 108 | return LambdaLR(optimizer, lr_lambda, last_epoch) -------------------------------------------------------------------------------- /ch02/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import os 5 | import time 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm.auto import tqdm 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.utils.data import Dataset, DataLoader 14 | from torch.optim import AdamW 15 | 16 | import config 17 | from src.model import SkipConnectionModel 18 | from src.utils import PandasDataset, get_cosine_with_hard_restarts_schedule_with_warmup 19 | 20 | # 모델을 학습시키기 위한 하이퍼 파라미터를 설정합니다. 
21 | lr = config.LR
22 | adam_epsilon = config.ADAM_EPSILON
23 | epochs = config.EPOCHS
24 | batch_size = config.BATCH_SIZE
25 | warmup_step = config.WARMUP_STEPS
26 |
27 | # Original data: train.csv => shuffled randomly in advance to create the train split,
28 | # with the val data separated out for evaluation (9:1).
29 | # Set the paths to the model's training data.
30 | train_path = config.TRAIN_PATH
31 | val_path = config.VAL_PATH
32 |
33 | # The loaders return the data in batches.
34 | train_dataset = PandasDataset(train_path)
35 | train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0)
36 |
37 | val_dataset = PandasDataset(val_path)
38 | val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)
39 |
40 | # Compute the total number of steps the model will train for.
41 | total_step = len(train_loader) * epochs
42 | print(f"Total step is....{total_step}")
43 |
44 | # Define the model instance.
45 | model = SkipConnectionModel(fn_in=226, fn_out=4) # The channel sizes are set inside the model.
46 |
47 | # If a GPU/CUDA environment is available, CUDA is set up for training;
48 | # otherwise the CPU is selected automatically.
49 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
50 |
51 | # Move the model into GPU memory (environments without a GPU fall back to the CPU).
52 | model = model.to(device)
53 |
54 | # Loss function and optimizer
55 | # Define the loss function used to optimize the neural network:
56 | # MAE (L1 loss) is used.
57 | loss_fn = nn.L1Loss()
58 |
59 | # Define the parameters for the optimizer and scheduler.
60 | no_decay = ["bias", "LayerNorm.weight"] # parameters excluded from weight decay
61 | optimizer_grouped_parameters = [
62 |     {
63 |         "params": [p for n, p in model.named_parameters() \
64 |                     if not any(nd in n for nd in no_decay)],
65 |         "weight_decay": 0.0,
66 |     },
67 |     {"params": [p for n, p in model.named_parameters() \
68 |                 if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
69 | ]
70 |
71 | # Create the optimizer and scheduler objects.
72 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
73 | scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
74 |     optimizer, num_warmup_steps=warmup_step, num_training_steps=total_step
75 | )
76 |
77 | # Variable used when naming the model file.
78 | version = time.localtime()[3:5]
79 | curr_lr = lr
80 |
81 | # Initialize the train loss and val loss.
82 | total_loss = 0.0
83 | total_val_loss = 0.0
84 | n_val_loss = 10000000. # variable for tracking the best validation loss
85 |
86 | if not os.path.exists('bin'):
87 |     os.mkdir('bin')
88 |
89 | for epoch in range(epochs):
90 |     total_loss = 0
91 |     total_val_loss = 0
92 |     for i, data in enumerate(tqdm(train_loader, desc='*********Train mode*******')): # iterate over the training data
93 |         # Load the training data and train on it.
94 |         # Forward pass
95 |         pred = model(data['X'].float().to(device))
96 |         loss = loss_fn(pred, data['Y'].float().to(device))
97 |
98 |         # Backward pass
99 |         # Use the optimizer object to zero the gradients of all
100 |         # trainable weight variables.
101 |         optimizer.zero_grad()
102 |
103 |         # Backpropagate the error according to the loss.
104 |         loss.backward()
105 |
106 |         # Update the parameters of the optimizer object.
107 |         optimizer.step()
108 |
109 |         # Update the parameters of the scheduler object.
110 |         scheduler.step()
111 |
112 |         total_loss += loss.item()
113 |
114 |     train_loss = total_loss / len(train_loader)
115 |     print ("Epoch [{}/{}], Train Loss: {:.4f}".format(epoch+1, epochs, train_loss))
116 |
117 |     # Evaluation
118 |     # After each epoch, run the trained model on the validation data and evaluate it.
119 | model.eval() 120 | with torch.no_grad(): 121 | for i, data in enumerate(tqdm(val_loader, \ 122 | desc='*********Evaluation mode*******')): 123 | pred = model(data['X'].float().to(device)) 124 | loss_val = loss_fn(pred, data['Y'].float().to(device)) 125 | 126 | total_val_loss += loss_val.item() 127 | val_loss = total_val_loss / len(val_loader) 128 | print ("Epoch [{}/{}], Eval Loss: {:.4f}".format(epoch+1, epochs, val_loss)) 129 | 130 | # 검증 데이터에서 가장 낮은 평균 절대 오차를 보인 에포크의 모델을 저장합니다. 131 | if val_loss < n_val_loss: 132 | n_val_loss = val_loss 133 | #torch.save(model.state_dict(), f'bin/test_{version}.pth') 134 | torch.save(model.state_dict(), f'bin/test.pth') 135 | print("Best Model saved......") 136 | -------------------------------------------------------------------------------- /ch03/rain.csv: -------------------------------------------------------------------------------- 1 | 지점,일시,기온(°C),강수량(mm) 2 | gosan,2019-09-01,23.916666666666668,0.0 3 | gosan,2019-09-02,23.7,12.1 4 | gosan,2019-09-03,23.549999999999997,1.9 5 | gosan,2019-09-04,23.53333333333333,12.84 6 | gosan,2019-09-05,26.033333333333335,0.0 7 | gosan,2019-09-06,27.066666666666666, 8 | gosan,2019-09-07,25.133333333333336,0.0 9 | gosan,2019-09-08,24.383333333333336,0.2 10 | gosan,2019-09-09,26.666666666666668,4.750000000000001 11 | gosan,2019-09-10,26.183333333333334, 12 | gosan,2019-09-11,25.95,0.6 13 | gosan,2019-09-12,25.483333333333334, 14 | gosan,2019-09-13,23.75, 15 | gosan,2019-09-14,23.566666666666666, 16 | gosan,2019-09-15,23.900000000000002, 17 | gosan,2019-09-16,23.8, 18 | gosan,2019-09-17,23.866666666666664, 19 | gosan,2019-09-18,22.683333333333334, 20 | gosan,2019-09-19,22.716666666666665, 21 | gosan,2019-09-20,22.21666666666667, 22 | gosan,2019-09-21,20.216666666666665,1.44 23 | gosan,2019-09-22,20.016666666666666,0.72 24 | gosan,2019-09-23,20.48333333333333, 25 | gosan,2019-09-24,20.533333333333335, 26 | gosan,2019-09-25,21.46666666666667, 27 | gosan,2019-09-26,22.266666666666666, 28 | gosan,2019-09-27,24.416666666666668, 29 | gosan,2019-09-28,24.46666666666667, 30 | gosan,2019-09-29,23.766666666666666, 31 | gosan,2019-09-30,23.866666666666664, 32 | gosan,2019-10-01,23.883333333333336,0.30000000000000004 33 | gosan,2019-10-02,25.099999999999998,5.550000000000001 34 | gosan,2019-10-03,23.13333333333333,0.07500000000000001 35 | gosan,2019-10-04,23.233333333333334, 36 | gosan,2019-10-05,21.666666666666668, 37 | gosan,2019-10-06,20.400000000000002, 38 | gosan,2019-10-07,22.516666666666666,0.0 39 | gosan,2019-10-08,19.86666666666667, 40 | gosan,2019-10-09,17.166666666666668, 41 | gosan,2019-10-10,19.583333333333332, 42 | gosan,2019-10-11,20.566666666666666, 43 | gosan,2019-10-12,21.216666666666665, 44 | gosan,2019-10-13,19.666666666666668, 45 | gosan,2019-10-14,19.466666666666665, 46 | gosan,2019-10-15,16.633333333333336, 47 | gosan,2019-10-16,18.46666666666667, 48 | jeju,2019-09-01,24.650000000000002,0.0 49 | jeju,2019-09-02,22.8,4.74 50 | jeju,2019-09-03,24.03333333333333,3.1 51 | jeju,2019-09-04,24.383333333333336,6.9799999999999995 52 | jeju,2019-09-05,26.11666666666667,0.1 53 | jeju,2019-09-06,27.533333333333335,0.0 54 | jeju,2019-09-07,27.983333333333334,0.04 55 | jeju,2019-09-08,25.133333333333336,0.0 56 | jeju,2019-09-09,26.666666666666668,1.116666666666667 57 | jeju,2019-09-10,26.583333333333332, 58 | jeju,2019-09-11,25.26666666666667,1.25 59 | jeju,2019-09-12,24.28333333333333,0.19999999999999998 60 | jeju,2019-09-13,23.51666666666667, 61 | jeju,2019-09-14,23.983333333333334, 62 | 
jeju,2019-09-15,24.583333333333332, 63 | jeju,2019-09-16,24.566666666666666, 64 | jeju,2019-09-17,23.833333333333332, 65 | jeju,2019-09-18,23.383333333333336, 66 | jeju,2019-09-19,22.849999999999998, 67 | jeju,2019-09-20,22.099999999999998, 68 | jeju,2019-09-21,20.400000000000002,1.75 69 | jeju,2019-09-22,20.25,7.766666666666667 70 | jeju,2019-09-23,20.7, 71 | jeju,2019-09-24,20.333333333333332, 72 | jeju,2019-09-25,21.183333333333334, 73 | jeju,2019-09-26,22.883333333333336,0.2 74 | jeju,2019-09-27,24.233333333333334, 75 | jeju,2019-09-28,24.066666666666663, 76 | jeju,2019-09-29,23.983333333333334,0.0 77 | jeju,2019-09-30,23.866666666666664, 78 | jeju,2019-10-01,24.76666666666667,0.3666666666666667 79 | jeju,2019-10-02,24.716666666666665,11.216666666666669 80 | jeju,2019-10-03,22.899999999999995,0.05 81 | jeju,2019-10-04,23.350000000000005, 82 | jeju,2019-10-05,21.950000000000003, 83 | jeju,2019-10-06,20.549999999999997,0.25 84 | jeju,2019-10-07,22.516666666666666, 85 | jeju,2019-10-08,20.466666666666665, 86 | jeju,2019-10-09,17.45, 87 | jeju,2019-10-10,18.2, 88 | jeju,2019-10-11,20.416666666666668, 89 | jeju,2019-10-12,21.316666666666666, 90 | jeju,2019-10-13,19.366666666666667, 91 | jeju,2019-10-14,19.783333333333335, 92 | jeju,2019-10-15,16.900000000000002, 93 | jeju,2019-10-16,17.98333333333333, 94 | po,2019-09-01,23.78333333333333,0.0 95 | po,2019-09-02,24.966666666666665,1.38 96 | po,2019-09-03,24.866666666666664,2.8 97 | po,2019-09-04,24.349999999999998,4.42 98 | po,2019-09-05,25.41666666666666,1.0799999999999998 99 | po,2019-09-06,26.549999999999997,0.25 100 | po,2019-09-07,25.183333333333334,0.20000000000000004 101 | po,2019-09-08,24.5,0.13333333333333333 102 | po,2019-09-09,26.600000000000005,1.725 103 | po,2019-09-10,26.116666666666664, 104 | po,2019-09-11,27.33333333333333, 105 | po,2019-09-12,25.100000000000005, 106 | po,2019-09-13,24.03333333333333, 107 | po,2019-09-14,25.083333333333332, 108 | po,2019-09-15,25.25, 109 | po,2019-09-16,24.733333333333334, 110 | po,2019-09-17,23.400000000000002, 111 | po,2019-09-18,23.483333333333334, 112 | po,2019-09-19,23.700000000000003, 113 | po,2019-09-20,22.633333333333336, 114 | po,2019-09-21,20.216666666666665,1.075 115 | po,2019-09-22,20.883333333333333,5.1000000000000005 116 | po,2019-09-23,20.816666666666666,0.1 117 | po,2019-09-24,20.666666666666668, 118 | po,2019-09-25,22.11666666666667, 119 | po,2019-09-26,23.899999999999995,0.1 120 | po,2019-09-27,25.2, 121 | po,2019-09-28,24.96666666666667, 122 | po,2019-09-29,25.0, 123 | po,2019-09-30,24.2, 124 | po,2019-10-01,24.683333333333334,0.02 125 | po,2019-10-02,24.766666666666666,9.816666666666668 126 | po,2019-10-03,24.03333333333333, 127 | po,2019-10-04,23.0, 128 | po,2019-10-05,22.05, 129 | po,2019-10-06,21.7,0.06666666666666667 130 | po,2019-10-07,22.416666666666668,0.0 131 | po,2019-10-08,20.883333333333336, 132 | po,2019-10-09,18.5, 133 | po,2019-10-10,17.900000000000002, 134 | po,2019-10-11,20.966666666666665, 135 | po,2019-10-12,21.7, 136 | po,2019-10-13,19.083333333333336, 137 | po,2019-10-14,18.883333333333333, 138 | po,2019-10-15,19.183333333333334, 139 | po,2019-10-16,18.299999999999997, 140 | seongsan,2019-09-01,23.53333333333333, 141 | seongsan,2019-09-02,24.566666666666666,2.0 142 | seongsan,2019-09-03,24.533333333333335,4.6 143 | seongsan,2019-09-04,25.400000000000002,3.0 144 | seongsan,2019-09-05,25.766666666666666,0.5666666666666667 145 | seongsan,2019-09-06,27.183333333333334, 146 | seongsan,2019-09-07,26.53333333333333,0.05 147 | 
seongsan,2019-09-08,24.083333333333332,0.1 148 | seongsan,2019-09-09,27.149999999999995,0.13333333333333333 149 | seongsan,2019-09-10,26.566666666666663, 150 | seongsan,2019-09-11,27.3, 151 | seongsan,2019-09-12,24.900000000000002, 152 | seongsan,2019-09-13,23.933333333333337, 153 | seongsan,2019-09-14,23.583333333333332, 154 | seongsan,2019-09-15,25.150000000000002, 155 | seongsan,2019-09-16,24.583333333333332, 156 | seongsan,2019-09-17,23.583333333333332, 157 | seongsan,2019-09-18,23.166666666666668, 158 | seongsan,2019-09-19,23.616666666666664, 159 | seongsan,2019-09-20,23.350000000000005, 160 | seongsan,2019-09-21,21.033333333333335,0.9166666666666666 161 | seongsan,2019-09-22,21.266666666666666,5.349999999999999 162 | seongsan,2019-09-23,19.833333333333332,0.0 163 | seongsan,2019-09-24,19.900000000000002, 164 | seongsan,2019-09-25,21.400000000000002, 165 | seongsan,2019-09-26,24.266666666666666,0.0 166 | seongsan,2019-09-27,24.200000000000003,0.25 167 | seongsan,2019-09-28,25.5, 168 | seongsan,2019-09-29,24.5, 169 | seongsan,2019-09-30,24.033333333333335, 170 | seongsan,2019-10-01,25.166666666666668, 171 | seongsan,2019-10-02,25.316666666666663,11.680000000000001 172 | seongsan,2019-10-03,23.549999999999997, 173 | seongsan,2019-10-04,23.53333333333333, 174 | seongsan,2019-10-05,21.733333333333334, 175 | seongsan,2019-10-06,21.333333333333332,0.05 176 | seongsan,2019-10-07,23.516666666666666, 177 | seongsan,2019-10-08,18.96666666666667, 178 | seongsan,2019-10-09,17.016666666666666, 179 | seongsan,2019-10-10,17.983333333333334, 180 | seongsan,2019-10-11,20.51666666666667, 181 | seongsan,2019-10-12,21.3, 182 | seongsan,2019-10-13,19.3, 183 | seongsan,2019-10-14,18.583333333333332, 184 | seongsan,2019-10-15,16.21666666666667,2.9 185 | seongsan,2019-10-16,18.88333333333333, 186 | -------------------------------------------------------------------------------- /ch04/problem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 4. 상점 신용카드 매출 예측\n", 8 | "## 4.1. 문제 정의\n", 9 | "### 4.1.4. 문제 해결을 위한 접근 방식 소개" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import os\n", 20 | "import warnings\n", 21 | "\n", 22 | "warnings.filterwarnings(\"ignore\")\n", 23 | "\n", 24 | "os.chdir('C:/dacon/ch04')\n", 25 | "train = pd.read_csv('./funda_train.csv')\n", 26 | "submission = pd.read_csv('./submission.csv')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "(6556613, 9)" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "train.shape" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/html": [ 57 | "
\n", 58 | "\n", 71 | "\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
store_idcard_idcard_companytransacted_datetransacted_timeinstallment_termregiontype_of_businessamount
000b2016-06-0113:130NaN기타 미용업1857.142857
101h2016-06-0118:120NaN기타 미용업857.142857
202c2016-06-0118:520NaN기타 미용업2000.000000
303a2016-06-0120:220NaN기타 미용업7857.142857
404c2016-06-0211:060NaN기타 미용업2000.000000
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " store_id card_id card_company transacted_date transacted_time \\\n", 153 | "0 0 0 b 2016-06-01 13:13 \n", 154 | "1 0 1 h 2016-06-01 18:12 \n", 155 | "2 0 2 c 2016-06-01 18:52 \n", 156 | "3 0 3 a 2016-06-01 20:22 \n", 157 | "4 0 4 c 2016-06-02 11:06 \n", 158 | "\n", 159 | " installment_term region type_of_business amount \n", 160 | "0 0 NaN 기타 미용업 1857.142857 \n", 161 | "1 0 NaN 기타 미용업 857.142857 \n", 162 | "2 0 NaN 기타 미용업 2000.000000 \n", 163 | "3 0 NaN 기타 미용업 7857.142857 \n", 164 | "4 0 NaN 기타 미용업 2000.000000 " 165 | ] 166 | }, 167 | "execution_count": 3, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "train.head()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "\n", 186 | "RangeIndex: 6556613 entries, 0 to 6556612\n", 187 | "Data columns (total 9 columns):\n", 188 | "store_id int64\n", 189 | "card_id int64\n", 190 | "card_company object\n", 191 | "transacted_date object\n", 192 | "transacted_time object\n", 193 | "installment_term int64\n", 194 | "region object\n", 195 | "type_of_business object\n", 196 | "amount float64\n", 197 | "dtypes: float64(1), int64(3), object(5)\n", 198 | "memory usage: 450.2+ MB\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "train.info()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 5, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/html": [ 214 | "
\n", 215 | "\n", 228 | "\n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | "
store_idcard_idcard_companytransacted_datetransacted_timeinstallment_termregiontype_of_businessamount
000b2016-06-0113:130NaN기타 미용업1857.142857
101h2016-06-0118:120NaN기타 미용업857.142857
202c2016-06-0118:520NaN기타 미용업2000.000000
303a2016-06-0120:220NaN기타 미용업7857.142857
404c2016-06-0211:060NaN기타 미용업2000.000000
\n", 306 | "
" 307 | ], 308 | "text/plain": [ 309 | " store_id card_id card_company transacted_date transacted_time \\\n", 310 | "0 0 0 b 2016-06-01 13:13 \n", 311 | "1 0 1 h 2016-06-01 18:12 \n", 312 | "2 0 2 c 2016-06-01 18:52 \n", 313 | "3 0 3 a 2016-06-01 20:22 \n", 314 | "4 0 4 c 2016-06-02 11:06 \n", 315 | "\n", 316 | " installment_term region type_of_business amount \n", 317 | "0 0 NaN 기타 미용업 1857.142857 \n", 318 | "1 0 NaN 기타 미용업 857.142857 \n", 319 | "2 0 NaN 기타 미용업 2000.000000 \n", 320 | "3 0 NaN 기타 미용업 7857.142857 \n", 321 | "4 0 NaN 기타 미용업 2000.000000 " 322 | ] 323 | }, 324 | "execution_count": 5, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "train[:5]" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/html": [ 341 | "
\n", 342 | "\n", 355 | "\n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | "
store_idcard_idcard_companytransacted_datetransacted_timeinstallment_termregiontype_of_businessamount
41040a2016-06-1017:262NaN기타 미용업-8571.428571
3470285a2016-08-0417:520NaN기타 미용업-1857.142857
7310473g2016-10-1710:320NaN기타 미용업-2000.000000
8310230b2016-11-0315:360NaN기타 미용업-85.714286
9440138a2016-11-2813:210NaN기타 미용업-57.142857
..............................
655624221364663626b2019-02-0121:190제주 제주시기타 주점업-13428.571429
655644821364663760d2019-02-1500:460제주 제주시기타 주점업-6928.571429
655648521364663779b2019-02-1802:450제주 제주시기타 주점업-5571.428571
655648921364663780d2019-02-1821:430제주 제주시기타 주점업-8571.428571
655660821364663855d2019-02-2823:200제주 제주시기타 주점업-4500.000000
\n", 505 | "

73100 rows × 9 columns

\n", 506 | "
" 507 | ], 508 | "text/plain": [ 509 | " store_id card_id card_company transacted_date transacted_time \\\n", 510 | "41 0 40 a 2016-06-10 17:26 \n", 511 | "347 0 285 a 2016-08-04 17:52 \n", 512 | "731 0 473 g 2016-10-17 10:32 \n", 513 | "831 0 230 b 2016-11-03 15:36 \n", 514 | "944 0 138 a 2016-11-28 13:21 \n", 515 | "... ... ... ... ... ... \n", 516 | "6556242 2136 4663626 b 2019-02-01 21:19 \n", 517 | "6556448 2136 4663760 d 2019-02-15 00:46 \n", 518 | "6556485 2136 4663779 b 2019-02-18 02:45 \n", 519 | "6556489 2136 4663780 d 2019-02-18 21:43 \n", 520 | "6556608 2136 4663855 d 2019-02-28 23:20 \n", 521 | "\n", 522 | " installment_term region type_of_business amount \n", 523 | "41 2 NaN 기타 미용업 -8571.428571 \n", 524 | "347 0 NaN 기타 미용업 -1857.142857 \n", 525 | "731 0 NaN 기타 미용업 -2000.000000 \n", 526 | "831 0 NaN 기타 미용업 -85.714286 \n", 527 | "944 0 NaN 기타 미용업 -57.142857 \n", 528 | "... ... ... ... ... \n", 529 | "6556242 0 제주 제주시 기타 주점업 -13428.571429 \n", 530 | "6556448 0 제주 제주시 기타 주점업 -6928.571429 \n", 531 | "6556485 0 제주 제주시 기타 주점업 -5571.428571 \n", 532 | "6556489 0 제주 제주시 기타 주점업 -8571.428571 \n", 533 | "6556608 0 제주 제주시 기타 주점업 -4500.000000 \n", 534 | "\n", 535 | "[73100 rows x 9 columns]" 536 | ] 537 | }, 538 | "execution_count": 6, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "train[train['amount']<0]" 545 | ] 546 | } 547 | ], 548 | "metadata": { 549 | "kernelspec": { 550 | "display_name": "[store_amount]", 551 | "language": "python", 552 | "name": "store_amount" 553 | }, 554 | "language_info": { 555 | "codemirror_mode": { 556 | "name": "ipython", 557 | "version": 3 558 | }, 559 | "file_extension": ".py", 560 | "mimetype": "text/x-python", 561 | "name": "python", 562 | "nbconvert_exporter": "python", 563 | "pygments_lexer": "ipython3", 564 | "version": "3.7.9" 565 | } 566 | }, 567 | "nbformat": 4, 568 | "nbformat_minor": 4 569 | } 570 | -------------------------------------------------------------------------------- /ch02/src/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | """ 5 | code for mlp with skip connection model. 6 | scalable model for ensemble. 7 | """ 8 | 9 | ############################## 10 | ###### Activation ############ 11 | ############################## 12 | 13 | class GELU(nn.Module): 14 | """ 15 | Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU 16 | """ 17 | def forward(self, x): 18 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, hidden_size, eps=1e-5): 22 | """Construct a layernorm module in the TF style (epsilon inside the square root). 
23 |         """
24 |         super(LayerNorm, self).__init__()
25 |         self.weight = nn.Parameter(torch.ones(hidden_size))
26 |         self.bias = nn.Parameter(torch.zeros(hidden_size))
27 |         self.variance_epsilon = eps
28 |
29 |         self.init_weights()
30 |
31 |     def init_weights(self):
32 |         self.weight.data.fill_(1.0)
33 |         self.bias.data.zero_()
34 |
35 |     def forward(self, x):
36 |         u = x.mean(-1, keepdim=True)
37 |         s = (x - u).pow(2).mean(-1, keepdim=True)
38 |         x = (x - u) / torch.sqrt(s + self.variance_epsilon)
39 |         return self.weight * x + self.bias
40 |
41 | ##################################
42 | ######## Free format #############
43 | ##################################
44 | """
45 | - Baseline :
46 |     - built as an MLP (ANN)
47 |     - skip connections applied => reuses earlier information efficiently
48 |     - LayerNorm : feature normalization in every block to encourage convergence
49 |     - GELU activation applied (differentiable, and extends computation to negative inputs)
50 | """
51 |
52 | # This SkipConnectionModel class takes the SkipConnectionModel class defined
53 | # earlier and adds layer normalization to it.
54 | class SkipConnectionModel(nn.Module):
55 |     """
56 |     >> model = SkipConnectionModel(fn_in=226, fn_out=4)
57 |     The channel sizes (2000, 4000, 7000, 10000, 300) are fixed below; the trailing *args are unused.
58 |     """
59 |     def __init__(self, fn_in=226, fn_out=4, *args):
60 |         super(SkipConnectionModel, self).__init__()
61 |         self.ln = LayerNorm(10000) #10000
62 |         self.ln1 = LayerNorm(7000) # 7000
63 |         self.ln2 = LayerNorm(4000) # 4000
64 |         self.ln3 = LayerNorm(2000) # 2000
65 |
66 |         self.upblock1 = nn.Sequential(nn.Linear(fn_in, 2000),GELU(),nn.BatchNorm1d(2000))
67 |         self.upblock2 = nn.Sequential(nn.Linear(2000,4000),GELU(),nn.BatchNorm1d(4000))
68 |         self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
69 |         self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
70 |
71 |         self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
72 |         self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
73 |         self.downblock3 = nn.Sequential(nn.Linear(4000, 2000),GELU(),nn.BatchNorm1d(2000))
74 |         self.downblock4 = nn.Sequential(nn.Linear(2000, 300),GELU(),nn.BatchNorm1d(300))
75 |
76 |         self.fclayer = nn.Sequential(nn.Linear(300, fn_out))
77 |         self.dropout = nn.Dropout(0.1)
78 |
79 |     def forward(self, x):
80 |         upblock1_out = self.upblock1(x)
81 |         upblock2_out = self.upblock2(upblock1_out)
82 |         upblock3_out = self.upblock3(upblock2_out)
83 |         upblock4_out = self.upblock4(upblock3_out)
84 |
85 |         # Normalize the outputs coming from the upblocks.
86 | downblock1_out = self.downblock1(self.ln(upblock4_out)) 87 | skipblock1 = downblock1_out + upblock3_out 88 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 89 | skipblock2 = downblock2_out + upblock2_out 90 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 91 | skipblock3 = downblock3_out + upblock1_out 92 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 93 | 94 | output = self.fclayer(downblock4_out) 95 | 96 | return output 97 | 98 | ######################################## 99 | ######################################## 100 | 101 | """ 102 | - Test Models for Ensemble 103 | """ 104 | 105 | class TestModel(nn.Module): 106 | def __init__(self): 107 | super(TestModel, self).__init__() 108 | 109 | # self.ln = LayerNorm(1args[4]0) 110 | self.ln = LayerNorm(10000) 111 | self.ln1 = LayerNorm(7000) 112 | self.ln2 = LayerNorm(4000) 113 | self.ln3 = LayerNorm(2000) 114 | 115 | self.upblock1 = nn.Sequential(nn.Linear(226, 2000),GELU(),nn.BatchNorm1d(2000)) 116 | self.upblock2 = nn.Sequential(nn.Linear(2000,4000),GELU(),nn.BatchNorm1d(4000)) 117 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000)) 118 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000)) 119 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000)) 120 | 121 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000)) 122 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000)) 123 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000)) 124 | self.downblock3 = nn.Sequential(nn.Linear(4000, 2000),GELU(),nn.BatchNorm1d(2000)) 125 | self.downblock4 = nn.Sequential(nn.Linear(2000, 300),GELU(),nn.BatchNorm1d(300)) 126 | 127 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 128 | self.dropout = nn.Dropout(0.1) 129 | 130 | def forward(self, x): 131 | upblock1_out = self.upblock1(x) 132 | upblock2_out = self.upblock2(upblock1_out) 133 | upblock3_out = self.upblock3(upblock2_out) 134 | upblock4_out = self.upblock4(upblock3_out) 135 | #upblock5_out = self.upblock5(upblock4_out) 136 | 137 | downblock1_out = self.downblock1(self.ln(upblock4_out)) 138 | skipblock1 = downblock1_out + upblock3_out 139 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 140 | skipblock2 = downblock2_out + upblock2_out 141 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 142 | skipblock3 = downblock3_out + upblock1_out 143 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 144 | 145 | output = self.fclayer(downblock4_out) 146 | 147 | return output 148 | 149 | 150 | class TestModel1(nn.Module): 151 | def __init__(self): 152 | super(TestModel1, self).__init__() 153 | 154 | # self.ln = LayerNorm(13000) 155 | self.ln = LayerNorm(10000) 156 | self.ln1 = LayerNorm(7000) 157 | self.ln2 = LayerNorm(4000) 158 | self.ln3 = LayerNorm(1000) 159 | 160 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000)) 161 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000)) 162 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000)) 163 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000)) 164 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000)) 165 | 166 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000)) 167 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000)) 168 
| self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000)) 169 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000)) 170 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300)) 171 | 172 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 173 | self.dropout = nn.Dropout(0.1) 174 | 175 | def forward(self, x): 176 | upblock1_out = self.upblock1(x) 177 | upblock2_out = self.upblock2(upblock1_out) 178 | upblock3_out = self.upblock3(upblock2_out) 179 | upblock4_out = self.upblock4(upblock3_out) 180 | #upblock5_out = self.upblock5(upblock4_out) 181 | 182 | downblock1_out = self.downblock1(self.ln(upblock4_out)) 183 | skipblock1 = downblock1_out + upblock3_out 184 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 185 | skipblock2 = downblock2_out + upblock2_out 186 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 187 | skipblock3 = downblock3_out + upblock1_out 188 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 189 | 190 | output = self.fclayer(downblock4_out) 191 | 192 | return output 193 | 194 | # Model 2 195 | 196 | class TestModel2(nn.Module): 197 | def __init__(self): 198 | super(TestModel2, self).__init__() 199 | 200 | # self.ln = LayerNorm(13000) 201 | self.ln = LayerNorm(20000) 202 | self.ln1 = LayerNorm(13000) 203 | self.ln2 = LayerNorm(7000) 204 | self.ln3 = LayerNorm(4000) 205 | self.ln4 = LayerNorm(1000) 206 | self.ln5 = LayerNorm(13000) 207 | 208 | 209 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 210 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),nn.ReLU(),nn.BatchNorm1d(4000)) 211 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000),nn.ReLU(),nn.BatchNorm1d(7000)) 212 | self.upblock4 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 213 | self.upblock5 = nn.Sequential(nn.Linear(13000,20000),nn.ReLU(),nn.BatchNorm1d(20000)) 214 | self.upblock6 = nn.Sequential(nn.Linear(20000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 215 | 216 | self.downblock1 = nn.Sequential(nn.Linear(13000, 20000),nn.ReLU(),nn.BatchNorm1d(20000)) 217 | self.downblock2 = nn.Sequential(nn.Linear(20000, 13000),nn.ReLU(),nn.BatchNorm1d(13000)) 218 | self.downblock3 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000)) 219 | self.downblock4 = nn.Sequential(nn.Linear(7000, 4000),nn.ReLU(),nn.BatchNorm1d(4000)) 220 | self.downblock5 = nn.Sequential(nn.Linear(4000, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 221 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300)) 222 | 223 | 224 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 225 | self.dropout = nn.Dropout(0.1) 226 | 227 | def forward(self, x): 228 | upblock1_out = self.upblock1(x) 229 | upblock2_out = self.upblock2(upblock1_out) 230 | upblock3_out = self.upblock3(upblock2_out) 231 | upblock4_out = self.upblock4(upblock3_out) 232 | upblock5_out = self.upblock5(upblock4_out) 233 | upblock6_out = self.upblock6(upblock5_out) 234 | 235 | 236 | downblock1_out = self.downblock1(self.ln1(upblock6_out)) 237 | skipblock1 = downblock1_out + upblock5_out # 20000 238 | downblock2_out = self.downblock2(self.ln(skipblock1)) 239 | skipblock2 = downblock2_out + upblock4_out # 13000 240 | downblock3_out = self.downblock3(self.ln5(skipblock2)) 241 | skipblock3 = downblock3_out + upblock3_out # 7000 242 | downblock4_out = self.downblock4(self.ln2(skipblock3)) 243 | skipblock4 = downblock4_out + upblock2_out # 4000 244 | 245 | downblock5_out = 
self.downblock5(self.ln3(skipblock4)) 246 | skipblock5 = downblock5_out + upblock1_out 247 | downblock6_out = self.downblock6(self.ln4(skipblock5)) 248 | 249 | output = self.fclayer(downblock6_out) 250 | 251 | return output 252 | 253 | # Model3 254 | class TestModel3(nn.Module): 255 | """ 256 | Model for (20,40) 257 | """ 258 | def __init__(self): 259 | super(TestModel3, self).__init__() 260 | 261 | self.ln = LayerNorm(17000) 262 | self.ln1 = LayerNorm(13000) 263 | self.ln2 = LayerNorm(7000) 264 | self.ln3 = LayerNorm(5000) 265 | 266 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 267 | self.upblock2 = nn.Sequential(nn.Linear(1000,3000),nn.ReLU(),nn.BatchNorm1d(3000)) 268 | self.upblock3 = nn.Sequential(nn.Linear(3000,5000),nn.ReLU(),nn.BatchNorm1d(5000)) 269 | self.upblock4 = nn.Sequential(nn.Linear(5000,7000),nn.ReLU(),nn.BatchNorm1d(7000)) 270 | self.upblock5 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 271 | self.upblock6 = nn.Sequential(nn.Linear(13000,17000),nn.ReLU(),nn.BatchNorm1d(17000)) 272 | 273 | self.downblock1 = nn.Sequential(nn.Linear(17000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 274 | self.downblock2 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000)) 275 | self.downblock3 = nn.Sequential(nn.Linear(7000, 5000),nn.ReLU(),nn.BatchNorm1d(5000)) 276 | self.downblock4 = nn.Sequential(nn.Linear(5000, 3000),nn.ReLU(),nn.BatchNorm1d(3000)) 277 | self.downblock5 = nn.Sequential(nn.Linear(3000, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 278 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300)) 279 | 280 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 281 | self.dropout = nn.Dropout(0.1) 282 | 283 | def forward(self, x): 284 | upblock1_out = self.upblock1(x) 285 | upblock2_out = self.upblock2(upblock1_out) 286 | upblock3_out = self.upblock3(upblock2_out) 287 | upblock4_out = self.upblock4(upblock3_out) 288 | upblock5_out = self.upblock5(upblock4_out) 289 | upblock6_out = self.upblock6(upblock5_out) 290 | 291 | downblock1_out = self.dropout(self.downblock1(self.ln(upblock6_out))) 292 | skipblock1 = downblock1_out + upblock5_out 293 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 294 | skipblock2 = downblock2_out + upblock4_out 295 | downblock3_out = self.dropout(self.downblock3(self.ln2(skipblock2))) 296 | skipblock3 = downblock3_out + upblock3_out 297 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 298 | skipblock4 = downblock4_out + upblock2_out 299 | downblock5_out = self.downblock5(skipblock4) 300 | skipblock5 = self.dropout(downblock5_out + upblock1_out) 301 | downblock6_out = self.downblock6(skipblock5) 302 | 303 | output = self.fclayer(downblock6_out) 304 | 305 | return output 306 | 307 | class TestModel4(nn.Module): 308 | def __init__(self): 309 | super(TestModel4, self).__init__() 310 | 311 | self.ln = LayerNorm(10000) 312 | self.ln1 = LayerNorm(7000) 313 | self.ln2 = LayerNorm(4000) 314 | self.ln3 = LayerNorm(1000) 315 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 316 | self.upblock2 = nn.Sequential(nn.Linear(1000,10000),nn.ReLU(),nn.BatchNorm1d(10000)) 317 | self.upblock3 = nn.Sequential(nn.Linear(10000,7000), nn.ReLU(),nn.BatchNorm1d(7000)) 318 | self.upblock4 = nn.Sequential(nn.Linear(7000,4000),nn.ReLU(),nn.BatchNorm1d(4000)) 319 | 320 | self.downblock1 = nn.Sequential(nn.Linear(4000, 7000),nn.ReLU(),nn.BatchNorm1d(7000)) 321 | self.downblock2 = nn.Sequential(nn.Linear(7000, 
10000),nn.ReLU(),nn.BatchNorm1d(10000)) 322 | self.downblock3 = nn.Sequential(nn.Linear(10000, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 323 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300)) 324 | 325 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 326 | self.dropout = nn.Dropout(0.1) 327 | 328 | def forward(self, x): 329 | upblock1_out = self.upblock1(x) 330 | upblock2_out = self.dropout(self.upblock2(upblock1_out)) 331 | upblock3_out = self.dropout(self.upblock3(upblock2_out)) 332 | upblock4_out = self.dropout(self.upblock4(upblock3_out)) 333 | 334 | downblock1_out = self.downblock1(self.ln2(upblock4_out)) 335 | skipblock1 = downblock1_out + upblock3_out # 7000 336 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 337 | skipblock2 = downblock2_out + upblock2_out # 10000 338 | downblock3_out = self.downblock3(self.ln(skipblock2)) 339 | skipblock3 = downblock3_out + upblock1_out 340 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 341 | 342 | output = self.fclayer(downblock4_out) 343 | 344 | return output 345 | 346 | class TestModel5(nn.Module): 347 | def __init__(self): 348 | super(TestModel5, self).__init__() 349 | 350 | self.ln = LayerNorm(13000) 351 | self.ln1 = LayerNorm(11000) 352 | self.ln2 = LayerNorm(7000) 353 | self.ln3 = LayerNorm(4000) 354 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000)) 355 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000)) 356 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000)) 357 | self.upblock4 = nn.Sequential(nn.Linear(7000,11000),GELU(),nn.BatchNorm1d(11000)) 358 | self.upblock5 = nn.Sequential(nn.Linear(11000,13000),GELU(),nn.BatchNorm1d(13000)) 359 | 360 | self.downblock1 = nn.Sequential(nn.Linear(13000, 11000),GELU(),nn.BatchNorm1d(11000)) 361 | self.downblock2 = nn.Sequential(nn.Linear(11000, 7000),GELU(),nn.BatchNorm1d(7000)) 362 | self.downblock3 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000)) 363 | self.downblock4 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000)) 364 | self.downblock5 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300)) 365 | 366 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 367 | self.dropout = nn.Dropout(0.1) 368 | 369 | def forward(self, x): 370 | upblock1_out = self.upblock1(x) 371 | upblock2_out = self.upblock2(upblock1_out) 372 | upblock3_out = self.upblock3(upblock2_out) 373 | upblock4_out = self.upblock4(upblock3_out) 374 | upblock5_out = self.upblock5(upblock4_out) 375 | 376 | downblock1_out = self.downblock1(self.ln(upblock5_out)) 377 | skipblock1 = downblock1_out + upblock4_out 378 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 379 | skipblock2 = downblock2_out + upblock3_out 380 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 381 | skipblock3 = downblock3_out + upblock2_out 382 | downblock4_out = self.dropout(self.downblock4(self.ln3(skipblock3))) 383 | skipblock4 = downblock4_out + upblock1_out 384 | downblock5_out = self.downblock5(skipblock4) 385 | 386 | output = self.fclayer(downblock5_out) 387 | 388 | return output 389 | 390 | 391 | class TestModel6(nn.Module): 392 | def __init__(self): 393 | super(TestModel6, self).__init__() 394 | 395 | # self.ln = LayerNorm(13000) 396 | self.ln = LayerNorm(10000) 397 | self.ln1 = LayerNorm(7000) 398 | self.ln2 = LayerNorm(4000) 399 | 400 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000)) 401 | self.upblock2 = 
nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
402 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
403 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
404 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
405 | 
406 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
407 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
408 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
409 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
410 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
411 | 
412 | self.fclayer = nn.Sequential(nn.Linear(300,4))
413 | self.dropout = nn.Dropout(0.1)
414 | 
415 | def forward(self, x):
416 | upblock1_out = self.upblock1(x)
417 | upblock2_out = self.upblock2(upblock1_out)
418 | upblock3_out = self.upblock3(upblock2_out)
419 | upblock4_out = self.upblock4(upblock3_out)
420 | #upblock5_out = self.upblock5(upblock4_out)
421 | 
422 | downblock1_out = self.downblock1(self.ln(upblock4_out))
423 | skipblock1 = downblock1_out + upblock3_out
424 | downblock2_out = self.downblock2(self.ln1(skipblock1))
425 | skipblock2 = downblock2_out + upblock2_out
426 | downblock3_out = self.downblock3(self.ln2(skipblock2))
427 | skipblock3 = downblock3_out + upblock1_out
428 | downblock4_out = self.downblock4(skipblock3)
429 | 
430 | output = self.fclayer(downblock4_out)
431 | 
432 | return output
433 | 
--------------------------------------------------------------------------------
/ch02/src/get_score.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # ## Score reproduction code
5 | #
6 | # - Baseline: an MLP (ANN) model
7 | # - Skip connections => reuse information from earlier blocks efficiently
8 | # - LayerNorm: feature normalization in every block to help convergence
9 | # - GELU activation (differentiable everywhere, and lets negative values contribute as well)
10 | #
11 | # - 3 representative configurations built by combination, plus 6 derived variants
12 | 
13 | import os
14 | import math
15 | import time
16 | from itertools import chain
17 | 
18 | import numpy as np
19 | import pandas as pd
20 | import torch
21 | import torch.nn as nn
22 | 
23 | from torch.utils.data import Dataset, DataLoader
24 | from tqdm.auto import tqdm
25 | 
26 | 
27 | # ## Model definitions
28 | # > TestModel ~ TestModel6
29 | #
30 | # weight list:
31 | # 'test_(9, 49)_0.001_150.pth'-(819), 'test_(20, 40)_0.001_80.pth'-(2.92), 'test_(20, 57)_0.001_100.pth'-(819),
32 | # 'test_(12, 20)_0.001_100.pth'-(853), 'test_(10, 7)_0.001_100.pth'-(867), 'test_(0, 54)_0.001_70.pth'-(819),
33 | # 'test_(16, 47)_0.0001_80.pth'-(2.02), 'test_(11, 43)_0.0001_70.pth'-(2.02G), 'test_(0, 9)_0.0001_70.pth'-(819),
34 | # 'test_(19, 4)_0.001_200.pth'-(819)
35 | #
36 | #
37 | # >
38 | #
39 | # TestModel : (12, 20)
40 | # TestModel1 : (0, 54)
41 | # TestModel2 : (2.40)
42 | # TestModel4 : (10, 7)
43 | # TestModel5 : (16, 47), (11, 43), (0, 9)
44 | # TestModel6 : (19, 4), (20, 57), (9, 49)
45 | #
46 | 
47 | # In[2]:
48 | 
49 | 
50 | # The individual models
51 | 
52 | class GELU(nn.Module):
53 | """
54 | Paper section 3.4, last paragraph: note that BERT uses GELU rather than ReLU.
55 | """
56 | def forward(self, x):
57 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
58 | 
59 | class LayerNorm(nn.Module):
60 | def __init__(self, hidden_size, eps=1e-5):
61 | """Construct a layernorm module in the TF
style (epsilon inside the square root). 62 | """ 63 | super(LayerNorm, self).__init__() 64 | self.weight = nn.Parameter(torch.ones(hidden_size)) 65 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 66 | self.variance_epsilon = eps 67 | 68 | self.init_weights() 69 | 70 | def init_weights(self): 71 | self.weight.data.fill_(1.0) 72 | self.bias.data.zero_() 73 | 74 | def forward(self, x): 75 | u = x.mean(-1, keepdim=True) 76 | s = (x - u).pow(2).mean(-1, keepdim=True) 77 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 78 | return self.weight * x + self.bias 79 | 80 | # Model 1 81 | class TestModel(nn.Module): 82 | def __init__(self): 83 | super(TestModel, self).__init__() 84 | 85 | # self.ln = LayerNorm(13000) 86 | self.ln = LayerNorm(10000) 87 | self.ln1 = LayerNorm(7000) 88 | self.ln2 = LayerNorm(4000) 89 | self.ln3 = LayerNorm(2000) 90 | 91 | self.upblock1 = nn.Sequential(nn.Linear(226, 2000),GELU(),nn.BatchNorm1d(2000)) 92 | self.upblock2 = nn.Sequential(nn.Linear(2000,4000),GELU(),nn.BatchNorm1d(4000)) 93 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000)) 94 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000)) 95 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000)) 96 | 97 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000)) 98 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000)) 99 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000)) 100 | self.downblock3 = nn.Sequential(nn.Linear(4000, 2000),GELU(),nn.BatchNorm1d(2000)) 101 | self.downblock4 = nn.Sequential(nn.Linear(2000, 300),GELU(),nn.BatchNorm1d(300)) 102 | 103 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 104 | self.dropout = nn.Dropout(0.1) 105 | 106 | def forward(self, x): 107 | upblock1_out = self.upblock1(x) 108 | upblock2_out = self.upblock2(upblock1_out) 109 | upblock3_out = self.upblock3(upblock2_out) 110 | upblock4_out = self.upblock4(upblock3_out) 111 | #upblock5_out = self.upblock5(upblock4_out) 112 | 113 | downblock1_out = self.downblock1(self.ln(upblock4_out)) 114 | skipblock1 = downblock1_out + upblock3_out 115 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 116 | skipblock2 = downblock2_out + upblock2_out 117 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 118 | skipblock3 = downblock3_out + upblock1_out 119 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 120 | 121 | output = self.fclayer(downblock4_out) 122 | 123 | return output 124 | 125 | class TestModel1(nn.Module): 126 | def __init__(self): 127 | super(TestModel1, self).__init__() 128 | 129 | # self.ln = LayerNorm(13000) 130 | self.ln = LayerNorm(10000) 131 | self.ln1 = LayerNorm(7000) 132 | self.ln2 = LayerNorm(4000) 133 | self.ln3 = LayerNorm(1000) 134 | 135 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000)) 136 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000)) 137 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000)) 138 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000)) 139 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000)) 140 | 141 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000)) 142 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000)) 143 | self.downblock2 = nn.Sequential(nn.Linear(7000, 
4000),GELU(),nn.BatchNorm1d(4000)) 144 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000)) 145 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300)) 146 | 147 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 148 | self.dropout = nn.Dropout(0.1) 149 | 150 | def forward(self, x): 151 | upblock1_out = self.upblock1(x) 152 | upblock2_out = self.upblock2(upblock1_out) 153 | upblock3_out = self.upblock3(upblock2_out) 154 | upblock4_out = self.upblock4(upblock3_out) 155 | #upblock5_out = self.upblock5(upblock4_out) 156 | 157 | downblock1_out = self.downblock1(self.ln(upblock4_out)) 158 | skipblock1 = downblock1_out + upblock3_out 159 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 160 | skipblock2 = downblock2_out + upblock2_out 161 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 162 | skipblock3 = downblock3_out + upblock1_out 163 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 164 | 165 | output = self.fclayer(downblock4_out) 166 | 167 | return output 168 | 169 | # Model 2 170 | 171 | class TestModel2(nn.Module): 172 | def __init__(self): 173 | super(TestModel2, self).__init__() 174 | 175 | # self.ln = LayerNorm(13000) 176 | self.ln = LayerNorm(20000) 177 | self.ln1 = LayerNorm(13000) 178 | self.ln2 = LayerNorm(7000) 179 | self.ln3 = LayerNorm(4000) 180 | self.ln4 = LayerNorm(1000) 181 | self.ln5 = LayerNorm(13000) 182 | 183 | 184 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 185 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),nn.ReLU(),nn.BatchNorm1d(4000)) 186 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000),nn.ReLU(),nn.BatchNorm1d(7000)) 187 | self.upblock4 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 188 | self.upblock5 = nn.Sequential(nn.Linear(13000,20000),nn.ReLU(),nn.BatchNorm1d(20000)) 189 | self.upblock6 = nn.Sequential(nn.Linear(20000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 190 | 191 | self.downblock1 = nn.Sequential(nn.Linear(13000, 20000),nn.ReLU(),nn.BatchNorm1d(20000)) 192 | self.downblock2 = nn.Sequential(nn.Linear(20000, 13000),nn.ReLU(),nn.BatchNorm1d(13000)) 193 | self.downblock3 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000)) 194 | self.downblock4 = nn.Sequential(nn.Linear(7000, 4000),nn.ReLU(),nn.BatchNorm1d(4000)) 195 | self.downblock5 = nn.Sequential(nn.Linear(4000, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 196 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300)) 197 | 198 | 199 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 200 | self.dropout = nn.Dropout(0.1) 201 | 202 | def forward(self, x): 203 | upblock1_out = self.upblock1(x) 204 | upblock2_out = self.upblock2(upblock1_out) 205 | upblock3_out = self.upblock3(upblock2_out) 206 | upblock4_out = self.upblock4(upblock3_out) 207 | upblock5_out = self.upblock5(upblock4_out) 208 | upblock6_out = self.upblock6(upblock5_out) 209 | 210 | 211 | downblock1_out = self.downblock1(self.ln1(upblock6_out)) 212 | skipblock1 = downblock1_out + upblock5_out # 20000 213 | downblock2_out = self.downblock2(self.ln(skipblock1)) 214 | skipblock2 = downblock2_out + upblock4_out # 13000 215 | downblock3_out = self.downblock3(self.ln5(skipblock2)) 216 | skipblock3 = downblock3_out + upblock3_out # 7000 217 | downblock4_out = self.downblock4(self.ln2(skipblock3)) 218 | skipblock4 = downblock4_out + upblock2_out # 4000 219 | 220 | downblock5_out = self.downblock5(self.ln3(skipblock4)) 221 | skipblock5 = downblock5_out + 
upblock1_out 222 | downblock6_out = self.downblock6(self.ln4(skipblock5)) 223 | 224 | output = self.fclayer(downblock6_out) 225 | 226 | return output 227 | 228 | # Model3 229 | class TestModel3(nn.Module): 230 | """ 231 | Model for (20,40) 232 | """ 233 | def __init__(self): 234 | super(TestModel3, self).__init__() 235 | 236 | self.ln = LayerNorm(17000) 237 | self.ln1 = LayerNorm(13000) 238 | self.ln2 = LayerNorm(7000) 239 | self.ln3 = LayerNorm(5000) 240 | 241 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 242 | self.upblock2 = nn.Sequential(nn.Linear(1000,3000),nn.ReLU(),nn.BatchNorm1d(3000)) 243 | self.upblock3 = nn.Sequential(nn.Linear(3000,5000),nn.ReLU(),nn.BatchNorm1d(5000)) 244 | self.upblock4 = nn.Sequential(nn.Linear(5000,7000),nn.ReLU(),nn.BatchNorm1d(7000)) 245 | self.upblock5 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 246 | self.upblock6 = nn.Sequential(nn.Linear(13000,17000),nn.ReLU(),nn.BatchNorm1d(17000)) 247 | 248 | self.downblock1 = nn.Sequential(nn.Linear(17000,13000),nn.ReLU(),nn.BatchNorm1d(13000)) 249 | self.downblock2 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000)) 250 | self.downblock3 = nn.Sequential(nn.Linear(7000, 5000),nn.ReLU(),nn.BatchNorm1d(5000)) 251 | self.downblock4 = nn.Sequential(nn.Linear(5000, 3000),nn.ReLU(),nn.BatchNorm1d(3000)) 252 | self.downblock5 = nn.Sequential(nn.Linear(3000, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 253 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300)) 254 | 255 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 256 | self.dropout = nn.Dropout(0.1) 257 | 258 | def forward(self, x): 259 | upblock1_out = self.upblock1(x) 260 | upblock2_out = self.upblock2(upblock1_out) 261 | upblock3_out = self.upblock3(upblock2_out) 262 | upblock4_out = self.upblock4(upblock3_out) 263 | upblock5_out = self.upblock5(upblock4_out) 264 | upblock6_out = self.upblock6(upblock5_out) 265 | 266 | downblock1_out = self.dropout(self.downblock1(self.ln(upblock6_out))) 267 | skipblock1 = downblock1_out + upblock5_out 268 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 269 | skipblock2 = downblock2_out + upblock4_out 270 | downblock3_out = self.dropout(self.downblock3(self.ln2(skipblock2))) 271 | skipblock3 = downblock3_out + upblock3_out 272 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 273 | skipblock4 = downblock4_out + upblock2_out 274 | downblock5_out = self.downblock5(skipblock4) 275 | skipblock5 = self.dropout(downblock5_out + upblock1_out) 276 | downblock6_out = self.downblock6(skipblock5) 277 | 278 | output = self.fclayer(downblock6_out) 279 | 280 | return output 281 | 282 | class TestModel4(nn.Module): 283 | def __init__(self): 284 | super(TestModel4, self).__init__() 285 | 286 | self.ln = LayerNorm(10000) 287 | self.ln1 = LayerNorm(7000) 288 | self.ln2 = LayerNorm(4000) 289 | self.ln3 = LayerNorm(1000) 290 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000)) 291 | self.upblock2 = nn.Sequential(nn.Linear(1000,10000),nn.ReLU(),nn.BatchNorm1d(10000)) 292 | self.upblock3 = nn.Sequential(nn.Linear(10000,7000), nn.ReLU(),nn.BatchNorm1d(7000)) 293 | self.upblock4 = nn.Sequential(nn.Linear(7000,4000),nn.ReLU(),nn.BatchNorm1d(4000)) 294 | 295 | self.downblock1 = nn.Sequential(nn.Linear(4000, 7000),nn.ReLU(),nn.BatchNorm1d(7000)) 296 | self.downblock2 = nn.Sequential(nn.Linear(7000, 10000),nn.ReLU(),nn.BatchNorm1d(10000)) 297 | self.downblock3 = nn.Sequential(nn.Linear(10000, 
1000),nn.ReLU(),nn.BatchNorm1d(1000)) 298 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300)) 299 | 300 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 301 | self.dropout = nn.Dropout(0.1) 302 | 303 | def forward(self, x): 304 | upblock1_out = self.upblock1(x) 305 | upblock2_out = self.dropout(self.upblock2(upblock1_out)) 306 | upblock3_out = self.dropout(self.upblock3(upblock2_out)) 307 | upblock4_out = self.dropout(self.upblock4(upblock3_out)) 308 | 309 | downblock1_out = self.downblock1(self.ln2(upblock4_out)) 310 | skipblock1 = downblock1_out + upblock3_out # 7000 311 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 312 | skipblock2 = downblock2_out + upblock2_out # 10000 313 | downblock3_out = self.downblock3(self.ln(skipblock2)) 314 | skipblock3 = downblock3_out + upblock1_out 315 | downblock4_out = self.downblock4(self.ln3(skipblock3)) 316 | 317 | output = self.fclayer(downblock4_out) 318 | 319 | return output 320 | 321 | class TestModel5(nn.Module): 322 | def __init__(self): 323 | super(TestModel5, self).__init__() 324 | 325 | self.ln = LayerNorm(13000) 326 | self.ln1 = LayerNorm(11000) 327 | self.ln2 = LayerNorm(7000) 328 | self.ln3 = LayerNorm(4000) 329 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000)) 330 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000)) 331 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000)) 332 | self.upblock4 = nn.Sequential(nn.Linear(7000,11000),GELU(),nn.BatchNorm1d(11000)) 333 | self.upblock5 = nn.Sequential(nn.Linear(11000,13000),GELU(),nn.BatchNorm1d(13000)) 334 | 335 | self.downblock1 = nn.Sequential(nn.Linear(13000, 11000),GELU(),nn.BatchNorm1d(11000)) 336 | self.downblock2 = nn.Sequential(nn.Linear(11000, 7000),GELU(),nn.BatchNorm1d(7000)) 337 | self.downblock3 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000)) 338 | self.downblock4 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000)) 339 | self.downblock5 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300)) 340 | 341 | self.fclayer = nn.Sequential(nn.Linear(300,4)) 342 | self.dropout = nn.Dropout(0.1) 343 | 344 | def forward(self, x): 345 | upblock1_out = self.upblock1(x) 346 | upblock2_out = self.upblock2(upblock1_out) 347 | upblock3_out = self.upblock3(upblock2_out) 348 | upblock4_out = self.upblock4(upblock3_out) 349 | upblock5_out = self.upblock5(upblock4_out) 350 | 351 | downblock1_out = self.downblock1(self.ln(upblock5_out)) 352 | skipblock1 = downblock1_out + upblock4_out 353 | downblock2_out = self.downblock2(self.ln1(skipblock1)) 354 | skipblock2 = downblock2_out + upblock3_out 355 | downblock3_out = self.downblock3(self.ln2(skipblock2)) 356 | skipblock3 = downblock3_out + upblock2_out 357 | downblock4_out = self.dropout(self.downblock4(self.ln3(skipblock3))) 358 | skipblock4 = downblock4_out + upblock1_out 359 | downblock5_out = self.downblock5(skipblock4) 360 | 361 | output = self.fclayer(downblock5_out) 362 | 363 | return output 364 | 365 | 366 | class TestModel6(nn.Module): 367 | def __init__(self): 368 | super(TestModel6, self).__init__() 369 | 370 | # self.ln = LayerNorm(13000) 371 | self.ln = LayerNorm(10000) 372 | self.ln1 = LayerNorm(7000) 373 | self.ln2 = LayerNorm(4000) 374 | 375 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000)) 376 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000)) 377 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), 
GELU(),nn.BatchNorm1d(7000))
378 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
379 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
380 | 
381 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
382 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
383 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
384 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
385 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
386 | 
387 | self.fclayer = nn.Sequential(nn.Linear(300,4))
388 | self.dropout = nn.Dropout(0.1)
389 | 
390 | def forward(self, x):
391 | upblock1_out = self.upblock1(x)
392 | upblock2_out = self.upblock2(upblock1_out)
393 | upblock3_out = self.upblock3(upblock2_out)
394 | upblock4_out = self.upblock4(upblock3_out)
395 | #upblock5_out = self.upblock5(upblock4_out)
396 | 
397 | downblock1_out = self.downblock1(self.ln(upblock4_out))
398 | skipblock1 = downblock1_out + upblock3_out
399 | downblock2_out = self.downblock2(self.ln1(skipblock1))
400 | skipblock2 = downblock2_out + upblock2_out
401 | downblock3_out = self.downblock3(self.ln2(skipblock2))
402 | skipblock3 = downblock3_out + upblock1_out
403 | downblock4_out = self.downblock4(skipblock3)
404 | 
405 | output = self.fclayer(downblock4_out)
406 | 
407 | return output
408 | 
409 | 
410 | # ## Load weights and Test
411 | #
412 | # - Fetch the trained pth files.
413 | # - Read the test file and run each model in evaluation mode.
414 | # - Write the results out as csv files.
415 | #
416 | 
417 | # In[11]:
418 | 
419 | 
420 | # Define the models in a dictionary so each one can be looked up by name.
421 | models = {
422 | 'model':TestModel(),
423 | 'model1': TestModel1(),
424 | 'model2': TestModel2(),
425 | 'model3': TestModel3(),
426 | 'model4': TestModel4(),
427 | 'model5': TestModel5(),
428 | 'model6': TestModel6()
429 | }
430 | 
431 | 
432 | # Path to the test file
433 | path_test = 'test.csv'
434 | # List of pth files
435 | pth_list = os.listdir('./outputs') # 'outputs' is the directory holding the saved pth files
436 | 
437 | print(pth_list) # 2.pth > scheduled to be renamed to test_(10, 12)_0.0005_200
438 | 
439 | # Create the directory the csv files will be saved to.
440 | if os.path.exists('test'): # 'test' can be changed to whatever suits the user.
441 | pass
442 | else:
443 | os.mkdir('test')
444 | 
445 | 
446 | # In[10]:
447 | 
448 | 
449 | os.path.exists('test') # quick sanity check that the directory exists
450 | 
451 | 
452 | # In[4]:
453 | 
454 | 
455 | # Test
456 | # Define and load the test dataset.
457 | class TestDataset(Dataset):
458 | def __init__(self, path_test):
459 | super(TestDataset, self).__init__()
460 | test = pd.read_csv(path_test)
461 | self.test_X = test.iloc[:,1:]
462 | self.tmp_x = self.test_X.values
463 | 
464 | def __len__(self):
465 | return len(self.test_X)
466 | 
467 | def __getitem__(self, idx):
468 | return torch.from_numpy(self.tmp_x)[idx]
469 | 
470 | test_data = TestDataset(path_test)
471 | test_loader = DataLoader(test_data, batch_size=10000, num_workers=4)
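# Note: batch_size=10000 is chosen so that the entire test set fits in a single
# batch; test_model below keeps only the predictions of the last batch it sees,
# so a smaller batch size would silently drop earlier rows.
# num_workers=4 assumes a multi-core machine; if the loader hangs (e.g. on
# Windows), num_workers=0 is a safe fallback.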
472 | 
473 | 
474 | # In[5]:
475 | 
476 | 
477 | # Load the pth weights, run the model on the test set, and write the result as a csv file.
478 | def test_model(path_pth, test_loader, model_type: str):
479 | model = models[model_type]
480 | ws = torch.load(f'./outputs/{path_pth}', map_location='cpu') # load the weights
481 | model.load_state_dict(ws)
482 | model.eval()
483 | 
484 | with torch.no_grad():
485 | for data in test_loader:
486 | outputs = model(data.float()) # run the model on the test data
487 | pred_test = outputs
488 | 
489 | sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
490 | layers = ['layer_1','layer_2','layer_3','layer_4']
491 | submission = sample_sub.values + pred_test.numpy()
492 | 
493 | submission = pd.DataFrame(data=submission,columns=layers)
494 | submission.to_csv(f'./test/{path_pth[:-4]}.csv', index_label='id') # save the csv file under the 'test' directory
495 | 
496 | 
497 | # In[6]:
498 | 
499 | 
500 | # Write a prediction file for every model that goes into the ensemble (the leading digit of the pth file name selects the matching architecture).
501 | for pth in sorted(pth_list):
502 | if pth[-3:] != 'pth':
503 | pass
504 | else:
505 | if int(pth[0]) == 0:
506 | test_model(pth, test_loader, model_type='model')
507 | elif int(pth[0]) == 1:
508 | test_model(pth, test_loader, model_type='model1')
509 | elif int(pth[0]) == 2:
510 | #test_model(pth, test_loader, model_type='model2')
511 | pass
512 | elif int(pth[0]) == 3:
513 | test_model(pth, test_loader, model_type='model4')
514 | elif int(pth[0]) > 3 and int(pth[0]) <7:
515 | test_model(pth, test_loader, model_type='model5')
516 | elif int(pth[0])>= 7:
517 | test_model(pth, test_loader, model_type='model6')
518 | 
519 | 
520 | # In[7]:
521 | 
522 | 
523 | def check_state(model):
524 | for val in model.state_dict().keys():
525 | if val[-4:] =='bias':
526 | pass
527 | else:
528 | print(f'{val} : {model.state_dict()[val].shape}')
529 | 
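# The loop above writes one csv per weight file into ./test/, but the final
# blending of those files is not part of this script. A minimal sketch of that
# step, assuming a plain arithmetic mean over every csv in ./test/ (the actual
# blend used for the competition may have weighted the files differently):
def average_submissions(csv_dir='test', out_path='ensemble_submission.csv'):
    csv_files = [f for f in sorted(os.listdir(csv_dir)) if f.endswith('.csv')]
    frames = [pd.read_csv(os.path.join(csv_dir, f), index_col='id') for f in csv_files]
    blended = sum(frames) / len(frames)  # element-wise mean of layer_1..layer_4
    blended.to_csv(out_path, index_label='id')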
--------------------------------------------------------------------------------
/ch03/submission.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 3.5. Methods for Improving Performance"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "import numpy as np\n",
18 | "import pandas as pd\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "import seaborn as sns"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "sub_dir = \"C:/dacon/ch03/submission/\"\n",
30 | "os.chdir(sub_dir)"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "### 3.5.1. Ensembling"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "Rename the five submission files to the pattern 'model number_model used=public score.csv' before continuing. For example, if the public (provisional) score of 'model1_lgbm.csv' is 2.29, rename the file to 'model1_lgbm=2.29.csv'."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "sub_list = [] # list for the files in the working directory whose extension is .csv\n",
54 | "fname_list = [] # list for the file names with their extensions stripped\n",
55 | "\n",
56 | "for filename in os.listdir(): # list every file in the working directory\n",
57 | " fname, ext = os.path.splitext(filename) # split filename into a name and an extension\n",
58 | " if ext == '.csv': # if the file's extension is .csv\n",
59 | " sub_list.append(filename) # append filename to sub_list\n",
60 | " fname_list.append(fname) # append the bare name to fname_list"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "#### 3.5.1.1. Checking the Correlation between Predictions"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/html": [
78 | "<div>[HTML rendering of the correlation matrix; identical to the text/plain output below]</div>"
147 | ],
148 | "text/plain": [
149 | " model1_lgbm=2.29.csv model2_rf=2.34.csv \\\n",
150 | "model1_lgbm=2.29.csv 1.000000 0.977246 \n",
151 | "model2_rf=2.34.csv 0.977246 1.000000 \n",
152 | "model3_rf=2.38.csv 0.972529 0.995776 \n",
153 | "model4_rf=2.36.csv 0.973676 0.992885 \n",
154 | "model5_rf=2.31.csv 0.981939 0.990955 \n",
155 | "\n",
156 | " model3_rf=2.38.csv model4_rf=2.36.csv \\\n",
157 | "model1_lgbm=2.29.csv 0.972529 0.973676 \n",
158 | "model2_rf=2.34.csv 0.995776 0.992885 \n",
159 | "model3_rf=2.38.csv 1.000000 0.995266 \n",
160 | "model4_rf=2.36.csv 0.995266 1.000000 \n",
161 | "model5_rf=2.31.csv 0.986214 0.983549 \n",
162 | "\n",
163 | " model5_rf=2.31.csv \n",
164 | "model1_lgbm=2.29.csv 0.981939 \n",
165 | "model2_rf=2.34.csv 0.990955 \n",
166 | "model3_rf=2.38.csv 0.986214 \n",
167 | "model4_rf=2.36.csv 0.983549 \n",
168 | "model5_rf=2.31.csv 1.000000 "
169 | ]
170 | },
171 | "execution_count": 4,
172 | "metadata": {},
173 | "output_type": "execute_result"
174 | }
175 | ],
176 | "source": [
177 | "# dataframe that will hold the correlation matrix\n",
178 | "corr_df = pd.DataFrame()\n",
179 | "\n",
180 | "for file in sub_list:\n",
181 | " # store each submission file's 18~20_ride column in sub_df\n",
182 | " sub_df = pd.read_csv(file, engine = 'python').iloc[:,1:]\n",
183 | " # use the file name as the column name\n",
184 | " sub_df.columns = [str(file)]\n",
185 | " # merge sub_df into corr_df\n",
186 | " corr_df = pd.concat([corr_df, sub_df], axis = 1)\n",
187 | "\n",
188 | "# print the correlation matrix\n",
189 | "corr_df.corr()"
190 | ]
191 | },
" 147 | ], 148 | "text/plain": [ 149 | " model1_lgbm=2.29.csv model2_rf=2.34.csv \\\n", 150 | "model1_lgbm=2.29.csv 1.000000 0.977246 \n", 151 | "model2_rf=2.34.csv 0.977246 1.000000 \n", 152 | "model3_rf=2.38.csv 0.972529 0.995776 \n", 153 | "model4_rf=2.36.csv 0.973676 0.992885 \n", 154 | "model5_rf=2.31.csv 0.981939 0.990955 \n", 155 | "\n", 156 | " model3_rf=2.38.csv model4_rf=2.36.csv \\\n", 157 | "model1_lgbm=2.29.csv 0.972529 0.973676 \n", 158 | "model2_rf=2.34.csv 0.995776 0.992885 \n", 159 | "model3_rf=2.38.csv 1.000000 0.995266 \n", 160 | "model4_rf=2.36.csv 0.995266 1.000000 \n", 161 | "model5_rf=2.31.csv 0.986214 0.983549 \n", 162 | "\n", 163 | " model5_rf=2.31.csv \n", 164 | "model1_lgbm=2.29.csv 0.981939 \n", 165 | "model2_rf=2.34.csv 0.990955 \n", 166 | "model3_rf=2.38.csv 0.986214 \n", 167 | "model4_rf=2.36.csv 0.983549 \n", 168 | "model5_rf=2.31.csv 1.000000 " 169 | ] 170 | }, 171 | "execution_count": 4, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "# 상관계수 행렬을 저장할 데이터프레임 생성\n", 178 | "corr_df = pd.DataFrame()\n", 179 | "\n", 180 | "for file in sub_list:\n", 181 | " # 각 submission 파일의 18~20_ride 변수를 sub_df에 저장\n", 182 | " sub_df = pd.read_csv(file, engine = 'python').iloc[:,1:]\n", 183 | " # 변수명을 파일의 이름으로 지정\n", 184 | " sub_df.columns = [str(file)]\n", 185 | " # sub_df를 corr_df에 병합\n", 186 | " corr_df = pd.concat([corr_df, sub_df], axis = 1)\n", 187 | "\n", 188 | "# 상관계수 행렬 출력 \n", 189 | "corr_df.corr()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 5, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/html": [ 200 | "
\n", 201 | "\n", 214 | "\n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | "
modelpublic_rmsecor
0model1_lgbm2.290.981078
1model2_rf2.340.991372
2model3_rf2.380.989957
3model4_rf2.360.989075
4model5_rf2.310.988531
\n", 256 | "
" 257 | ], 258 | "text/plain": [ 259 | " model public_rmse cor\n", 260 | "0 model1_lgbm 2.29 0.981078\n", 261 | "1 model2_rf 2.34 0.991372\n", 262 | "2 model3_rf 2.38 0.989957\n", 263 | "3 model4_rf 2.36 0.989075\n", 264 | "4 model5_rf 2.31 0.988531" 265 | ] 266 | }, 267 | "execution_count": 5, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "corr_list = np.array(corr_df.corr().mean(axis = 0)) \n", 274 | "\n", 275 | "model_list = [] # 모델명을 담을 리스트 생성\n", 276 | "public_rmse_list = [] # 각 submission 파일의 임시 스코어를 담을 리스트 생성\n", 277 | "\n", 278 | "for fname in fname_list:\n", 279 | " model = fname.split('=')[0] # '=' 기호를 기준으로 모델명을 분리\n", 280 | " model_list.append(model) # 모델명을 model_list에 추가\n", 281 | "\n", 282 | " score = fname.split('=')[-1] # '=' 기호를 기준으로 임시 스코어를 분리\n", 283 | " score = float(score) # 실수형으로 변환\n", 284 | " public_rmse_list.append(score) # 임시 스코어를 public_rmse_list에 추가\n", 285 | " \n", 286 | "# model_list, public_rmse_list, corr_list를 변수로 하는 데이터프레임 생성\n", 287 | "score_df = pd.DataFrame({'model': model_list, 'public_rmse': public_rmse_list,\n", 288 | " 'cor': corr_list})\n", 289 | "score_df" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 6, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnUAAAE9CAYAAABtFJTIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAuHklEQVR4nO3de3hV5Zn///fNUSAY0khVbAXqVBFt0TYqI04AxeNQoNBRO0SkTtWZ+hWoQxnijLZMi0M66Ggvtdra1raoDNoDauWrSA0iaks8IFXoWCotZWR+ms5XQRAMeX5/ZIuEYwLZO8ni/bquXOy91vOsfa+bfZEP67B3pJSQJElS+9ahtQuQJEnSgTPUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGdGrtAlrSYYcdlvr169faZWTWO++8Q48ePVq7jMyzz4VjrwvHXheOvS6cA+31c88992ZKqXdL1ZOpUNevXz9qampau4zMqq6uZtiwYa1dRubZ58Kx14VjrwvHXhfOgfY6Iv7QctV4+lWSJCkTDHWSJEkZYKiTJEnKAEOdJElSBhjqJEnSXtXU1BARTJw4cZ9j+/XrR1FREQC/+c1vOP744znkkEPo1asXf/3Xf826deua9dr19fVccsklFBUVERE8/vjj+7MLBwVDnSRJyosOHTpw8cUX853vfIexY8fyyCOPMGPGjCbPr6urY+XKlcyZM4djjz2We+65h0984hN5rLh9M9RJkpQxa9asISIoLy/n/PPPp2fPnlRVVXHjjTdSXFzMSSedxJo1a1i7di1jxoyhpKSEPn36MGXKFLZs2QLAokWL6N+/P3379mXu3LmNtr9u3TrGjRu3fd706dOpr6/fpY6BAwdSWVnJeeedx+mnnw40BL09ufvuu4kILrroIk444QQuvPBCTjzxRABeeOEFxo8fz+bNm1uqTZljqJMkKaOeeeYZzj77bEpLS6msrGTBggVMnDiR5cuXc/PNNzN+/Hgeeughpk2bxrnnnsstt9zCzJkz2bJlCxUVFdTW1jJt2jSWLVvWaLsVFRUsXLiQyZMnM2rUKKqqqrj99tt3W8MjjzzC4YcfzuWXX84JJ5zQpCN1jz76KFdeeSUTJkxg5syZAJSXl3PffffRu3eLfVZv5uQ11EXERyPiiYhYGREvR8Tk3YwZHREvRcSLEVETEWfssO7LuXm/iYj7IuKQfNYrSVKWnHbaaVxzzTUMGTKElBKVlZVMmjQJgBUrVrBkyRIGDx5MZWUld9xxBx06dGDBggWsWrWK9evXM3r0aK666iquv/767dvcuHEjixcvZsOGDcyYMYM777wTgMcee2y3NQwZMoQFCxYwefJkXn755e3j9+ayyy5j0qRJjBkzhnPOOQeA/v37c/HFF/ttGXuR7yN1dcA/ppSOBwYDV0XEwJ3GLAIGpZROAi4D7gKIiKOASUBZSulEoCNwcZ7rlSSp3dm0tY61f97Epq11jZb36tULgM6dOwNQXFxMx44dG42JiH1uP6W0y7JBgwaxcOHC7T/XXXfdbuf27t2b8847jxtvvJEOHTowb968fb5enz599jlGu8rr14SllF4HXs893hARK4GjgFd2GLNxhyk9gB3fOZ2AbhHxHtAd+O981itJUnuz8vW3+frDr/D06lpOP6aUCf12vbZtd7p160Z5eTlLly5l1qxZvPrqq9TX13PBBRcwYMAAjjjiCObPn89tt93WKIgVFRUxdOhQnnzySZYsWcJRRx3FU089xYABAzjllFMavca//du/8fbbb3Pcccfxy1/+kvr6egYO3PnYjlpKwa6pi4h+wMnAr3az7rMRsQr4BQ1H60gprQNmA3+kIRi+lVLa/bFdSZIOQpu21m0PdABPr67l9bc2s3mnI3Z7MmfOHEaOHMmsWbN45JFHmDRpEtdeey1du3Zlzpw5lJaWcsMNN3DqqafuMm/s2LHceuutTJ06ldWrV+8yBhqO0t17771cccUVPProo3z+85/n1ltvPfAd127F7g6ptviLRBQBi4GZKaWf7mVcOXB9SmlERJQAPwEuAv4fcD/wQEppzk5zrgCuADj88MM/vfMdOmo5Gzdu3P7ZQ8of+1w49rpw7HV+vLetnlXrNzRadng3KCnuSeeObfdeyLq6Ot55551Gyzp27Nju3iMH+r4ePnz4cymlspaqJ++hLiI6Aw8Dj6aUbmrC+NeAU4DhwHkp
pb/LLZ8ADE4pfWlPc8vKylJNTU3LFK5dVFdXM2zYsNYuI/Psc+HY68Kx1/mxaWsdX/xhzfYjdQBfLYOLRp1L9y55vcLqgFRXVzN8+PBGy4YOHUp1dXXrFLSfDvR9HREtGury+jceDVdffg9YuadAFxF/AaxOKaWI+BTQBail4bTr4IjoDmwGzgJMbJIk5XTv0onrRg7cfgp2yDGlHFm8qU0HOvjgJosdlZSUtFI12ZHvv/UhwCXAioh4MbfsWuBogJTSHcA4YELuZojNwEWp4fDhryLiAeB5Gu6ifQH4Tp7rlSSpXTn+yEO569Iyat/ZSmmPLvz66adau6R9KikpYcSIEa1dRubk++7Xp4C93iudUqoCqvaw7qvAV/NQmiRJmdG9S6c2f3RO+dd2r6KUJElSkxnqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJElqZTU1NUQEEydO3OfYfv36UVRU1GjZqlWr6Nq1KxHB4sWLm/Xa9fX1XHLJJRQVFRERPP74482ar7bDUCdJUjuWUuLyyy+nU6dOzZ5bV1fHypUrmTNnDsceeyz33HMPn/jEJ/JQpQrBUCdJUjOtWbOGiKC8vJzzzz+fnj17UlVVxY033khxcTEnnXQSa9asYe3atYwZM4aSkhL69OnDlClT2LJlCwCLFi2if//+9O3bl7lz5zba/rp16xg3btz2edOnT6e+vn63tXz7299mzZo1XHnllfus++677yYiuOiiizjhhBO48MILOfHEEwF44YUXGD9+PJs3bz7A7qi1GOokSdpPzzzzDGeffTalpaVUVlayYMECJk6cyPLly7n55psZP348Dz30ENOmTePcc8/llltuYebMmWzZsoWKigpqa2uZNm0ay5Yta7TdiooKFi5cyOTJkxk1ahRVVVXcfvvtu7z+unXrqKys5Nvf/jaHHnpok+t+9NFHufLKK5kwYQIzZ84EoLy8nPvuu4/evXsfWFPUavIa6iLioxHxRESsjIiXI2LybsaMjoiXIuLFiKiJiDN2WNcrIh6IiFW5bfxlPuuVJKk5TjvtNK655hqGDBlCSonKykomTZoEwIoVK1iyZAmDBw+msrKSO+64gw4dOrBgwQJWrVrF+vXrGT16NFdddRXXX3/99m1u3LiRxYsXs2HDBmbMmMGdd94JwGOPPbbL60+fPp2ysjIGDBjAn//8ZwD+/Oc/s3Hjxr3WfdlllzFp0iTGjBnDOeecA0D//v25+OKL6dGjR4v0RoXX/BPwzVMH/GNK6fmI6Ak8FxELU0qv7DBmEfBgSilFxCeBecCA3LpbgP+bUvpcRHQBuue5XkmSdrFpax21G7dSWtSF7l0++NXZq1cvADp37gxAcXExHTt2bDQ3Iva5/ZTSLssGDRrE7Nmztz8vLi7eZczatWtZvHgxH//4x7cv+9a3vsUpp5xCRUXFHl+vT58++6xJ7U9eQ11K6XXg9dzjDRGxEjgKeGWHMTv+d6IHkAAi4lCgHJiYG7cV2JrPeiVJ2tnK19/m6w+/wtOrazn9mFKuGzmQbk2Y161bN8rLy1m6dCmzZs3i1Vdfpb6+ngsuuIABAwZwxBFHMH/+fG677TbmzZu3fV5RURFDhw7lySefZMmSJRx11FE89dRTDBgwgFNOOaXRa8yYMYM33ngDgHnz5nH//fdz4YUXUl5e3pItUDtRsGvqIqIfcDLwq92s+2xErAJ+AVyWW/wx4A3gBxHxQkTcFREeE5YkFcymrXXbAx3A06tr+cbDr7B5a12T5s+ZM4eRI0cya9YsHnnkESZNmsS1115L165dmTNnDqWlpdxwww2ceuqpu8wbO3Yst956K1OnTmX16tW7jAEYOnQon/vc5/jc5z7HwIEDARg4cCBHH330Ae652qPY3SHfFn+RiCJgMTAzpfTTvYwrB65PKY2IiDLgWWBISulXEXEL8HZK6bqd5lwBXAFw+OGHf3rnO4jUcjZu3LjLZyOp5dnnwrHXhdNee/3etnpWrd+wy/IBR/Skc8e2ea/hxo0bOeSQQ3jnnXcaLe/YsWO7/Dtoyw70fT18+PDnUkplLVVP3kNdRHQGHgYeTSnd1ITxrwGn0HBq+NmUUr/c8r8CpqeU/npPc8vKylJNTU2L1K1dVVdXM2zYsNYuI/Psc+HY68Jpr73etLWOL/6wZvuROoAhx5Ty3UvLGl1b15ZUV1cDMHz48EbLhw4dun2dWsaBvq8jokVDXV7fkdFwdej3gJV7CnQR8RfA6tyNEp8CugC1uedrI+K4lNJvgbPY4Vo8SZLyrXuXTlw3cuD2U7BDjinlX0YObLOB7n2DBg1i4cKFjZaVlJS0UjUqlHy/K4cAlwArIuLF3LJrgaMBUkp3AOOACRHxHrAZuCh9cPjwauCe3J2vvwe+kOd6JUlq5PgjD+WuS8uofWcrpT26tPlABw0BbsSIEa1dhgos33e/PgXs9V7ulFIVULWHdS8CLXZYUpKk/dG9S6d2EeZ0cGubV3lKkiSpWQx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJyoAmh7poUBER1+eeHx0Rp+avNEmSJDVVc47U3Q78JfD53PMNwG0tXpEkSZKarVMzxp6WUvpURLwAkFL634jokqe6JEmS1AzNOVL3XkR0BBJARPQG6vNSlSRJkpqlOaHuW8DPgA9HxEzgKeCGvFQlSZKkZmny6deU0j0R8RxwFhDAmJTSyrxVJkmSpCZrzt2vxwCvpZRuA34DnB0RvfJVmCRJkpquOadffwJsi4i/AO4C+gP35qUqSZKkNqKmpoaIYOLEifsc269fP4qKigB48MEH+dSnPkXPnj057LDDuOyyy9i8eXOzXjsiOkTEjyNiY0SkiBixp7HNCXX1KaU6YCxwS0rpy8CRzapMkiTpILF8+XIGDhzITTfdxKc//Wl+8IMf8M1vfrPJ8yOiE3A8UAH8FzAeWLGn8c29+/XzwATg4dyyzs2YL0mSlHdr1qwhIigvL+f888+
nZ8+eVFVVceONN1JcXMxJJ53EmjVrWLt2LWPGjKGkpIQ+ffowZcoUtmzZAsCiRYvo378/ffv2Ze7cuY22v27dOsaNG8dnPvMZ+vTpw/Tp06mv3/UDQf7pn/6JOXPmcPnllzN79mwAXn755T3WHRETc0fj/jMiXgbm0XDJG8DJwD1Atz3Nb06o+wINHz48M6X0WkT0B+Y0Y74kSVLBPPPMM5x99tmUlpZSWVnJggULmDhxIsuXL+fmm29m/PjxPPTQQ0ybNo1zzz2XW265hZkzZ7JlyxYqKiqora1l2rRpLFu2rNF2KyoqWLhwIePGjWPUqFFUVVVx++237/L6Xbp88HG+jz76KADl5eVNKf1c4E7gR8A/55Y9ScMXQLyxx1kppbz9AB8FngBWAi8Dk3czZjTwEvAiUAOcsdP6jsALwMP7er1Pf/rTSfnzxBNPtHYJBwX7XDj2unDsdeEcjL1+Z8t76Y+176R3tryXUkrptddeS0AaMmRISimlv/3bv01Aevzxx9Pvfve7BKQzzzwzAen0009PKaX07rvvpg4dOqSysrL04osvJiBVVFSklFJ6/PHHE5AuvfTStGHDhhQRiYbP7d3+85nPfCallFLfvn1Tjx49GtX3wAMPpM6dO6cLLrgg1dXVbV8O1KTGmWdibns37bCsLLfs7rSPHNTkjzSJiJHA14G+NHwUSjRkwnToXqbVAf+YUno+InoCz0XEwpTSKzuMWQQ8mFJKEfFJGg41Dthh/eRcKNzb60iSpIPQytff5usPv8LTq2s5/ZhSrhs5cPv5yV69egHQuXPD1WLFxcV07Nix0fyI2Odr5MJVI4MGDeKSSy5h0KBB27e9O//5n/9JRUUFZ555Jj/5yU92ef09+O+mDNpZc06/3gxcCpSmlA5NKfXcR6AjpfR6Sun53OMNNISzo3YaszF90K0e5L6xAiAiPgL8NQ1320qSJG23aWvd9kAH8PTqWr7x8Cts3lq3z7ndunWjvLycZ599llmzZvGlL32J+vp6LrjgAgYMGMARRxzB/Pnzue222/j617++fV5RURFDhw5lxYoVvPTSS/z+97/nRz/6EQsXLtzlNX7xi18wfvx4evXqxec//3l+/vOf88tf/rLlGrCT5nz361rgN2l3cbUJIqIfDRf5/Wo36z4L/BvwYRpC3PtuBqYBPfey3SuAKwAOP/xwqqur96c8NcHGjRvtbwHY58Kx14VjrwvnYOr1e9vq+cvuG/jLT+y49H+oWfZ7AGpra6murmb9+vUAPPfcc9uPqNXW1vLlL3+Zbdu28Y1vfIOuXbsyduxYhgwZwjPPPMPUqVOZPXs2X/va1zjrrLMAWL9+PdXV1XzpS18ipcTPfvYzHnjgAfr378+gQYOorq7m3XffZdu2bVRXV3P//fezbds23nzzTb7whS8ADUf4br755rz0I5qa0SLiFBpOvy4Gtry/PKV0UxPmFuXmzUwp/XQv48qB61NKI3Kney9IKX0pIoYBU1NKI/f2OmVlZammpqYpu6P9UF1dzbBhw1q7jMyzz4VjrwvHXhfOwdTrTVvr+OIPa7YfqQMYckwp3720jO5dmnPcav/sb6/fe+893nrrLXr37r0ceP9z595LKb11IPU05/TrTGATcAgNR87e/9mriOhMwwcX37O3QAeQUnoSOCYiDgOGAKMiYg0wFzgzIrzbVpIkAdC9SyeuGzmQ048pBRoC3b+MHFiQQHcgli5dSu/evQEG0XA36xvA/APdbnP2+kMppXOas/FouPrwe8DKPR3Ry31DxercjRKfAroAtSmlSqAyN2YYDUfqKprz+pIkKduOP/JQ7rq0jNp3tlLao0ubD3TQcAp24cKFnH322f8FXJVb/L8Hut3m7PnjEXFOSumxZswZAlwCrIiIF3PLrgWOBkgp3QGMAyZExHvAZuCi/b1uT5IkHXy6d+nULsLc+0pKShgxYgTAhpTS4y213eZ04CpgWkRsAd6jCR9pklJ6Kjduj1JKVUDVPsZUA9XNqFWSJOmg0qRQFxEdgPNSSkvzXI8kSZL2Q5NulEgp1QOz81yLJEmS9lNz7n59LCLGRVM+elmSJEkF1Zxr6q6h4Rsf6iLiXZr2NWGSJEkqgCaHupTSXj+TLiJOSCm9fOAlSZIkqbmac/p1X37cgtuSJElSM7RkqPNaO0mSpFbSkqHODwyWJElqJS0Z6iRJktRKWjLUbW3BbUmSJKkZmhzqIuKzEVG8w/NeETHm/ecppcEtXJskSZKaqDlH6r6aUnrr/Scppf8HfLXFK5IkSVKzNSfU7W5scz68WJIkSXnSnFBXExE3RcQxEfGxiPgP4Ll8FSZJkqSma06ou5qGmyH+E7gfeBe4Kh9FSZIkqXma8zVh7wDT81iLJEmS9tM+Q11E3JxSmhIRD7GbDxhOKY3KS2WSJElqsqYcqXv/O11n57MQSZIk7b99hrqU0nO5PxfnvxxJkiTtj6acfl3B7r/XNYCUUvpki1clSZKkZmnK6deRea9CkiRJB6Qpp1//8P7jiDgCOJWGI3fLUkrr81ibJEmSmqg53/36ReDXwFjgc8CzEXFZvgqTJElS0zXna76+ApycUqoFiIhS4Gng+/koTJIkSU3XnG+U+BOwYYfnG4C1LVuOJEmS9kdT7n69JvdwHfCriJhPwzV1o2k4HStJkqRW1pTTrz1zf67O/bxvfsuXI0mSpP3RlLtfZxSiEEmSJO2/Jt8oERFPsPvvfj2zRSuSJElSszXn7tepOzw+BBgH1LVsOZIkSdofTQ51738H7A6WRoTfBytJktQGNOf064d2eNoBKAOOaPGKJEmS1GzNOf36HB9cU1cHrAH+rqULkiRJUvM1J9QNBL4EnEFDuFsC1OSjKEmSJDVPc0LdD4G3gW/lnn8e+DHwNy1dlCRJkpqnOaHuuJTSoB2ePxERy1u6IEmSJDVfc7779YWIGPz+k4g4DVja8iVJkiSpuZpzpO40YEJE/DH3/GhgZUSsAFJK6ZMtXp0kSZKapDmh7ry8VSFJkqQD0pwPH/5DPguRJEnS/mvONXWSJElqowx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjIgr6EuIj4aEU9ExMqIeDkiJu9mzOiIeCkiXoyImog4o6lzJUmS1KA5n1O3P+qAf0wpPR8RPYHnImJhSumVHcYsAh5MKaWI+CQwDxjQxLmSJEkiz0fqUkqvp5Sezz3eAKwEjtppzMaUUso97QGkps6VJElSg4JdUxcR/YCTgV/tZt1nI2IV8AvgsubMlSRJEsQHB8ny+CIRRcBiYGZK6ad7GVcOXJ9SGtHUuRFxBXAFwOGHH/7puXPntnT5ytm4cSNFRUWtXUbm2efCsdeFY68Lx14XzoH2evjw4c+llMpaqp68h7qI6Aw8DDyaUrqpCeNfA05JKb3Z3LllZWWppqbmgGvW7lVXVzNs2LDWLiPz7HPh2OvCsdeFY68L50B7HREtGuryffdrAN8DVu4plEXEX+TGERGfAroAtU2ZK0mSpAb5vvt1CHAJsCIiXswtuxY4GiCldAcwDpgQEe8Bm4
GLcnfCnrG7uSmlR/JcsyRJUruT11CXUnoKiH2MqQKq9meuJEmSGviNEpIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZK0FzU1NUQEEydO3OfYfv36UVRUBMCaNWuIiEY/U6ZMadZr19fXc8kll1BUVERE8Pjjj+/HHuhg0am1C5AkKcv+/u//nqFDhwJw3HHHNXleXV0dv/3tb5kzZw4nn3wyU6dO5ROf+ES+ylQGeKROkpQp7x8hKy8v5/zzz6dnz55UVVVx4403UlxczEknncSaNWtYu3YtY8aMoaSkhD59+jBlyhS2bNkCwKJFi+jfvz99+/Zl7ty5jba/bt06xo0bt33e9OnTqa+v32M9ZWVljBo1iosvvpiTTz55j+PuvvtuIoKLLrqIE044gQsvvJATTzwRgBdeeIHx48ezefPmFuiQsspQJ0nKpGeeeYazzz6b0tJSKisrWbBgARMnTmT58uXcfPPNjB8/noceeohp06Zx7rnncssttzBz5ky2bNlCRUUFtbW1TJs2jWXLljXabkVFBQsXLmTy5MmMGjWKqqoqbr/99j3Wcfnll9OjRw9OOOEEnn322X3W/eijj3LllVcyYcIEZs6cCUB5eTn33XcfvXv3PrCmKNMMdZKkTDrttNO45pprGDJkCCklKisrmTRpEgArVqxgyZIlDB48mMrKSu644w46dOjAggULWLVqFevXr2f06NFcddVVXH/99du3uXHjRhYvXsyGDRuYMWMGd955JwCPPfbYLq/fo0cPZsyYwc9//nNmz57Nf/3XfzF+/Ph91n3ZZZcxadIkxowZwznnnANA//79ufjii+nRo0dLtEYZ5TV1kqR2bdPWOmo3bqW0qAvdu3zwa61Xr14AdO7cGYDi4mI6duzYaG5E7HP7KaVdlg0aNIjZs2dvf15cXLzLmN69ezcKhPfeey/PP/887777LocccsgeX69Pnz77rEnaHUOdJKndWvn623z94Vd4enUtpx9TynUjB9KtCfO6detGeXk5S5cuZdasWbz66qvU19dzwQUXMGDAAI444gjmz5/Pbbfdxrx587bPKyoqYujQoTz55JMsWbKEo446iqeeeooBAwZwyimnNHqN7373uyxbtozTTjuN1157jRdffJFBgwbtNdBJB8LTr5KkdmnT1rrtgQ7g6dW1fOPhV9i8ta5J8+fMmcPIkSOZNWsWjzzyCJMmTeLaa6+la9euzJkzh9LSUm644QZOPfXUXeaNHTuWW2+9lalTp7J69epdxgAce+yxvPTSS0yZMoXbbruN8847j/vvv//Ad1zag9jdYeX2qqysLNXU1LR2GZlVXV3NsGHDWruMzLPPhWOvCycfvV7750381Tef2GX5kn8azkdLurfoa7Wk9957j7feeqvRss6dO+/2FO7+8H1dOAfa64h4LqVU1lL1eKROktQulRZ14fRjShstG3JMKaU9urRSRU2zdOlSevfu3ehn9OjRrV2WMsBr6iRJ7VL3Lp24buTA7adghxxTyr+MHNjoZom2aNCgQSxcuLDRspKSklaqRlnStt/5kiTtxfFHHspdl5ZR+85WSnt0afOBDhoC3IgRI1q7DGVQ23/3S5K0F927dGoXYU7KN6+pkyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgbkNdRFxEcj4omIWBkRL0fE5N2MGR0RL0XEixFRExFn7LDuvIj4bUT8LiKm57NWSZKk9qxTnrdfB/xjSun5iOgJPBcRC1NKr+wwZhHwYEopRcQngXnAgIjoCNwGnA38CVgWEQ/uNFeSJEnk+UhdSun1lNLzuccbgJXAUTuN2ZhSSrmnPYD3H58K/C6l9PuU0lZgLjA6n/VKkiS1VwW7pi4i+gEnA7/azbrPRsQq4BfAZbnFRwFrdxj2J3YKhJIkSWoQHxwky+OLRBQBi4GZKaWf7mVcOXB9SmlERPwNcG5K6Yu5dZcAp6aUrt5pzhXAFQCHH374p+fOnZuv3Tjobdy4kaKiotYuI/Psc+HY68Kx14VjrwvnQHs9fPjw51JKZS1VT76vqSMiOgM/Ae7ZW6ADSCk9GRHHRMRhNByZ++gOqz8C/Pdu5nwH+A5AWVlZGjZsWEuVrp1UV1djf/PPPheOvS4ce1049rpw2lqv8333awDfA1amlG7aw5i/yI0jIj4FdAFqgWXAxyOif0R0AS4GHsxnvZIkSe1Vvo/UDQEuAVZExIu5ZdcCRwOklO4AxgETIuI9YDNwUe7GibqI+D/Ao0BH4PsppZfzXK8kSVK7lNdQl1J6Coh9jKkCqvaw7hHgkTyUJkmSlCl+o4QkSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqMu4mpoaIoKJEyfuc2y/fv0oKioCYPPmzZx11lkUFRUREcyePXuvc+++++4mjQOYOHEiEUFNTU2T9kGSJO2boU67tW3bNj70oQ9x3nnntXYpkiSpCQx1bcyaNWuICMrLyzn//PPp2bMnVVVV3HjjjRQXF3PSSSexZs0a1q5dy5gxYygpKaFPnz5MmTKFLVu2ALBo0SL69+9P3759mTt3bqPtr1u3jnHjxm2fN336dOrr63epo6ioiPvvv5+RI0fu13788pe/5GMf+xh9+/blK1/5ym6PFs6dO5d+/frRv39/Fi1aBHxwxG/ixIkMHDiQD3/4wyxYsIDx48fTo0cPxowZQ11d3X7VJElSlhnq2qhnnnmGs88+m9LSUiorK1mwYAETJ05k+fLl3HzzzYwfP56HHnqIadOmce6553LLLbcwc+ZMtmzZQkVFBbW1tUybNo1ly5Y12m5FRQULFy5k8uTJjBo1iqqqKm6//fYWrX3Lli2MHz+eN954g6lTp/L000/vdtyvf/1rvvKVr1BbW0tFRcX2UAoNofAf/uEfePPNNxk5ciS9evXijDPOYP78+Tz88MMtWq8kSVlgqGtlm7bWsfbPm9i0tfHRp9NOO41rrrmGIUOGkFKisrKSSZMmAbBixQqWLFnC4MGDqays5I477qBDhw4sWLCAVatWsX79ekaPHs1VV13F9ddfv32bGzduZPHixWzYsIEZM2Zw5513AvDYY4+16D7tWMPVV1/N1772td2O++pXv8pVV13F6NGjWb9+Pb/97W+3r5swYQJXX301Rx55JAD/8R//wYUXXgjAa6+91qL1SpKUBZ1au4CD2crX3+brD7/C06trOf2YUq4bOZBuuXW9evUCoHPnzgAUFxfTsWPHRvMjYp+vkVLaZdmgQYMa3dBQX
Fy8fzuwD02pD3Zf4477361bN7p06bJ9/7dt29ZiNUqSlBWGulayaWvd9kAH8PTqWr7x8Cv8y9DD9jm3W7dulJeXs3TpUmbNmsWrr75KfX09F1xwAQMGDOCII45g/vz53HbbbcybN2/7vKKiIoYOHcqTTz7JkiVLOOqoo3jqqacYMGAAp5xyyi6vc9ddd20/dfrrX/+a//mf/6GsrGz7HbJ7snMN9957727H/eu//iurVq3iwQcf5Mgjj+S4447j+eef3+f+S5KkXXn6tZXUbty6PdC9b+nqWv5383tNmj9nzhxGjhzJrFmzeOSRR5g0aRLXXnstXbt2Zc6cOZSWlnLDDTdw6qmn7jJv7Nix3HrrrUydOpXVq1fvMuZ9l19+OT/4wQ8AuP/++5k9ezZvvvnmPmvr2rUr99xzD6WlpcyaNYvBgwcDHxx9e98ZZ5xBVVUVH/rQh/jxj39M165dm7TvkiRpV7G7U1/tVVlZWWovn322aWsdX/xhTaNgN+SYUr57aRndu7TNA6jV1dUMGTKEt956q9Hyzp0773IKd/78+aSU6NatGzfddBOPPfYYDz300H7fTXswqa6uZtiwYa1dxkHBXheOvS4ce104B9rriHgupVTWUvV4pK6VdO/SietGDuT0Y0qBhkD3LyMHttlA976lS5fSu3fvRj+jR4/eZdwf//hHLr/8ckaPHs0f/vAHbr31VgOdJEl51LYTRMYdf+Sh3HVpGbXvbKW0R5c2H+ig4SaLhQsXNlpWUlKyy7irr76aq6++ulBlSZJ00Gv7KSLjunfp1C7C3PtKSkoYMWJEa5chSZJ24ulXSZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDIiUUmvX0GIi4g3gD61dR4YdBrzZ2kUcBOxz4djrwrHXhWOvC+dAe903pdS7pYrJVKhTfkVETUqprLXryDr7XDj2unDsdeHY68Jpa7329KskSVIGGOokSZIywFCn5vhOaxdwkLDPhWOvC8deF469Lpw21WuvqZMkScoAj9RJkiRlgKHuIBUR50XEbyPidxExfTfrSyLiZxHxUkT8OiJO3GHdlyPi5Yj4TUTcFxGH5Jb/e0Ssys35WUT0KuAutVn56PUO66dGRIqIwwqxL21dvnodEVfntvtyRHyzUPvTVuXp34+TIuLZiHgxImoi4tRC7lNbdYC9npzr88sRMWWH5R+KiIUR8Wruz5IC7U6blqdeF/b3YkrJn4PsB+gIrAY+BnQBlgMDdxrz78BXc48HAItyj48CXgO65Z7PAybmHp8DdMo9rgKqWntfW/snX73OPf8o8CgNn814WGvva2v/5PF9PRx4HOiae/7h1t7XjPb5MeD83OMLgOrW3tfW/jnAXp8I/AboDnTKvYc/nlv3TWB67vF0/63Oa68L+nvRI3UHp1OB36WUfp9S2grMBUbvNGYgsAggpbQK6BcRh+fWdQK6RUQnGt7E/50b91hKqS435lngI/ndjXYhL73O+Q9gGuCFsQ3y1et/AGallLbk5v1/+d2NNi9ffU7AobnHxTR+rx+sDqTXxwPPppQ25f5dXgx8NjdnNPDD3OMfAmPyuhftQ156Xejfi4a6g9NRwNodnv8pt2xHy4GxALnTIH2Bj6SU1gGzgT8CrwNvpZQe281rXAYsaOG626O89DoiRgHrUkrL81t+u5Kv9/WxwF9FxK8iYnFEnJLHfWgP8tXnKcC/R8Ta3JjKfO1AO7LfvabhyFF5RJRGRHcajn5+NDfn8JTS6wC5Pz+ctz1oP/LV6x3l/feioe7gFLtZtvPRnllASUS8CFwNvADU5a69GA30B/oAPSKiotHGI/4ZqAPuaeG626MW73XuH41/Bq7PW9XtU77e152AEmAw8BVgXkTs7rUOFvnq8z8AX04pfRT4MvC9PNTe3ux3r1NKK2k43bcQ+L80BJI6tCd57XWhfi92yufG1Wb9icb/i/gIO53qSCm9DXwBIPcL7LXcz7nAaymlN3LrfgqcDszJPb8UGAmclXIXERzk8tHr5TT8UlyeyxYfAZ6PiFNTSuvzujdtW77e138Cfpp7P/86Iupp+L7HN/K6N21Xvvp8KTA5t4n7gbvytwvtxoH0mpTS98iF44i4Ibc9gP+JiCNTSq9HxJHAwX5JAeSv1wX9veiRuoPTMuDjEdE/IroAFwMP7jggInrl1gF8EXgy94b+IzA4Irrn3tRnAStzc84D/gkYlVLaVKB9aetavNcppRUppQ+nlPqllPrR8I/Hpw7yQAd5el8DPwfOzM0/loaLqA/mL0vPV5//Gxiae3wm8Gqe96M9OJBeExEfzv15NA2nDe/LjXuQhhBN7s/5ed2L9iEvvS7070WP1B2EUkp1EfF/aLhzsiPw/ZTSyxHx97n1d9Bw4eePImIb8Arwd7l1v4qIB4DnaTiU/AIffKL2rUBXYGHuCNKzKaW/L9yetT157LV2ksdefx/4fkT8BtgKXHowH4XOY58vB26Jhhso3gWuKOButUkH0uucn0REKfAecFVK6X9zy2fRcBnB39EQtP+mMHvUduWx1wX9veg3SkiSJGWAp18lSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJOkA5D7XDVJanWGOknKiYgJEfFSRCyPiB9HRN+IWJRbtij3afFExN0RcVNEPEHDdz5KUqvzf5iSBETECcA/A0NSSm9GxIeAHwI/Sin9MCIuA74FjMlNORYYkVLa1ioFS9JOPFInSQ3OBB5IKb0JkFL6M/CXwL259T8Gzthh/P0GOkltiaFOkhoEsK/vTdxx/Tt5rEWSms1QJ0kNFgEX5r6Um9zp16eBi3PrxwNPtVJtkrRPXlMnSUBK6eWImAksjohtwAvAJOD7EfEV4A3gC61ZoyTtTaS0r7MNkiRJaus8/SpJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLg/wfnHnHFYbLEdwAAAABJRU5ErkJggg==\n", 300 | "text/plain": [ 301 | "
" 302 | ] 303 | }, 304 | "metadata": { 305 | "needs_background": "light" 306 | }, 307 | "output_type": "display_data" 308 | } 309 | ], 310 | "source": [ 311 | "plt.figure(figsize=(10,5))\n", 312 | "# 점의 사이즈가 30인 산점도 그래프\n", 313 | "graph = sns.scatterplot(x=\"cor\", y=\"public_rmse\", data=score_df, s=30)\n", 314 | "# 각 점에 대한 모델명 표시\n", 315 | "for idx in range(0, score_df.shape[0]):\n", 316 | " graph.text(score_df.cor[idx]+0.00005 , score_df.public_rmse[idx]-0.00003, \n", 317 | " score_df.model[idx], horizontalalignment='left', \n", 318 | " size='medium', color='black', weight='semibold')\n", 319 | "\n", 320 | "# x축 범위 지정\n", 321 | "plt.xlim((score_df.cor.min()-0.001, score_df.cor.max()+0.001))\n", 322 | "# y축 범위 지정\n", 323 | "plt.ylim((score_df.public_rmse.min()-0.005, score_df.public_rmse.max()+0.005))\n", 324 | "plt.grid() # 격자 무늬\n", 325 | "plt.show()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "#### 3.5.1.2. 여러 가지 앙상블 기법" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 7, 338 | "metadata": { 339 | "scrolled": true 340 | }, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "ensemble_model1+model5.csv가 저장되었습니다!\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "ensemble_dir = \"C:/dacon/ch03/submission/Ensemble1/\"\n", 352 | "\n", 353 | "# Ensemble1 폴더가 없다면 Ensemble1 폴더를 생성\n", 354 | "if not os.path.isdir(ensemble_dir):\n", 355 | " os.mkdir(ensemble_dir)\n", 356 | "\n", 357 | "# Ensemble1 폴더로 파일 이동\n", 358 | "import shutil\n", 359 | "shutil.move(sub_dir + 'model1_lgbm=2.29.csv', ensemble_dir + 'model1_lgbm=2.29.csv')\n", 360 | "shutil.move(sub_dir + 'model5_rf=2.31.csv', ensemble_dir + 'model5_rf=2.31.csv')\n", 361 | "\n", 362 | "# model1_lgbm=2.29.csv, model5_rf=2.31.csv 멱 평균\n", 363 | "nf = 0\n", 364 | "for f in os.listdir(ensemble_dir):\n", 365 | " ext = os.path.splitext(f)[-1] # 'Ensemble1' 폴더에 있는 파일의 확장자를 분리\n", 366 | "\n", 367 | " if ext == '.csv':\n", 368 | " sub = pd.read_csv(ensemble_dir + f) # 확장자명이 .csv라면 해당 데이터를 로드\n", 369 | " else:\n", 370 | " continue # 확장자명이 .csv가 아니라면 for문을 종료\n", 371 | "\n", 372 | " if len(sub.columns) !=2:\n", 373 | " continue # 로드한 데이터의 변수의 수가 2개가 아니라면 for문을 종료\n", 374 | "\n", 375 | " # 파일의 확장자명이 .csv이고, 변수의 수가 2개인 경우에만 밑의 조건문이 실행됩니다.\n", 376 | " if nf == 0:\n", 377 | " sub_df = sub # nf가 0이면 해당 데이터(sub)를 sub_df 에 저장\n", 378 | " else:\n", 379 | " sub_df = pd.merge(sub_df, sub, on=\"id\") # nf가 0이 아니면 해당 데이터를 sub_df와 병합\n", 380 | " \n", 381 | " nf += 1\n", 382 | "\n", 383 | "p = 21 # 하이퍼파라미터이므로 최적의 값을 찾아야 합니다.\n", 384 | "\n", 385 | "# 파일의 개수(nf)가 2개 이상인 경우에만 밑의 조건문이 실행됩니다.\n", 386 | "if nf >= 2: \n", 387 | " pred = 0\n", 388 | " \n", 389 | " # 앞서 소개한 멱 평균의 식을 나타낸 코드입니다.\n", 390 | " for j in range(nf):\n", 391 | " pred = pred + sub_df.iloc[:,j+1]**p\n", 392 | " pred = pred / nf \n", 393 | " pred = pred**(1/p)\n", 394 | " \n", 395 | " # 멱 평균 결과를 대입하여 데이터프레임 submit을 생성하고, csv파일로 저장 \n", 396 | " submit = pd.DataFrame({'id': sub_df.id, '18~20_ride': pred})\n", 397 | " fname = \"ensemble_model1+model5.csv\"\n", 398 | " submit.to_csv(fname, index=False)\n", 399 | " \n", 400 | " print(fname + '가 저장되었습니다!')" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 8, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "ensemble_model2+model4.csv가 저장되었습니다!\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "ensemble_dir = 
\"C:/dacon/ch03/submission/Ensemble2/\"\n", 418 | "\n", 419 | "# Ensemble2 폴더가 없다면 Ensemble2 폴더를 생성\n", 420 | "if not os.path.isdir(ensemble_dir):\n", 421 | " os.mkdir(ensemble_dir)\n", 422 | "\n", 423 | "# Ensemble2 폴더로 파일 이동\n", 424 | "import shutil\n", 425 | "shutil.move(sub_dir + 'model2_rf=2.34.csv', ensemble_dir + 'model2_rf=2.34.csv')\n", 426 | "shutil.move(sub_dir + 'model4_rf=2.36.csv', ensemble_dir + 'model4_rf=2.36.csv')\n", 427 | "\n", 428 | "# model2_rf=2.34.csv, model4_rf=2.36.csv 멱 평균\n", 429 | "nf = 0\n", 430 | "for f in os.listdir(ensemble_dir):\n", 431 | " # 'Ensemble2' 폴더에 있는 파일의 확장자를 분리\n", 432 | " ext = os.path.splitext(f)[-1] \n", 433 | "\n", 434 | " if ext == '.csv': \n", 435 | " # 확장자명이 .csv라면 해당 데이터를 로드\n", 436 | " sub = pd.read_csv(ensemble_dir + f) \n", 437 | " else: \n", 438 | " # 확장자명이 .csv가 아니라면 for문을 종료\n", 439 | " continue \n", 440 | "\n", 441 | " if len(sub.columns) !=2:\n", 442 | " # 로드한 데이터의 변수의 수가 2개가 아니라면 for 문을 종료\n", 443 | " continue \n", 444 | "\n", 445 | " # 파일의 확장자명이 .csv이고, 변수의 수가 2개인 경우에만\n", 446 | " # 밑의 조건문이 실행됩니다.\n", 447 | " if nf == 0:\n", 448 | " sub_df = sub # nf가 0이면 해당 데이터(sub)를 sub_df 에 저장\n", 449 | " else: \n", 450 | " # nf가 0이 아니면 해당 데이터를 sub_df와 병합\n", 451 | " sub_df = pd.merge(sub_df, sub, on=\"id\") \n", 452 | " \n", 453 | " nf += 1\n", 454 | "\n", 455 | "p = 21 # 하이퍼파라미터이므로 최적의 값을 찾아야 합니다.\n", 456 | "\n", 457 | "# 파일의 개수(nf)가 2개 이상인 경우에만 밑의 조건문이 실행됩니다.\n", 458 | "if nf >= 2: \n", 459 | " pred = 0\n", 460 | " \n", 461 | " # 앞서 소개한 멱 평균의 식을 나타낸 코드입니다.\n", 462 | " for j in range(nf):\n", 463 | " pred = pred + sub_df.iloc[:,j+1]**p\n", 464 | " pred = pred / nf \n", 465 | " pred = pred**(1/p)\n", 466 | "\n", 467 | " # 멱 평균 결과를 대입하여 데이터프레임 submit을 생성하고, csv 파일로 저장\n", 468 | " submit = pd.DataFrame({'id': sub_df.id, '18~20_ride': pred})\n", 469 | " fname = \"ensemble_model2+model4.csv\"\n", 470 | " submit.to_csv(fname, index=False)\n", 471 | " \n", 472 | " print(fname + '가 저장되었습니다!')" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 9, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "# 앙상블 파일 로드\n", 482 | "first = pd.read_csv('ensemble_model1+model5.csv')\n", 483 | "second = pd.read_csv('ensemble_model2+model4.csv')\n", 484 | "third = pd.read_csv('model3_rf=2.38.csv')\n", 485 | "\n", 486 | "# 가중산술평균\n", 487 | "w_mean = 0.22*first['18~20_ride'] + 0.30*second['18~20_ride'] + 0.48*third['18~20_ride']\n", 488 | "\n", 489 | "# 최종 submission 파일 생성\n", 490 | "sub['18~20_ride'] = w_mean\n", 491 | "sub.to_csv('final_submission.csv', index=False)" 492 | ] 493 | } 494 | ], 495 | "metadata": { 496 | "kernelspec": { 497 | "display_name": "Python 3", 498 | "language": "python", 499 | "name": "python3" 500 | }, 501 | "language_info": { 502 | "codemirror_mode": { 503 | "name": "ipython", 504 | "version": 3 505 | }, 506 | "file_extension": ".py", 507 | "mimetype": "text/x-python", 508 | "name": "python", 509 | "nbconvert_exporter": "python", 510 | "pygments_lexer": "ipython3", 511 | "version": "3.8.5" 512 | }, 513 | "toc": { 514 | "base_numbering": 1, 515 | "nav_menu": {}, 516 | "number_sections": true, 517 | "sideBar": true, 518 | "skip_h1_title": false, 519 | "title_cell": "Table of Contents", 520 | "title_sidebar": "Contents", 521 | "toc_cell": false, 522 | "toc_position": {}, 523 | "toc_section_display": true, 524 | "toc_window_display": false 525 | } 526 | }, 527 | "nbformat": 4, 528 | "nbformat_minor": 2 529 | } 530 | --------------------------------------------------------------------------------