├── .gitignore
├── requirements.txt
├── README.md
├── ch01
│   ├── .gitignore
│   ├── requirements.txt
│   └── README.md
├── ch02
│   ├── .gitignore
│   ├── requirements.txt
│   ├── config.py
│   ├── dataloader_test.py
│   ├── README.md
│   ├── ipynb
│   │   ├── voting.ipynb
│   │   ├── split_file.ipynb
│   │   └── MAE.ipynb
│   ├── src
│   │   ├── utils.py
│   │   ├── model.py
│   │   └── get_score.py
│   ├── test.py
│   └── train.py
├── ch03
│   ├── .gitignore
│   ├── requirements.txt
│   ├── README.md
│   ├── weather.csv
│   ├── rain.csv
│   ├── df_location.csv
│   ├── life_location.csv
│   └── submission.ipynb
├── ch04
│   ├── environment.yml
│   ├── README.md
│   └── problem.ipynb
└── ch05
    ├── requirements.txt
    └── README.md
/ch01/.gitignore:
--------------------------------------------------------------------------------
1 | dataset/
--------------------------------------------------------------------------------
/ch02/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | bin/
--------------------------------------------------------------------------------
/ch05/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas
3 | statsmodels
--------------------------------------------------------------------------------
/ch03/weather.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/dacon/master/ch03/weather.csv
--------------------------------------------------------------------------------
/ch01/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas<=1.1
3 | scikit-learn
4 | xgboost
5 |
--------------------------------------------------------------------------------
/ch03/df_location.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/dacon/master/ch03/df_location.csv
--------------------------------------------------------------------------------
/ch03/life_location.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wikibook/dacon/master/ch03/life_location.csv
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *.xlsx
3 | *.bak
4 |
5 | .ipynb_checkpoints/
6 | __pycache__/
7 |
8 | .vscode/
--------------------------------------------------------------------------------
/ch03/.gitignore:
--------------------------------------------------------------------------------
1 | !weather.csv
2 | !rain.csv
3 | !df_location.csv
4 | !life_location.csv
5 |
6 | submission/
7 |
--------------------------------------------------------------------------------
/ch03/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas
3 | geopy
4 | scikit-learn
5 | xgboost
6 | lightgbm
7 | tqdm
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | notebook
3 | numpy==1.19.3 # Workaround for issue https://tinyurl.com/y3dm3h86
4 | seaborn<=0.11
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DACON Competition First-Place Solutions
2 |
3 | * [Chapter 1: Batter OPS Modeling](ch01)
4 | * [Chapter 2: Semiconductor Thin-Film Thickness Analysis](ch02)
5 | * [Chapter 3: Bus Ridership Prediction](ch03)
6 | * [Chapter 4: Store Sales Prediction](ch04)
7 | * [Chapter 5: Pitcher Scouting Optimization](ch05)
8 |
--------------------------------------------------------------------------------
/ch02/requirements.txt:
--------------------------------------------------------------------------------
1 | -r ../requirements.txt
2 | pandas
3 | scikit-learn
4 | xgboost
5 | tqdm
6 |
7 | --find-links https://download.pytorch.org/whl/torch_stable.html
8 | torch>=1.4.0
9 |
--------------------------------------------------------------------------------
/ch01/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 1: KBO Batter OPS Prediction
2 |
3 | * Packages required for the exercises: [requirements.txt](requirements.txt)
4 |
5 | * Example code: [batter_OPS_prediction.ipynb](batter_OPS_prediction.ipynb)
6 |
7 | * Data download: [https://dacon.io/competitions/official/62540/data/](https://dacon.io/competitions/official/62540/data/) (requires signing up for a DACON account and joining the competition)
8 |
--------------------------------------------------------------------------------
/ch04/environment.yml:
--------------------------------------------------------------------------------
1 | name: store_amount_prediction
2 | dependencies:
3 | - python=3.7
4 | - r::rpy2==2.9.4
5 | - jupyter
6 | - notebook
7 | - pip
8 | - pip:
9 | - tzlocal
10 | - numpy==1.19.1
11 | - pandas==0.25.1
12 | - pmdarima==1.5.3
13 | - statsmodels==0.11.1
14 | - seaborn==0.11.0
15 | - tqdm==4.51.0
--------------------------------------------------------------------------------
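Unlike the other chapters, ch04 pins its environment with conda rather than pip. Assuming conda is installed, the environment above can be created and activated with:

    conda env create -f environment.yml
    conda activate store_amount_prediction
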
/ch02/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Manages the paths and training configuration. The paths assume a Linux environment,
3 | # but training on Windows also works as long as Python and PyTorch (CPU build) are installed.
4 | TRAIN_PATH = 'data/train_splited.csv'
5 | VAL_PATH = 'data/val.csv'
6 |
7 | LR = 1e-03
8 | ADAM_EPSILON = 1e-06
9 | EPOCHS = 100
10 | BATCH_SIZE = 2048
11 | WARMUP_STEPS = 2000
12 |
--------------------------------------------------------------------------------
/ch05/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 5: KBO Foreign Pitcher Scouting Optimization Competition
2 |
3 | * Packages required for the exercises: [requirements.txt](requirements.txt)
4 | * Example code
5 |   + [eda.ipynb](eda.ipynb)
6 |     - 6.2. Exploratory data analysis
7 |   + [evaluation.ipynb](evaluation.ipynb)
8 |     - 6.3. Data preprocessing
9 |     - 6.4. Model building and validation
10 |     - 6.5. Methods for improving performance
11 | * Data download: https://dacon.io/competitions/official/68346/data/
12 |
--------------------------------------------------------------------------------
/ch04/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 4: Store Credit Card Sales Prediction
2 |
3 | * Environment for the exercises: [environment.yml](environment.yml)
4 | * Example code
5 |   + [problem.ipynb](problem.ipynb)
6 |     - 4.1 Problem definition
7 |   + [store_amount_prediction.ipynb](store_amount_prediction.ipynb)
8 |     - 4.2 Data preprocessing
9 |     - 4.3 Exploratory data analysis
10 |     - 4.4 Model building and validation
11 |     - 4.5 Methods for improving performance
12 | * Data download: https://dacon.io/competitions/official/140472/data/
13 |
--------------------------------------------------------------------------------
/ch02/dataloader_test.py:
--------------------------------------------------------------------------------
1 | import config
2 |
3 | from torch.utils.data import DataLoader
4 | from src.utils import PandasDataset
5 |
6 | # The batch size is a hyperparameter the user can set directly.
7 | batch_size = 32
8 |
9 | # Paths to the training CSV and validation CSV files.
10 | train_path = config.TRAIN_PATH #'data/train_splited.csv'
11 | val_path = config.VAL_PATH #'data/val.csv'
12 |
13 | # The DataLoaders below return the data in batches of batch_size.
14 | train_dataset = PandasDataset(train_path)
15 | train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0)
16 |
17 | val_dataset = PandasDataset(val_path)
18 | val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)
19 |
--------------------------------------------------------------------------------
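A quick sanity check for these loaders (an illustrative sketch, not part of the original script; it assumes data/train_splited.csv and data/val.csv exist, as produced by ipynb/split_file.ipynb):

    batch = next(iter(train_loader))
    print(batch['X'].shape)  # torch.Size([32, 226]): 226 spectral features per sample
    print(batch['Y'].shape)  # torch.Size([32, 4]): the four layer-thickness targets
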
/ch03/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 3: Predicting Evening Rush-Hour Bus Ridership
2 |
3 | * Packages required for the exercises: [requirements.txt](requirements.txt)
4 |
5 | * Example code
6 |   + [eda.ipynb](eda.ipynb): 3.2 Exploratory data analysis
7 |   + [prediction.ipynb](prediction.ipynb)
8 |     - 3.3 Data preprocessing
9 |     - 3.4 Model building and validation
10 |   + [jeju_life_location.ipynb](jeju_life_location.ipynb)
11 |     - Extracting administrative district names
12 |   + [submission.ipynb](submission.ipynb)
13 |     - 3.5 Methods for improving performance
14 |
15 | * Data
16 |   + DACON data downloads
17 |     - Evening rush-hour bus ridership competition: https://dacon.io/competitions/official/229255/data/
18 |     - KCB financial-style visualization competition: https://dacon.io/competitions/official/82407/data/
19 |   + External data
20 |     - Weather data: [weather.csv](weather.csv), [rain.csv](rain.csv)
21 |     - Location data: [df_location.csv](df_location.csv), [life_location.csv](life_location.csv)
22 |
--------------------------------------------------------------------------------
/ch02/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 2: Semiconductor Thin-Film Thickness Analysis
2 |
3 | * Packages required for the exercises: [requirements.txt](requirements.txt)
4 | * Example code
5 |   - [2장 코드 통합.ipynb](https://github.com/wikibook/dacon/blob/master/ch02/2%EC%9E%A5%20%EC%BD%94%EB%93%9C%20%ED%86%B5%ED%95%A9.ipynb) (combined notebook for the chapter)
6 |   - [config.py](config.py): shared configuration
7 |   - [dataloader_test.py](dataloader_test.py): 2.3.3 Custom dataset class
8 |   - [train.py](train.py): 2.5.2.1 Tuning the optimizer and scheduler
9 |   - [test.py](test.py): 2.5.2.2 Hyperparameters
10 |   - ipynb/
11 |     - [eda.ipynb](ipynb/eda.ipynb): 2.2 Exploratory data analysis
12 |     - [split_file.ipynb](ipynb/split_file.ipynb): 2.3 Data preprocessing
13 |     - [voting.ipynb](ipynb/voting.ipynb): 2.5.3 Ensembling
14 |   - src/
15 |     - [model.py](src/model.py): 2.5 Methods for improving performance
16 |     - [utils.py](src/utils.py): 2.3.3 Custom dataset class
17 |
18 | * Data download: https://dacon.io/competitions/official/235554/data/
19 |
--------------------------------------------------------------------------------
/ch02/ipynb/voting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "[2, 1, 0]\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "output_1 = [0.7, 0.1, 0.2]\n",
18 | "output_2 = [0.5, 0.2, 0.3]\n",
19 | "output_3 = [0.3, 0.4, 0.3]\n",
20 | "\n",
21 | "\n",
22 |     "def hard_voting(output_1, output_2, output_3):\n",
23 |     "    \n",
24 |     "    result = [0,0,0]\n",
25 |     "    \n",
26 |     "    # Find the index of the largest value in each output and give that class one vote.\n",
27 | " output_1_max_value = max(output_1)\n",
28 | " output_1_max_index = output_1.index(output_1_max_value)\n",
29 | " result[output_1_max_index] += 1\n",
30 | "\n",
31 | " output_2_max_value = max(output_2)\n",
32 | " output_2_max_index = output_2.index(output_2_max_value)\n",
33 | " result[output_2_max_index] += 1\n",
34 | "\n",
35 | " output_3_max_value = max(output_3)\n",
36 | " output_3_max_index = output_3.index(output_3_max_value)\n",
37 | " result[output_3_max_index] += 1\n",
38 | "\n",
39 | " return result\n",
40 | "\n",
41 | "\n",
42 | "result = hard_voting(output_1, output_2, output_3)\n",
43 | "print(result)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "[0.5, 0.23333333333333336, 0.26666666666666666]\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "output_1 = [0.7, 0.1, 0.2]\n",
61 | "output_2 = [0.5, 0.2, 0.3]\n",
62 | "output_3 = [0.3, 0.4, 0.3]\n",
63 | "\n",
64 | "\n",
65 |     "def soft_voting(output_1, output_2, output_3):\n",
66 |     "    \n",
67 |     "    result = [0,0,0]\n",
68 |     "    \n",
69 |     "    # Average each class's (softmax) probability across the outputs.\n",
70 | " result = [(x+y+z)/3 for x,y,z in zip(output_1, output_2, output_3)]\n",
71 | " \n",
72 | " return result\n",
73 | "\n",
74 | "\n",
75 | "result = soft_voting(output_1, output_2, output_3)\n",
76 | "print(result)"
77 | ]
78 |   }
79 |  ],
80 |  "metadata": {
81 |   "kernelspec": {
82 |    "display_name": "Python 3",
83 |    "language": "python",
84 |    "name": "python3"
85 |   },
86 |   "language_info": {
87 |    "codemirror_mode": {
88 |     "name": "ipython",
89 |     "version": 3
90 |    },
91 |    "file_extension": ".py",
92 |    "mimetype": "text/x-python",
93 |    "name": "python",
94 |    "nbconvert_exporter": "python",
95 |    "pygments_lexer": "ipython3",
96 |    "version": "3.8.5"
97 |   }
98 |  },
99 |  "nbformat": 4,
100 |  "nbformat_minor": 4
101 | }
102 |
--------------------------------------------------------------------------------
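The two cells above spell out the votes one model at a time; the same logic generalizes to any number of model outputs. A minimal sketch (function names hypothetical, numpy assumed available):

    import numpy as np

    def hard_voting_n(outputs):
        votes = [0] * len(outputs[0])
        for out in outputs:                       # each model casts one vote for its argmax class
            votes[int(np.argmax(out))] += 1
        return votes

    def soft_voting_n(outputs):
        return np.mean(outputs, axis=0).tolist()  # element-wise average of class probabilities

    assert hard_voting_n([[0.7, 0.1, 0.2], [0.5, 0.2, 0.3], [0.3, 0.4, 0.3]]) == [2, 1, 0]
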
/ch02/ipynb/split_file.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "(810000, 230)\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "from itertools import chain\n",
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "\n",
21 | "\"\"\"\n",
22 |     "Splits off a portion of the originally provided training dataset as a validation set and re-saves both.\n",
23 |     "The data in train.csv is split: the training portion is saved to train_splited.csv\n",
24 |     "and the validation portion is saved to val.csv.\n",
25 | "\"\"\"\n",
26 | "path_train = '../data/train.csv'\n",
27 | "\n",
28 |     "# Build the column list used to rebuild the index when the shuffled data is saved.\n",
29 | "layers = [['layer_1','layer_2','layer_3','layer_4'], \\\n",
30 | " [str(i) for i in np.arange(0,226).tolist()]]\n",
31 | "layers = list(chain(*layers))\n",
32 | "\n",
33 |     "# Shuffle the rows of train at random.\n",
34 | "train = pd.read_csv(path_train)\n",
35 | "print(train.shape)\n",
36 | "train = train.sample(frac=1)\n",
37 | "rows, cols = train.shape"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "train file saved....\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 |     "# Cut a fixed portion of the training data (80,000 of the 810,000 rows, roughly 10%) as validation data and keep the rest for training.\n",
55 |     "train1 = train.iloc[:rows - 80000,:]\n",
56 |     "train1 = train1.values\n",
57 |     "train1 = pd.DataFrame(data=train1,columns=layers)\n",
58 |     "\n",
59 |     "# Save the training data to a CSV file using pandas' to_csv().\n",
60 | "train1.to_csv('../data/train_splited.csv', index_label='id')\n",
61 | "print(\"train file saved....\")"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "validation file saved....\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 |     "# Likewise, save the remaining rows as the validation CSV file.\n",
79 | "val = train.iloc[rows - 80000:,:]\n",
80 | "val = val.values\n",
81 | "val = pd.DataFrame(data=val,columns=layers)\n",
82 | "val.to_csv('../data/val.csv', index_label='id')\n",
83 | "print(\"validation file saved....\")"
84 | ]
85 | }
86 | ],
87 | "metadata": {
88 | "kernelspec": {
89 | "display_name": "Python 3",
90 | "language": "python",
91 | "name": "python3"
92 | },
93 | "language_info": {
94 | "codemirror_mode": {
95 | "name": "ipython",
96 | "version": 3
97 | },
98 | "file_extension": ".py",
99 | "mimetype": "text/x-python",
100 | "name": "python",
101 | "nbconvert_exporter": "python",
102 | "pygments_lexer": "ipython3",
103 | "version": "3.8.5"
104 | }
105 | },
106 | "nbformat": 4,
107 | "nbformat_minor": 2
108 | }
109 |
--------------------------------------------------------------------------------
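For reference, the shuffle-and-slice above is equivalent to a scikit-learn split (a sketch under the same assumptions; scikit-learn is already listed in ch02/requirements.txt):

    from sklearn.model_selection import train_test_split

    # Hold out exactly 80,000 shuffled rows for validation, as in the notebook.
    train1, val = train_test_split(train, test_size=80000, shuffle=True)
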
/ch02/ipynb/MAE.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "## Evaluate test data\n",
8 |     "\n",
9 |     "- Measures: base.csv ~ *.csv\n",
10 |     "- Used only as a rough indicator of overfitting."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/plain": [
21 | "0.45674784462451934"
22 | ]
23 | },
24 | "execution_count": 1,
25 | "metadata": {},
26 | "output_type": "execute_result"
27 | }
28 | ],
29 | "source": [
30 | "import pandas as pd\n",
31 | "import numpy as np\n",
32 | "\n",
33 | "best_path = 'test_(20, 57)_0.001_100.csv'\n",
34 | "my_path = 'test_(9, 45)_0.001_150.csv'\n",
35 | "\n",
36 | "def mae(best_path, my_path):\n",
37 | " best = pd.read_csv(best_path)\n",
38 | " best_value = best.iloc[:,1:].values\n",
39 | "\n",
40 | " value = pd.read_csv(my_path)\n",
41 | " my_value = value.iloc[:,1:].values\n",
42 | "\n",
43 | " abs_value = abs(best_value - my_value)\n",
44 | " size = abs_value.shape\n",
45 | " return sum(sum(abs_value)) / (size[0]*size[1])\n",
46 | "\n",
47 | "mae(best_path, my_path)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 4,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "from torch.optim import lr_scheduler"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 12,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "data": {
66 | "text/plain": [
67 | "0.38016"
68 | ]
69 | },
70 | "execution_count": 12,
71 | "metadata": {},
72 | "output_type": "execute_result"
73 | }
74 | ],
75 | "source": [
76 | "528*180 / 250000"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 15,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/plain": [
87 | "0.002112"
88 | ]
89 | },
90 | "execution_count": 15,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "528/250000"
97 | ]
98 |     }
99 |   ],
100 |  "metadata": {
101 |   "kernelspec": {
102 |    "display_name": "Python 3",
103 |    "language": "python",
104 |    "name": "python3"
105 |   },
106 |   "language_info": {
107 |    "codemirror_mode": {
108 |     "name": "ipython",
109 |     "version": 3
110 |    },
111 |    "file_extension": ".py",
112 |    "mimetype": "text/x-python",
113 |    "name": "python",
114 |    "nbconvert_exporter": "python",
115 |    "pygments_lexer": "ipython3",
116 |    "version": "3.8.5"
117 |   }
118 |  },
119 |  "nbformat": 4,
120 |  "nbformat_minor": 2
121 | }
122 |
--------------------------------------------------------------------------------
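The mae() above is simply the mean absolute difference between two submission files; the same number falls out of a one-line numpy reduction (a sketch assuming the same CSV layout, with an id column first):

    import numpy as np
    import pandas as pd

    def mae_np(best_path, my_path):
        best = pd.read_csv(best_path).iloc[:, 1:].values
        mine = pd.read_csv(my_path).iloc[:, 1:].values
        return np.abs(best - mine).mean()  # same as sum(sum(|a-b|)) / (rows * cols)
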
/ch02/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import torch
4 | from src.model import SkipConnectionModel
5 | from src.utils import TestDataset
6 | from torch.utils.data import DataLoader
7 |
8 | # Use the GPU for model evaluation when one is available.
9 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
10 |
11 | # Location of the test data
12 | path_test = 'data/test.csv'
13 |
14 | # pth file (single-model example)
15 | # Load the pth file saved during training.
16 | pth_bin = 'bin/test.pth' # a trained model must already be saved here
17 |
18 | # Create the directory the result CSV will be written to.
19 | if not os.path.exists('test'): # rename 'test' to suit your setup
20 | os.mkdir('test')
21 |
22 | ########################################
23 | ######### Testing a single model ##########
24 | ########################################
25 |
26 | # Test Model
27 | # Re-instantiate the model so it can be tested.
28 | test_model = SkipConnectionModel(fn_in=226, fn_out=4)
29 | test_model.load_state_dict(torch.load(pth_bin, map_location=device)) # load the trained weights
30 | test_model = test_model.to(device).eval() # evaluation mode for inference
31 | # Load the test dataset.
32 | test_data = TestDataset(path_test)
33 | test_loader = DataLoader(test_data, batch_size=10000, num_workers=0)
34 |
35 | # Run the model over the test data and write the predictions to a file.
36 | with torch.no_grad():
37 | for data in test_loader:
38 | data = data.to(device)
39 | outputs = test_model(data.float())
40 | pred_test = outputs
41 |
42 | sample_sub = pd.read_csv('data/sample_submission.csv', index_col=0)
43 | layers = ['layer_1','layer_2','layer_3','layer_4'] # column names of the data
44 | submission = sample_sub.values + pred_test.cpu().numpy() # move predictions to the CPU before writing the file
45 |
46 | submission = pd.DataFrame(data=submission,columns=layers)
47 | submission.to_csv('./test/submission.csv', index_label='id')
48 |
49 |
50 | #######################################################################
51 | ### The version below is example code for ensembling.
52 | #######################################################################
53 | # path to the test file
54 | # path_test = 'data/test.csv'
55 |
56 | # pth_list = os.listdir('bin') # directory where the saved pth files live
57 | # print(pth_list) # inspect the list of pth files
58 |
59 | # # Define the models in a dictionary so each can be looked up by name.
60 | # models = {
61 | # 'model':TestModel(),
62 | # 'model1': TestModel1(),
63 | # 'model2': TestModel2(),
64 | # 'model3': TestModel3(),
65 | # 'model4': TestModel4(),
66 | # 'model5': TestModel5(),
67 | # 'model6': TestModel6()
68 | # }
69 |
70 |
71 | # Load the trained weights into the model.
72 | # USER_BIN = 'bin/model.pth'
73 | # weights = torch.load(USER_BIN, map_location='cuda:1')
74 | # test_model.load_state_dict(weights)
75 | # test_model = test_model.to(device)
76 | # test_model.eval()
77 |
78 |
79 | # Write a submission file for each model in the ensemble.
80 | # for pth in sorted(pth_list):
81 | # if pth[-3:] != 'pth':
82 | # pass
83 | # else:
84 | # if int(pth[0]) == 0:
85 | # test_model(pth, test_loader, model_type='model')
86 | # elif int(pth[0]) == 1:
87 | # test_model(pth, test_loader, model_type='model1')
88 | # elif int(pth[0]) == 2:
89 | # #test_model(pth, test_loader, model_type='model2')
90 | # pass
91 | # elif int(pth[0]) == 3:
92 | # test_model(pth, test_loader, model_type='model4')
93 | # elif int(pth[0]) > 3 and int(pth[0]) <7:
94 | # test_model(pth, test_loader, model_type='model5')
95 | # elif int(pth[0])>= 7:
96 | # test_model(pth, test_loader, model_type='model6')
97 |
98 | # Load pth weights, run the model on the test set, and write the resulting CSV.
99 | # def test_model(path_pth, test_loader, model_type:str):
100 | # model = models[model_type]
101 | # ws = torch.load(f'./outputs/{path_pth}', map_location='cpu') # load the weights
102 | # model.load_state_dict(ws)
103 | # model.eval()
104 |
105 | # with torch.no_grad():
106 | # for data in test_loader:
107 | # outputs = model(data.float()) # run the model on a batch
108 | # pred_test = outputs
109 |
110 | # sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
111 | # layers = ['layer_1','layer_2','layer_3','layer_4']
112 | # submission = sample_sub.values + pred_test.numpy()
113 |
114 | # submission = pd.DataFrame(data=submission,columns=layers)
115 | # submission.to_csv(f'./test/{path_pth[:-4]}.csv', index_label='id') # save the CSV under ./test/
116 |
117 | # def check_state(model):
118 | # for val in model.state_dict().keys():
119 | # if val[-4:] =='bias':
120 | # pass
121 | # else:
122 | # print(f'{val} : {model.state_dict()[val].shape}')
123 |
--------------------------------------------------------------------------------
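Once the commented-out ensemble loop has written one CSV per model into ./test, a soft vote over them is just the element-wise average of the files. A sketch (not part of the original script):

    import glob
    import pandas as pd

    paths = sorted(glob.glob('test/*.csv'))
    ensemble = sum(pd.read_csv(p, index_col=0) for p in paths) / len(paths)
    # Written outside test/ so a rerun does not average the result into itself.
    ensemble.to_csv('ensemble_submission.csv', index_label='id')
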
/ch02/src/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import pandas as pd
3 | import torch
4 | from torch.optim.lr_scheduler import LambdaLR
5 | from torch.utils.data import Dataset
6 |
7 | class PandasDataset(Dataset):
8 | """ Train dataset을 가져와서 torch 모델이 학습할 수 있는 tensor 형태로 반환합니다."""
9 | def __init__(self, path):
10 | super(PandasDataset, self).__init__()
11 | train = pd.read_csv(path).iloc[:,1:]
12 | self.train_X, self.train_Y = train.iloc[:,4:], train.iloc[:,0:4]
13 | self.tmp_x , self.tmp_y = self.train_X.values, self.train_Y.values
14 |
15 | def __len__(self):
16 | return len(self.train_X)
17 |
18 | def __getitem__(self, idx):
19 | return {
20 | 'X':torch.from_numpy(self.tmp_x)[idx],
21 | 'Y':torch.from_numpy(self.tmp_y)[idx]
22 | }
23 |
24 | class TestDataset(Dataset):
25 | def __init__(self, path_test):
26 | super(TestDataset, self).__init__()
27 | test = pd.read_csv(path_test)
28 | self.test_X = test.iloc[:,1:]
29 | self.tmp_x = self.test_X.values
30 |
31 | def __len__(self):
32 | return len(self.test_X)
33 |
34 | def __getitem__(self, idx):
35 | return torch.from_numpy(self.tmp_x)[idx]
36 |
37 | """
38 | 학습 최적화를 위해 스케줄러를 활용합니다.
39 | Pytorch 및 transformer의 스케줄러를 참고.
40 | https://github.com/huggingface/transformers/blob/master/src/transformers/optimization.py
41 | """
42 | def get_constant_schedule(optimizer, last_epoch=-1):
43 | """ Create a schedule with a constant learning rate.
44 | """
45 | return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
46 |
47 |
48 | def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
49 | """ Create a schedule with a constant learning rate preceded by a warmup
50 | period during which the learning rate increases linearly between 0 and 1.
51 | """
52 |
53 | def lr_lambda(current_step):
54 | if current_step < num_warmup_steps:
55 | return float(current_step) / float(max(1.0, num_warmup_steps))
56 | return 1.0
57 |
58 | return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
59 |
60 |
61 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
62 | """ Create a schedule with a learning rate that decreases linearly after
63 | linearly increasing during a warmup period.
64 | """
65 |
66 | def lr_lambda(current_step):
67 | if current_step < num_warmup_steps:
68 | return float(current_step) / float(max(1, num_warmup_steps))
69 | return max(
70 | 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
71 | )
72 |
73 | return LambdaLR(optimizer, lr_lambda, last_epoch)
74 |
75 |
76 | def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1):
77 | """ Create a schedule with a learning rate that decreases following the
78 | values of the cosine function between 0 and `pi * cycles` after a warmup
79 | period during which it increases linearly between 0 and 1.
80 | """
81 |
82 | def lr_lambda(current_step):
83 | if current_step < num_warmup_steps:
84 | return float(current_step) / float(max(1, num_warmup_steps))
85 | progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
86 | return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
87 |
88 | return LambdaLR(optimizer, lr_lambda, last_epoch)
89 |
90 |
91 | def get_cosine_with_hard_restarts_schedule_with_warmup(
92 | optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1
93 | ):
94 | """ 학습률이 웜업 기간 이후 몇 번의 하드 리스타트를 하는 코사인 함수 값에 따라 감소하는
95 | 스케줄러를 만듭니다. 웜업 기간에는 학습률이 0과 1 사이에서 선형으로 증가합니다.
96 | """
97 |
98 | def lr_lambda(current_step):
99 | if current_step < num_warmup_steps:
100 | return float(current_step) / float(max(1, num_warmup_steps))
101 | progress = float(current_step - num_warmup_steps) / \
102 | float(max(1, num_training_steps - num_warmup_steps))
103 | if progress >= 1.0:
104 | return 0.0
105 | return max(0.0, \
106 | 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))
107 |
108 | return LambdaLR(optimizer, lr_lambda, last_epoch)
--------------------------------------------------------------------------------
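To see what get_cosine_with_hard_restarts_schedule_with_warmup does to the learning rate, it can be traced on a throwaway optimizer (an illustrative sketch; the toy step counts are arbitrary):

    import torch
    from src.utils import get_cosine_with_hard_restarts_schedule_with_warmup

    opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
    sched = get_cosine_with_hard_restarts_schedule_with_warmup(
        opt, num_warmup_steps=10, num_training_steps=100, num_cycles=2)
    for step in range(100):
        opt.step()
        sched.step()
        if step in (9, 53, 54, 99):  # end of warmup, around the restart, last step
            print(step, round(opt.param_groups[0]['lr'], 4))
    # -> roughly: 9 1.0 / 53 0.0012 / 54 1.0 / 99 0.0
    # i.e. linear warmup to the peak, cosine decay toward 0, a hard restart at the
    # halfway point (num_cycles=2), then a second decay to 0.
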
/ch02/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | import os
5 | import time
6 |
7 | import numpy as np
8 | import pandas as pd
9 | from tqdm.auto import tqdm
10 |
11 | import torch
12 | import torch.nn as nn
13 | from torch.utils.data import Dataset, DataLoader
14 | from torch.optim import AdamW
15 |
16 | import config
17 | from src.model import SkipConnectionModel
18 | from src.utils import PandasDataset, get_cosine_with_hard_restarts_schedule_with_warmup
19 |
20 | # Set the hyperparameters used to train the model.
21 | lr = config.LR
22 | adam_epsilon = config.ADAM_EPSILON
23 | epochs = config.EPOCHS
24 | batch_size = config.BATCH_SIZE
25 | warmup_step = config.WARMUP_STEPS
26 |
27 | # Original data: train.csv was shuffled at random ahead of time to create the
28 | # train split, with a separate val split held out for evaluation (9:1).
29 | # Set the paths to the training data.
30 | train_path = config.TRAIN_PATH
31 | val_path = config.VAL_PATH
32 |
33 | # The DataLoaders return the data in batches.
34 | train_dataset = PandasDataset(train_path)
35 | train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0)
36 |
37 | val_dataset = PandasDataset(val_path)
38 | val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)
39 |
40 | # Compute the total number of steps the model will train for.
41 | total_step = len(train_loader) * epochs
42 | print(f"Total step is....{total_step}")
43 |
44 | # Instantiate the model.
45 | model = SkipConnectionModel(fn_in=226, fn_out=4) # channel sizes are set inside the model
46 |
47 | # If a GPU/CUDA environment is available, train on CUDA;
48 | # otherwise the CPU is selected automatically.
49 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
50 |
51 | # Move the model to GPU memory (falls back to the CPU when no GPU is present).
52 | model = model.to(device)
53 |
54 | # Loss function and optimizer
55 | # Define the loss function used to optimize the network.
56 | # MAE (L1 loss) is used.
57 | loss_fn = nn.L1Loss()
58 |
59 | # Define the parameter groups for the optimizer and scheduler.
60 | no_decay = ["bias", "LayerNorm.weight"] # parameters excluded from weight decay
61 | optimizer_grouped_parameters = [
62 |     {
63 |         "params": [p for n, p in model.named_parameters() \
64 |             if not any(nd in n for nd in no_decay)],
65 |         "weight_decay": 0.0, # note: also 0.0 here, so both groups currently behave identically
66 |     },
67 |     {"params": [p for n, p in model.named_parameters() \
68 |         if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
69 | ]
70 |
71 | # Create the optimizer and scheduler objects.
72 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
73 | scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
74 | optimizer, num_warmup_steps=warmup_step, num_training_steps=total_step
75 | )
76 |
77 | # Variables used to name the saved model.
78 | version = time.localtime()[3:5]
79 | curr_lr = lr
80 |
81 | # Initialize the train and validation losses.
82 | total_loss = 0.0
83 | total_val_loss = 0.0
84 | n_val_loss = 10000000. # tracks the best (lowest) validation loss seen so far
85 |
86 | if not os.path.exists('bin'):
87 | os.mkdir('bin')
88 |
89 | for epoch in range(epochs):
90 |     total_loss, total_val_loss = 0, 0
91 |     model.train() # re-enable train mode (model.eval() is set during validation below)
92 |     for i, data in enumerate(tqdm(train_loader, desc='*********Train mode*******')):
93 |         # Load a batch of training data and learn from it.
94 |         # Forward pass
95 | pred = model(data['X'].float().to(device))
96 | loss = loss_fn(pred, data['Y'].float().to(device))
97 |
98 |         # Backward pass
99 |         # Zero the gradients of all trainable weights
100 |         # through the optimizer object.
101 | optimizer.zero_grad()
102 |
103 |         # Backpropagate the error from the loss.
104 | loss.backward()
105 |
106 |         # Update the model parameters via the optimizer.
107 | optimizer.step()
108 |
109 |         # Advance the learning-rate scheduler one step.
110 | scheduler.step()
111 |
112 | total_loss += loss.item()
113 |
114 | train_loss = total_loss / len(train_loader)
115 | print ("Epoch [{}/{}], Train Loss: {:.4f}".format(epoch+1, epochs, train_loss))
116 |
117 |     # Evaluation
118 |     # Evaluate the model trained so far on the validation data after each epoch.
119 | model.eval()
120 | with torch.no_grad():
121 | for i, data in enumerate(tqdm(val_loader, \
122 | desc='*********Evaluation mode*******')):
123 | pred = model(data['X'].float().to(device))
124 | loss_val = loss_fn(pred, data['Y'].float().to(device))
125 |
126 | total_val_loss += loss_val.item()
127 | val_loss = total_val_loss / len(val_loader)
128 | print ("Epoch [{}/{}], Eval Loss: {:.4f}".format(epoch+1, epochs, val_loss))
129 |
130 |     # Save the model from the epoch with the lowest validation MAE.
131 | if val_loss < n_val_loss:
132 | n_val_loss = val_loss
133 | #torch.save(model.state_dict(), f'bin/test_{version}.pth')
134 |         torch.save(model.state_dict(), 'bin/test.pth')
135 | print("Best Model saved......")
136 |
--------------------------------------------------------------------------------
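The no_decay grouping in train.py follows the Hugging Face transformers recipe, whose usual intent is to decay everything except biases and LayerNorm weights; as written above, both groups use 0.0, so the split has no effect. A sketch of the conventional form, reusing train.py's model and no_decay (0.01 is a typical value, not one taken from this repo):

    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},  # decayed: ordinary weights
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},   # not decayed: biases and LayerNorm weights
    ]
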
/ch03/rain.csv:
--------------------------------------------------------------------------------
1 | 지점,일시,기온(°C),강수량(mm)
2 | gosan,2019-09-01,23.916666666666668,0.0
3 | gosan,2019-09-02,23.7,12.1
4 | gosan,2019-09-03,23.549999999999997,1.9
5 | gosan,2019-09-04,23.53333333333333,12.84
6 | gosan,2019-09-05,26.033333333333335,0.0
7 | gosan,2019-09-06,27.066666666666666,
8 | gosan,2019-09-07,25.133333333333336,0.0
9 | gosan,2019-09-08,24.383333333333336,0.2
10 | gosan,2019-09-09,26.666666666666668,4.750000000000001
11 | gosan,2019-09-10,26.183333333333334,
12 | gosan,2019-09-11,25.95,0.6
13 | gosan,2019-09-12,25.483333333333334,
14 | gosan,2019-09-13,23.75,
15 | gosan,2019-09-14,23.566666666666666,
16 | gosan,2019-09-15,23.900000000000002,
17 | gosan,2019-09-16,23.8,
18 | gosan,2019-09-17,23.866666666666664,
19 | gosan,2019-09-18,22.683333333333334,
20 | gosan,2019-09-19,22.716666666666665,
21 | gosan,2019-09-20,22.21666666666667,
22 | gosan,2019-09-21,20.216666666666665,1.44
23 | gosan,2019-09-22,20.016666666666666,0.72
24 | gosan,2019-09-23,20.48333333333333,
25 | gosan,2019-09-24,20.533333333333335,
26 | gosan,2019-09-25,21.46666666666667,
27 | gosan,2019-09-26,22.266666666666666,
28 | gosan,2019-09-27,24.416666666666668,
29 | gosan,2019-09-28,24.46666666666667,
30 | gosan,2019-09-29,23.766666666666666,
31 | gosan,2019-09-30,23.866666666666664,
32 | gosan,2019-10-01,23.883333333333336,0.30000000000000004
33 | gosan,2019-10-02,25.099999999999998,5.550000000000001
34 | gosan,2019-10-03,23.13333333333333,0.07500000000000001
35 | gosan,2019-10-04,23.233333333333334,
36 | gosan,2019-10-05,21.666666666666668,
37 | gosan,2019-10-06,20.400000000000002,
38 | gosan,2019-10-07,22.516666666666666,0.0
39 | gosan,2019-10-08,19.86666666666667,
40 | gosan,2019-10-09,17.166666666666668,
41 | gosan,2019-10-10,19.583333333333332,
42 | gosan,2019-10-11,20.566666666666666,
43 | gosan,2019-10-12,21.216666666666665,
44 | gosan,2019-10-13,19.666666666666668,
45 | gosan,2019-10-14,19.466666666666665,
46 | gosan,2019-10-15,16.633333333333336,
47 | gosan,2019-10-16,18.46666666666667,
48 | jeju,2019-09-01,24.650000000000002,0.0
49 | jeju,2019-09-02,22.8,4.74
50 | jeju,2019-09-03,24.03333333333333,3.1
51 | jeju,2019-09-04,24.383333333333336,6.9799999999999995
52 | jeju,2019-09-05,26.11666666666667,0.1
53 | jeju,2019-09-06,27.533333333333335,0.0
54 | jeju,2019-09-07,27.983333333333334,0.04
55 | jeju,2019-09-08,25.133333333333336,0.0
56 | jeju,2019-09-09,26.666666666666668,1.116666666666667
57 | jeju,2019-09-10,26.583333333333332,
58 | jeju,2019-09-11,25.26666666666667,1.25
59 | jeju,2019-09-12,24.28333333333333,0.19999999999999998
60 | jeju,2019-09-13,23.51666666666667,
61 | jeju,2019-09-14,23.983333333333334,
62 | jeju,2019-09-15,24.583333333333332,
63 | jeju,2019-09-16,24.566666666666666,
64 | jeju,2019-09-17,23.833333333333332,
65 | jeju,2019-09-18,23.383333333333336,
66 | jeju,2019-09-19,22.849999999999998,
67 | jeju,2019-09-20,22.099999999999998,
68 | jeju,2019-09-21,20.400000000000002,1.75
69 | jeju,2019-09-22,20.25,7.766666666666667
70 | jeju,2019-09-23,20.7,
71 | jeju,2019-09-24,20.333333333333332,
72 | jeju,2019-09-25,21.183333333333334,
73 | jeju,2019-09-26,22.883333333333336,0.2
74 | jeju,2019-09-27,24.233333333333334,
75 | jeju,2019-09-28,24.066666666666663,
76 | jeju,2019-09-29,23.983333333333334,0.0
77 | jeju,2019-09-30,23.866666666666664,
78 | jeju,2019-10-01,24.76666666666667,0.3666666666666667
79 | jeju,2019-10-02,24.716666666666665,11.216666666666669
80 | jeju,2019-10-03,22.899999999999995,0.05
81 | jeju,2019-10-04,23.350000000000005,
82 | jeju,2019-10-05,21.950000000000003,
83 | jeju,2019-10-06,20.549999999999997,0.25
84 | jeju,2019-10-07,22.516666666666666,
85 | jeju,2019-10-08,20.466666666666665,
86 | jeju,2019-10-09,17.45,
87 | jeju,2019-10-10,18.2,
88 | jeju,2019-10-11,20.416666666666668,
89 | jeju,2019-10-12,21.316666666666666,
90 | jeju,2019-10-13,19.366666666666667,
91 | jeju,2019-10-14,19.783333333333335,
92 | jeju,2019-10-15,16.900000000000002,
93 | jeju,2019-10-16,17.98333333333333,
94 | po,2019-09-01,23.78333333333333,0.0
95 | po,2019-09-02,24.966666666666665,1.38
96 | po,2019-09-03,24.866666666666664,2.8
97 | po,2019-09-04,24.349999999999998,4.42
98 | po,2019-09-05,25.41666666666666,1.0799999999999998
99 | po,2019-09-06,26.549999999999997,0.25
100 | po,2019-09-07,25.183333333333334,0.20000000000000004
101 | po,2019-09-08,24.5,0.13333333333333333
102 | po,2019-09-09,26.600000000000005,1.725
103 | po,2019-09-10,26.116666666666664,
104 | po,2019-09-11,27.33333333333333,
105 | po,2019-09-12,25.100000000000005,
106 | po,2019-09-13,24.03333333333333,
107 | po,2019-09-14,25.083333333333332,
108 | po,2019-09-15,25.25,
109 | po,2019-09-16,24.733333333333334,
110 | po,2019-09-17,23.400000000000002,
111 | po,2019-09-18,23.483333333333334,
112 | po,2019-09-19,23.700000000000003,
113 | po,2019-09-20,22.633333333333336,
114 | po,2019-09-21,20.216666666666665,1.075
115 | po,2019-09-22,20.883333333333333,5.1000000000000005
116 | po,2019-09-23,20.816666666666666,0.1
117 | po,2019-09-24,20.666666666666668,
118 | po,2019-09-25,22.11666666666667,
119 | po,2019-09-26,23.899999999999995,0.1
120 | po,2019-09-27,25.2,
121 | po,2019-09-28,24.96666666666667,
122 | po,2019-09-29,25.0,
123 | po,2019-09-30,24.2,
124 | po,2019-10-01,24.683333333333334,0.02
125 | po,2019-10-02,24.766666666666666,9.816666666666668
126 | po,2019-10-03,24.03333333333333,
127 | po,2019-10-04,23.0,
128 | po,2019-10-05,22.05,
129 | po,2019-10-06,21.7,0.06666666666666667
130 | po,2019-10-07,22.416666666666668,0.0
131 | po,2019-10-08,20.883333333333336,
132 | po,2019-10-09,18.5,
133 | po,2019-10-10,17.900000000000002,
134 | po,2019-10-11,20.966666666666665,
135 | po,2019-10-12,21.7,
136 | po,2019-10-13,19.083333333333336,
137 | po,2019-10-14,18.883333333333333,
138 | po,2019-10-15,19.183333333333334,
139 | po,2019-10-16,18.299999999999997,
140 | seongsan,2019-09-01,23.53333333333333,
141 | seongsan,2019-09-02,24.566666666666666,2.0
142 | seongsan,2019-09-03,24.533333333333335,4.6
143 | seongsan,2019-09-04,25.400000000000002,3.0
144 | seongsan,2019-09-05,25.766666666666666,0.5666666666666667
145 | seongsan,2019-09-06,27.183333333333334,
146 | seongsan,2019-09-07,26.53333333333333,0.05
147 | seongsan,2019-09-08,24.083333333333332,0.1
148 | seongsan,2019-09-09,27.149999999999995,0.13333333333333333
149 | seongsan,2019-09-10,26.566666666666663,
150 | seongsan,2019-09-11,27.3,
151 | seongsan,2019-09-12,24.900000000000002,
152 | seongsan,2019-09-13,23.933333333333337,
153 | seongsan,2019-09-14,23.583333333333332,
154 | seongsan,2019-09-15,25.150000000000002,
155 | seongsan,2019-09-16,24.583333333333332,
156 | seongsan,2019-09-17,23.583333333333332,
157 | seongsan,2019-09-18,23.166666666666668,
158 | seongsan,2019-09-19,23.616666666666664,
159 | seongsan,2019-09-20,23.350000000000005,
160 | seongsan,2019-09-21,21.033333333333335,0.9166666666666666
161 | seongsan,2019-09-22,21.266666666666666,5.349999999999999
162 | seongsan,2019-09-23,19.833333333333332,0.0
163 | seongsan,2019-09-24,19.900000000000002,
164 | seongsan,2019-09-25,21.400000000000002,
165 | seongsan,2019-09-26,24.266666666666666,0.0
166 | seongsan,2019-09-27,24.200000000000003,0.25
167 | seongsan,2019-09-28,25.5,
168 | seongsan,2019-09-29,24.5,
169 | seongsan,2019-09-30,24.033333333333335,
170 | seongsan,2019-10-01,25.166666666666668,
171 | seongsan,2019-10-02,25.316666666666663,11.680000000000001
172 | seongsan,2019-10-03,23.549999999999997,
173 | seongsan,2019-10-04,23.53333333333333,
174 | seongsan,2019-10-05,21.733333333333334,
175 | seongsan,2019-10-06,21.333333333333332,0.05
176 | seongsan,2019-10-07,23.516666666666666,
177 | seongsan,2019-10-08,18.96666666666667,
178 | seongsan,2019-10-09,17.016666666666666,
179 | seongsan,2019-10-10,17.983333333333334,
180 | seongsan,2019-10-11,20.51666666666667,
181 | seongsan,2019-10-12,21.3,
182 | seongsan,2019-10-13,19.3,
183 | seongsan,2019-10-14,18.583333333333332,
184 | seongsan,2019-10-15,16.21666666666667,2.9
185 | seongsan,2019-10-16,18.88333333333333,
186 |
--------------------------------------------------------------------------------
/ch04/problem.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "# 4. Store Credit Card Sales Prediction\n",
8 |     "## 4.1. Problem Definition\n",
9 |     "### 4.1.4. Introducing the Approach to the Problem"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import os\n",
20 | "import warnings\n",
21 | "\n",
22 | "warnings.filterwarnings(\"ignore\")\n",
23 | "\n",
24 | "os.chdir('C:/dacon/ch04')\n",
25 | "train = pd.read_csv('./funda_train.csv')\n",
26 | "submission = pd.read_csv('./submission.csv')"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/plain": [
37 | "(6556613, 9)"
38 | ]
39 | },
40 | "execution_count": 2,
41 | "metadata": {},
42 | "output_type": "execute_result"
43 | }
44 | ],
45 | "source": [
46 | "train.shape"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "data": {
151 | "text/plain": [
152 | " store_id card_id card_company transacted_date transacted_time \\\n",
153 | "0 0 0 b 2016-06-01 13:13 \n",
154 | "1 0 1 h 2016-06-01 18:12 \n",
155 | "2 0 2 c 2016-06-01 18:52 \n",
156 | "3 0 3 a 2016-06-01 20:22 \n",
157 | "4 0 4 c 2016-06-02 11:06 \n",
158 | "\n",
159 | " installment_term region type_of_business amount \n",
160 | "0 0 NaN 기타 미용업 1857.142857 \n",
161 | "1 0 NaN 기타 미용업 857.142857 \n",
162 | "2 0 NaN 기타 미용업 2000.000000 \n",
163 | "3 0 NaN 기타 미용업 7857.142857 \n",
164 | "4 0 NaN 기타 미용업 2000.000000 "
165 | ]
166 | },
167 | "execution_count": 3,
168 | "metadata": {},
169 | "output_type": "execute_result"
170 | }
171 | ],
172 | "source": [
173 | "train.head()"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 4,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 |     "<class 'pandas.core.frame.DataFrame'>\n",
186 | "RangeIndex: 6556613 entries, 0 to 6556612\n",
187 | "Data columns (total 9 columns):\n",
188 | "store_id int64\n",
189 | "card_id int64\n",
190 | "card_company object\n",
191 | "transacted_date object\n",
192 | "transacted_time object\n",
193 | "installment_term int64\n",
194 | "region object\n",
195 | "type_of_business object\n",
196 | "amount float64\n",
197 | "dtypes: float64(1), int64(3), object(5)\n",
198 | "memory usage: 450.2+ MB\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "train.info()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 5,
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "data": {
308 | "text/plain": [
309 | " store_id card_id card_company transacted_date transacted_time \\\n",
310 | "0 0 0 b 2016-06-01 13:13 \n",
311 | "1 0 1 h 2016-06-01 18:12 \n",
312 | "2 0 2 c 2016-06-01 18:52 \n",
313 | "3 0 3 a 2016-06-01 20:22 \n",
314 | "4 0 4 c 2016-06-02 11:06 \n",
315 | "\n",
316 | " installment_term region type_of_business amount \n",
317 | "0 0 NaN 기타 미용업 1857.142857 \n",
318 | "1 0 NaN 기타 미용업 857.142857 \n",
319 | "2 0 NaN 기타 미용업 2000.000000 \n",
320 | "3 0 NaN 기타 미용업 7857.142857 \n",
321 | "4 0 NaN 기타 미용업 2000.000000 "
322 | ]
323 | },
324 | "execution_count": 5,
325 | "metadata": {},
326 | "output_type": "execute_result"
327 | }
328 | ],
329 | "source": [
330 | "train[:5]"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 6,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
508 | "text/plain": [
509 | " store_id card_id card_company transacted_date transacted_time \\\n",
510 | "41 0 40 a 2016-06-10 17:26 \n",
511 | "347 0 285 a 2016-08-04 17:52 \n",
512 | "731 0 473 g 2016-10-17 10:32 \n",
513 | "831 0 230 b 2016-11-03 15:36 \n",
514 | "944 0 138 a 2016-11-28 13:21 \n",
515 | "... ... ... ... ... ... \n",
516 | "6556242 2136 4663626 b 2019-02-01 21:19 \n",
517 | "6556448 2136 4663760 d 2019-02-15 00:46 \n",
518 | "6556485 2136 4663779 b 2019-02-18 02:45 \n",
519 | "6556489 2136 4663780 d 2019-02-18 21:43 \n",
520 | "6556608 2136 4663855 d 2019-02-28 23:20 \n",
521 | "\n",
522 | " installment_term region type_of_business amount \n",
523 | "41 2 NaN 기타 미용업 -8571.428571 \n",
524 | "347 0 NaN 기타 미용업 -1857.142857 \n",
525 | "731 0 NaN 기타 미용업 -2000.000000 \n",
526 | "831 0 NaN 기타 미용업 -85.714286 \n",
527 | "944 0 NaN 기타 미용업 -57.142857 \n",
528 | "... ... ... ... ... \n",
529 | "6556242 0 제주 제주시 기타 주점업 -13428.571429 \n",
530 | "6556448 0 제주 제주시 기타 주점업 -6928.571429 \n",
531 | "6556485 0 제주 제주시 기타 주점업 -5571.428571 \n",
532 | "6556489 0 제주 제주시 기타 주점업 -8571.428571 \n",
533 | "6556608 0 제주 제주시 기타 주점업 -4500.000000 \n",
534 | "\n",
535 | "[73100 rows x 9 columns]"
536 | ]
537 | },
538 | "execution_count": 6,
539 | "metadata": {},
540 | "output_type": "execute_result"
541 | }
542 | ],
543 | "source": [
544 | "train[train['amount']<0]"
545 | ]
546 | }
547 | ],
548 | "metadata": {
549 | "kernelspec": {
550 | "display_name": "[store_amount]",
551 | "language": "python",
552 | "name": "store_amount"
553 | },
554 | "language_info": {
555 | "codemirror_mode": {
556 | "name": "ipython",
557 | "version": 3
558 | },
559 | "file_extension": ".py",
560 | "mimetype": "text/x-python",
561 | "name": "python",
562 | "nbconvert_exporter": "python",
563 | "pygments_lexer": "ipython3",
564 | "version": "3.7.9"
565 | }
566 | },
567 | "nbformat": 4,
568 | "nbformat_minor": 4
569 | }
570 |
--------------------------------------------------------------------------------
/ch02/src/model.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | import torch.nn as nn
4 | """
5 | code for mlp with skip connection model.
6 | scalable model for ensemble.
7 | """
8 |
9 | ##############################
10 | ###### Activation ############
11 | ##############################
12 |
13 | class GELU(nn.Module):
14 | """
15 | Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU
16 | """
17 | def forward(self, x):
18 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
19 |
20 | class LayerNorm(nn.Module):
21 | def __init__(self, hidden_size, eps=1e-5):
22 | """Construct a layernorm module in the TF style (epsilon inside the square root).
23 | """
24 | super(LayerNorm, self).__init__()
25 | self.weight = nn.Parameter(torch.ones(hidden_size))
26 | self.bias = nn.Parameter(torch.zeros(hidden_size))
27 | self.variance_epsilon = eps
28 |
29 | self.init_weights()
30 |
31 | def init_weights(self):
32 | self.weight.data.fill_(1.0)
33 | self.bias.data.zero_()
34 |
35 | def forward(self, x):
36 | u = x.mean(-1, keepdim=True)
37 | s = (x - u).pow(2).mean(-1, keepdim=True)
38 | x = (x - u) / torch.sqrt(s + self.variance_epsilon)
39 | return self.weight * x + self.bias
40 |
41 | ##################################
42 | ######## Free format #############
43 | ##################################
44 | """
45 | - Baseline :
46 | - MLP(ANN) 구성
47 | - Skip Connection idea 적용 => 이전 정보를 효율적으로 활용
48 | - LayerNorm : 블록마다 feature Normalization 사용하여 수렴을 촉진
49 | - GELU 활성화 함수 적용 (미분 가능 및 음수 값에 대한 계산 확대)
50 | """
51 |
52 | # This SkipConnectionModel class is the skip-connection MLP described above
53 | # with layer normalization added between the blocks.
54 | class SkipConnectionModel(nn.Module):
55 | """
56 | >> model = Model(f_in, f_out, 300, 2000, 4000, 7000, 10000)
57 | 300, 2000, 4000, 7000, 10000 : channels
58 | """
59 | def __init__(self, fn_in=226, fn_out=4, *args):
60 | super(SkipConnectionModel, self).__init__()
61 | self.ln = LayerNorm(10000) #10000
62 | self.ln1 = LayerNorm(7000) # 7000
63 | self.ln2 = LayerNorm(4000) # 4000
64 | self.ln3 = LayerNorm(2000) # 2000
65 |
66 | self.upblock1 = nn.Sequential(nn.Linear(fn_in, 2000),GELU(),nn.BatchNorm1d(2000))
67 | self.upblock2 = nn.Sequential(nn.Linear(2000,4000),GELU(),nn.BatchNorm1d(4000))
68 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
69 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
70 |
71 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
72 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
73 | self.downblock3 = nn.Sequential(nn.Linear(4000, 2000),GELU(),nn.BatchNorm1d(2000))
74 | self.downblock4 = nn.Sequential(nn.Linear(2000, 300),GELU(),nn.BatchNorm1d(300))
75 |
76 | self.fclayer = nn.Sequential(nn.Linear(300, fn_out))
77 | self.dropout = nn.Dropout(0.1)
78 |
79 | def forward(self, x):
80 | upblock1_out = self.upblock1(x)
81 | upblock2_out = self.upblock2(upblock1_out)
82 | upblock3_out = self.upblock3(upblock2_out)
83 | upblock4_out = self.upblock4(upblock3_out)
84 |
85 |         # Normalize the upblock outputs before feeding each downblock.
86 | downblock1_out = self.downblock1(self.ln(upblock4_out))
87 | skipblock1 = downblock1_out + upblock3_out
88 | downblock2_out = self.downblock2(self.ln1(skipblock1))
89 | skipblock2 = downblock2_out + upblock2_out
90 | downblock3_out = self.downblock3(self.ln2(skipblock2))
91 | skipblock3 = downblock3_out + upblock1_out
92 | downblock4_out = self.downblock4(self.ln3(skipblock3))
93 |
94 | output = self.fclayer(downblock4_out)
95 |
96 | return output
97 |
98 | ########################################
99 | ########################################
100 |
101 | """
102 | - Test Models for Ensemble
103 | """
104 |
105 | class TestModel(nn.Module):
106 | def __init__(self):
107 | super(TestModel, self).__init__()
108 |
109 |         # self.ln = LayerNorm(13000)
110 | self.ln = LayerNorm(10000)
111 | self.ln1 = LayerNorm(7000)
112 | self.ln2 = LayerNorm(4000)
113 | self.ln3 = LayerNorm(2000)
114 |
115 | self.upblock1 = nn.Sequential(nn.Linear(226, 2000),GELU(),nn.BatchNorm1d(2000))
116 | self.upblock2 = nn.Sequential(nn.Linear(2000,4000),GELU(),nn.BatchNorm1d(4000))
117 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
118 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
119 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
120 |
121 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
122 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
123 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
124 | self.downblock3 = nn.Sequential(nn.Linear(4000, 2000),GELU(),nn.BatchNorm1d(2000))
125 | self.downblock4 = nn.Sequential(nn.Linear(2000, 300),GELU(),nn.BatchNorm1d(300))
126 |
127 | self.fclayer = nn.Sequential(nn.Linear(300,4))
128 | self.dropout = nn.Dropout(0.1)
129 |
130 | def forward(self, x):
131 | upblock1_out = self.upblock1(x)
132 | upblock2_out = self.upblock2(upblock1_out)
133 | upblock3_out = self.upblock3(upblock2_out)
134 | upblock4_out = self.upblock4(upblock3_out)
135 | #upblock5_out = self.upblock5(upblock4_out)
136 |
137 | downblock1_out = self.downblock1(self.ln(upblock4_out))
138 | skipblock1 = downblock1_out + upblock3_out
139 | downblock2_out = self.downblock2(self.ln1(skipblock1))
140 | skipblock2 = downblock2_out + upblock2_out
141 | downblock3_out = self.downblock3(self.ln2(skipblock2))
142 | skipblock3 = downblock3_out + upblock1_out
143 | downblock4_out = self.downblock4(self.ln3(skipblock3))
144 |
145 | output = self.fclayer(downblock4_out)
146 |
147 | return output
148 |
149 |
150 | class TestModel1(nn.Module):
151 | def __init__(self):
152 | super(TestModel1, self).__init__()
153 |
154 | # self.ln = LayerNorm(13000)
155 | self.ln = LayerNorm(10000)
156 | self.ln1 = LayerNorm(7000)
157 | self.ln2 = LayerNorm(4000)
158 | self.ln3 = LayerNorm(1000)
159 |
160 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000))
161 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
162 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
163 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
164 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
165 |
166 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
167 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
168 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
169 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
170 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
171 |
172 | self.fclayer = nn.Sequential(nn.Linear(300,4))
173 | self.dropout = nn.Dropout(0.1)
174 |
175 | def forward(self, x):
176 | upblock1_out = self.upblock1(x)
177 | upblock2_out = self.upblock2(upblock1_out)
178 | upblock3_out = self.upblock3(upblock2_out)
179 | upblock4_out = self.upblock4(upblock3_out)
180 | #upblock5_out = self.upblock5(upblock4_out)
181 |
182 | downblock1_out = self.downblock1(self.ln(upblock4_out))
183 | skipblock1 = downblock1_out + upblock3_out
184 | downblock2_out = self.downblock2(self.ln1(skipblock1))
185 | skipblock2 = downblock2_out + upblock2_out
186 | downblock3_out = self.downblock3(self.ln2(skipblock2))
187 | skipblock3 = downblock3_out + upblock1_out
188 | downblock4_out = self.downblock4(self.ln3(skipblock3))
189 |
190 | output = self.fclayer(downblock4_out)
191 |
192 | return output
193 |
194 | # Model 2
195 |
196 | class TestModel2(nn.Module):
197 | def __init__(self):
198 | super(TestModel2, self).__init__()
199 |
200 | # self.ln = LayerNorm(13000)
201 | self.ln = LayerNorm(20000)
202 | self.ln1 = LayerNorm(13000)
203 | self.ln2 = LayerNorm(7000)
204 | self.ln3 = LayerNorm(4000)
205 | self.ln4 = LayerNorm(1000)
206 | self.ln5 = LayerNorm(13000)
207 |
208 |
209 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
210 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),nn.ReLU(),nn.BatchNorm1d(4000))
211 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000),nn.ReLU(),nn.BatchNorm1d(7000))
212 | self.upblock4 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
213 | self.upblock5 = nn.Sequential(nn.Linear(13000,20000),nn.ReLU(),nn.BatchNorm1d(20000))
214 | self.upblock6 = nn.Sequential(nn.Linear(20000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
215 |
216 | self.downblock1 = nn.Sequential(nn.Linear(13000, 20000),nn.ReLU(),nn.BatchNorm1d(20000))
217 | self.downblock2 = nn.Sequential(nn.Linear(20000, 13000),nn.ReLU(),nn.BatchNorm1d(13000))
218 | self.downblock3 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000))
219 | self.downblock4 = nn.Sequential(nn.Linear(7000, 4000),nn.ReLU(),nn.BatchNorm1d(4000))
220 | self.downblock5 = nn.Sequential(nn.Linear(4000, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
221 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300))
222 |
223 |
224 | self.fclayer = nn.Sequential(nn.Linear(300,4))
225 | self.dropout = nn.Dropout(0.1)
226 |
227 | def forward(self, x):
228 | upblock1_out = self.upblock1(x)
229 | upblock2_out = self.upblock2(upblock1_out)
230 | upblock3_out = self.upblock3(upblock2_out)
231 | upblock4_out = self.upblock4(upblock3_out)
232 | upblock5_out = self.upblock5(upblock4_out)
233 | upblock6_out = self.upblock6(upblock5_out)
234 |
235 |
236 | downblock1_out = self.downblock1(self.ln1(upblock6_out))
237 | skipblock1 = downblock1_out + upblock5_out # 20000
238 | downblock2_out = self.downblock2(self.ln(skipblock1))
239 | skipblock2 = downblock2_out + upblock4_out # 13000
240 | downblock3_out = self.downblock3(self.ln5(skipblock2))
241 | skipblock3 = downblock3_out + upblock3_out # 7000
242 | downblock4_out = self.downblock4(self.ln2(skipblock3))
243 | skipblock4 = downblock4_out + upblock2_out # 4000
244 |
245 | downblock5_out = self.downblock5(self.ln3(skipblock4))
246 | skipblock5 = downblock5_out + upblock1_out
247 | downblock6_out = self.downblock6(self.ln4(skipblock5))
248 |
249 | output = self.fclayer(downblock6_out)
250 |
251 | return output
252 |
253 | # Model3
254 | class TestModel3(nn.Module):
255 | """
256 | Model for (20,40)
257 | """
258 | def __init__(self):
259 | super(TestModel3, self).__init__()
260 |
261 | self.ln = LayerNorm(17000)
262 | self.ln1 = LayerNorm(13000)
263 | self.ln2 = LayerNorm(7000)
264 | self.ln3 = LayerNorm(5000)
265 |
266 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
267 | self.upblock2 = nn.Sequential(nn.Linear(1000,3000),nn.ReLU(),nn.BatchNorm1d(3000))
268 | self.upblock3 = nn.Sequential(nn.Linear(3000,5000),nn.ReLU(),nn.BatchNorm1d(5000))
269 | self.upblock4 = nn.Sequential(nn.Linear(5000,7000),nn.ReLU(),nn.BatchNorm1d(7000))
270 | self.upblock5 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
271 | self.upblock6 = nn.Sequential(nn.Linear(13000,17000),nn.ReLU(),nn.BatchNorm1d(17000))
272 |
273 | self.downblock1 = nn.Sequential(nn.Linear(17000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
274 | self.downblock2 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000))
275 | self.downblock3 = nn.Sequential(nn.Linear(7000, 5000),nn.ReLU(),nn.BatchNorm1d(5000))
276 | self.downblock4 = nn.Sequential(nn.Linear(5000, 3000),nn.ReLU(),nn.BatchNorm1d(3000))
277 | self.downblock5 = nn.Sequential(nn.Linear(3000, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
278 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300))
279 |
280 | self.fclayer = nn.Sequential(nn.Linear(300,4))
281 | self.dropout = nn.Dropout(0.1)
282 |
283 | def forward(self, x):
284 | upblock1_out = self.upblock1(x)
285 | upblock2_out = self.upblock2(upblock1_out)
286 | upblock3_out = self.upblock3(upblock2_out)
287 | upblock4_out = self.upblock4(upblock3_out)
288 | upblock5_out = self.upblock5(upblock4_out)
289 | upblock6_out = self.upblock6(upblock5_out)
290 |
291 | downblock1_out = self.dropout(self.downblock1(self.ln(upblock6_out)))
292 | skipblock1 = downblock1_out + upblock5_out
293 | downblock2_out = self.downblock2(self.ln1(skipblock1))
294 | skipblock2 = downblock2_out + upblock4_out
295 | downblock3_out = self.dropout(self.downblock3(self.ln2(skipblock2)))
296 | skipblock3 = downblock3_out + upblock3_out
297 | downblock4_out = self.downblock4(self.ln3(skipblock3))
298 | skipblock4 = downblock4_out + upblock2_out
299 | downblock5_out = self.downblock5(skipblock4)
300 | skipblock5 = self.dropout(downblock5_out + upblock1_out)
301 | downblock6_out = self.downblock6(skipblock5)
302 |
303 | output = self.fclayer(downblock6_out)
304 |
305 | return output
306 |
307 | class TestModel4(nn.Module):
308 | def __init__(self):
309 | super(TestModel4, self).__init__()
310 |
311 | self.ln = LayerNorm(10000)
312 | self.ln1 = LayerNorm(7000)
313 | self.ln2 = LayerNorm(4000)
314 | self.ln3 = LayerNorm(1000)
315 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
316 | self.upblock2 = nn.Sequential(nn.Linear(1000,10000),nn.ReLU(),nn.BatchNorm1d(10000))
317 | self.upblock3 = nn.Sequential(nn.Linear(10000,7000), nn.ReLU(),nn.BatchNorm1d(7000))
318 | self.upblock4 = nn.Sequential(nn.Linear(7000,4000),nn.ReLU(),nn.BatchNorm1d(4000))
319 |
320 | self.downblock1 = nn.Sequential(nn.Linear(4000, 7000),nn.ReLU(),nn.BatchNorm1d(7000))
321 | self.downblock2 = nn.Sequential(nn.Linear(7000, 10000),nn.ReLU(),nn.BatchNorm1d(10000))
322 | self.downblock3 = nn.Sequential(nn.Linear(10000, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
323 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300))
324 |
325 | self.fclayer = nn.Sequential(nn.Linear(300,4))
326 | self.dropout = nn.Dropout(0.1)
327 |
328 | def forward(self, x):
329 | upblock1_out = self.upblock1(x)
330 | upblock2_out = self.dropout(self.upblock2(upblock1_out))
331 | upblock3_out = self.dropout(self.upblock3(upblock2_out))
332 | upblock4_out = self.dropout(self.upblock4(upblock3_out))
333 |
334 | downblock1_out = self.downblock1(self.ln2(upblock4_out))
335 | skipblock1 = downblock1_out + upblock3_out # 7000
336 | downblock2_out = self.downblock2(self.ln1(skipblock1))
337 | skipblock2 = downblock2_out + upblock2_out # 10000
338 | downblock3_out = self.downblock3(self.ln(skipblock2))
339 | skipblock3 = downblock3_out + upblock1_out
340 | downblock4_out = self.downblock4(self.ln3(skipblock3))
341 |
342 | output = self.fclayer(downblock4_out)
343 |
344 | return output
345 |
346 | class TestModel5(nn.Module):
347 | def __init__(self):
348 | super(TestModel5, self).__init__()
349 |
350 | self.ln = LayerNorm(13000)
351 | self.ln1 = LayerNorm(11000)
352 | self.ln2 = LayerNorm(7000)
353 | self.ln3 = LayerNorm(4000)
354 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000))
355 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
356 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
357 | self.upblock4 = nn.Sequential(nn.Linear(7000,11000),GELU(),nn.BatchNorm1d(11000))
358 | self.upblock5 = nn.Sequential(nn.Linear(11000,13000),GELU(),nn.BatchNorm1d(13000))
359 |
360 | self.downblock1 = nn.Sequential(nn.Linear(13000, 11000),GELU(),nn.BatchNorm1d(11000))
361 | self.downblock2 = nn.Sequential(nn.Linear(11000, 7000),GELU(),nn.BatchNorm1d(7000))
362 | self.downblock3 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
363 | self.downblock4 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
364 | self.downblock5 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
365 |
366 | self.fclayer = nn.Sequential(nn.Linear(300,4))
367 | self.dropout = nn.Dropout(0.1)
368 |
369 | def forward(self, x):
370 | upblock1_out = self.upblock1(x)
371 | upblock2_out = self.upblock2(upblock1_out)
372 | upblock3_out = self.upblock3(upblock2_out)
373 | upblock4_out = self.upblock4(upblock3_out)
374 | upblock5_out = self.upblock5(upblock4_out)
375 |
376 | downblock1_out = self.downblock1(self.ln(upblock5_out))
377 | skipblock1 = downblock1_out + upblock4_out
378 | downblock2_out = self.downblock2(self.ln1(skipblock1))
379 | skipblock2 = downblock2_out + upblock3_out
380 | downblock3_out = self.downblock3(self.ln2(skipblock2))
381 | skipblock3 = downblock3_out + upblock2_out
382 | downblock4_out = self.dropout(self.downblock4(self.ln3(skipblock3)))
383 | skipblock4 = downblock4_out + upblock1_out
384 | downblock5_out = self.downblock5(skipblock4)
385 |
386 | output = self.fclayer(downblock5_out)
387 |
388 | return output
389 |
390 |
391 | class TestModel6(nn.Module):
392 | def __init__(self):
393 | super(TestModel6, self).__init__()
394 |
395 | # self.ln = LayerNorm(13000)
396 | self.ln = LayerNorm(10000)
397 | self.ln1 = LayerNorm(7000)
398 | self.ln2 = LayerNorm(4000)
399 |
400 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000))
401 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
402 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
403 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
404 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
405 |
406 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
407 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
408 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
409 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
410 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
411 |
412 | self.fclayer = nn.Sequential(nn.Linear(300,4))
413 | self.dropout = nn.Dropout(0.1)
414 |
415 | def forward(self, x):
416 | upblock1_out = self.upblock1(x)
417 | upblock2_out = self.upblock2(upblock1_out)
418 | upblock3_out = self.upblock3(upblock2_out)
419 | upblock4_out = self.upblock4(upblock3_out)
420 | #upblock5_out = self.upblock5(upblock4_out)
421 |
422 | downblock1_out = self.downblock1(self.ln(upblock4_out))
423 | skipblock1 = downblock1_out + upblock3_out
424 | downblock2_out = self.downblock2(self.ln1(skipblock1))
425 | skipblock2 = downblock2_out + upblock2_out
426 | downblock3_out = self.downblock3(self.ln2(skipblock2))
427 | skipblock3 = downblock3_out + upblock1_out
428 | downblock4_out = self.downblock4(skipblock3)
429 |
430 | output = self.fclayer(downblock4_out)
431 |
432 | return output
433 |
--------------------------------------------------------------------------------
/ch02/src/get_score.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # ## Score-restoration code
5 | #
6 | # - Baseline: MLP (ANN) model
7 | # - Skip-connection idea applied => reuses earlier information efficiently
8 | # - LayerNorm: feature normalization in every block to speed up convergence
9 | # - GELU activation applied (smoothly differentiable, with better handling of negative inputs)
10 | #
11 | # - Three representative configurations built from these combinations, with six derived variants
12 |
13 | import os
14 | import math
15 | import time
16 | from itertools import chain
17 |
18 | import numpy as np
19 | import pandas as pd
20 | import torch
21 | import torch.nn as nn
22 |
23 | from torch.utils.data import Dataset, DataLoader
24 | from tqdm.auto import tqdm
25 |
26 |
27 | # ## Model define
28 | # > TestModel ~ TestModel6
29 | #
30 | # weight list:
31 | # 'test_(9, 49)_0.001_150.pth'-(819), 'test_(20, 40)_0.001_80.pth'-(2.92), 'test_(20, 57)_0.001_100.pth'-(819),
32 | # 'test_(12, 20)_0.001_100.pth'-(853), 'test_(10, 7)_0.001_100.pth'-(867), 'test_(0, 54)_0.001_70.pth'-(819),
33 | # 'test_(16, 47)_0.0001_80.pth'-(2.02), 'test_(11, 43)_0.0001_70.pth'-(2.02G), 'test_(0, 9)_0.0001_70.pth'-(819),
34 | # 'test_(19, 4)_0.001_200.pth'-(819)
35 | #
36 | #
37 | # >
38 | #
39 | # TestModel : (12,20)
40 | # TestModel1 : (0, 54)
41 | # TestModel2 : (2.40)
42 | # TestModel4 : (10,7)
43 | # TestModel5 : (16,47), (11,43), (0,9)
44 | # TestModel6 : (19, 4), (20, 57), (9, 49)
45 | #
46 |
47 | # In[2]:
48 |
49 |
50 | # The individual models
51 |
52 | class GELU(nn.Module):
53 | """
54 | Tanh approximation of GELU(x) = x * Phi(x); the BERT paper (Section 3.4, last paragraph) notes that BERT uses GELU instead of ReLU.
55 | """
56 | def forward(self, x):
57 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
58 |
59 | class LayerNorm(nn.Module):
60 | def __init__(self, hidden_size, eps=1e-5):
61 | """Construct a layernorm module in the TF style (epsilon inside the square root).
62 | """
63 | super(LayerNorm, self).__init__()
64 | self.weight = nn.Parameter(torch.ones(hidden_size))
65 | self.bias = nn.Parameter(torch.zeros(hidden_size))
66 | self.variance_epsilon = eps
67 |
68 | self.init_weights()
69 |
70 | def init_weights(self):
71 | self.weight.data.fill_(1.0)
72 | self.bias.data.zero_()
73 |
74 | def forward(self, x):
75 | u = x.mean(-1, keepdim=True)
76 | s = (x - u).pow(2).mean(-1, keepdim=True)
77 | x = (x - u) / torch.sqrt(s + self.variance_epsilon)
78 | return self.weight * x + self.bias
79 |
80 | # Model 1
81 | class TestModel(nn.Module):
82 | def __init__(self):
83 | super(TestModel, self).__init__()
84 |
85 | # self.ln = LayerNorm(13000)
86 | self.ln = LayerNorm(10000)
87 | self.ln1 = LayerNorm(7000)
88 | self.ln2 = LayerNorm(4000)
89 | self.ln3 = LayerNorm(2000)
90 |
91 | self.upblock1 = nn.Sequential(nn.Linear(226, 2000),GELU(),nn.BatchNorm1d(2000))
92 | self.upblock2 = nn.Sequential(nn.Linear(2000,4000),GELU(),nn.BatchNorm1d(4000))
93 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
94 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
95 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
96 |
97 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
98 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
99 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
100 | self.downblock3 = nn.Sequential(nn.Linear(4000, 2000),GELU(),nn.BatchNorm1d(2000))
101 | self.downblock4 = nn.Sequential(nn.Linear(2000, 300),GELU(),nn.BatchNorm1d(300))
102 |
103 | self.fclayer = nn.Sequential(nn.Linear(300,4))
104 | self.dropout = nn.Dropout(0.1)
105 |
106 | def forward(self, x):
107 | upblock1_out = self.upblock1(x)
108 | upblock2_out = self.upblock2(upblock1_out)
109 | upblock3_out = self.upblock3(upblock2_out)
110 | upblock4_out = self.upblock4(upblock3_out)
111 | #upblock5_out = self.upblock5(upblock4_out)
112 |
113 | downblock1_out = self.downblock1(self.ln(upblock4_out))
114 | skipblock1 = downblock1_out + upblock3_out
115 | downblock2_out = self.downblock2(self.ln1(skipblock1))
116 | skipblock2 = downblock2_out + upblock2_out
117 | downblock3_out = self.downblock3(self.ln2(skipblock2))
118 | skipblock3 = downblock3_out + upblock1_out
119 | downblock4_out = self.downblock4(self.ln3(skipblock3))
120 |
121 | output = self.fclayer(downblock4_out)
122 |
123 | return output
124 |
125 | class TestModel1(nn.Module):
126 | def __init__(self):
127 | super(TestModel1, self).__init__()
128 |
129 | # self.ln = LayerNorm(13000)
130 | self.ln = LayerNorm(10000)
131 | self.ln1 = LayerNorm(7000)
132 | self.ln2 = LayerNorm(4000)
133 | self.ln3 = LayerNorm(1000)
134 |
135 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000))
136 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
137 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
138 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
139 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
140 |
141 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
142 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
143 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
144 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
145 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
146 |
147 | self.fclayer = nn.Sequential(nn.Linear(300,4))
148 | self.dropout = nn.Dropout(0.1)
149 |
150 | def forward(self, x):
151 | upblock1_out = self.upblock1(x)
152 | upblock2_out = self.upblock2(upblock1_out)
153 | upblock3_out = self.upblock3(upblock2_out)
154 | upblock4_out = self.upblock4(upblock3_out)
155 | #upblock5_out = self.upblock5(upblock4_out)
156 |
157 | downblock1_out = self.downblock1(self.ln(upblock4_out))
158 | skipblock1 = downblock1_out + upblock3_out
159 | downblock2_out = self.downblock2(self.ln1(skipblock1))
160 | skipblock2 = downblock2_out + upblock2_out
161 | downblock3_out = self.downblock3(self.ln2(skipblock2))
162 | skipblock3 = downblock3_out + upblock1_out
163 | downblock4_out = self.downblock4(self.ln3(skipblock3))
164 |
165 | output = self.fclayer(downblock4_out)
166 |
167 | return output
168 |
169 | # Model 2
170 |
171 | class TestModel2(nn.Module):
172 | def __init__(self):
173 | super(TestModel2, self).__init__()
174 |
175 | # self.ln = LayerNorm(13000)
176 | self.ln = LayerNorm(20000)
177 | self.ln1 = LayerNorm(13000)
178 | self.ln2 = LayerNorm(7000)
179 | self.ln3 = LayerNorm(4000)
180 | self.ln4 = LayerNorm(1000)
181 | self.ln5 = LayerNorm(13000)
182 |
183 |
184 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
185 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),nn.ReLU(),nn.BatchNorm1d(4000))
186 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000),nn.ReLU(),nn.BatchNorm1d(7000))
187 | self.upblock4 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
188 | self.upblock5 = nn.Sequential(nn.Linear(13000,20000),nn.ReLU(),nn.BatchNorm1d(20000))
189 | self.upblock6 = nn.Sequential(nn.Linear(20000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
190 |
191 | self.downblock1 = nn.Sequential(nn.Linear(13000, 20000),nn.ReLU(),nn.BatchNorm1d(20000))
192 | self.downblock2 = nn.Sequential(nn.Linear(20000, 13000),nn.ReLU(),nn.BatchNorm1d(13000))
193 | self.downblock3 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000))
194 | self.downblock4 = nn.Sequential(nn.Linear(7000, 4000),nn.ReLU(),nn.BatchNorm1d(4000))
195 | self.downblock5 = nn.Sequential(nn.Linear(4000, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
196 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300))
197 |
198 |
199 | self.fclayer = nn.Sequential(nn.Linear(300,4))
200 | self.dropout = nn.Dropout(0.1)
201 |
202 | def forward(self, x):
203 | upblock1_out = self.upblock1(x)
204 | upblock2_out = self.upblock2(upblock1_out)
205 | upblock3_out = self.upblock3(upblock2_out)
206 | upblock4_out = self.upblock4(upblock3_out)
207 | upblock5_out = self.upblock5(upblock4_out)
208 | upblock6_out = self.upblock6(upblock5_out)
209 |
210 |
211 | downblock1_out = self.downblock1(self.ln1(upblock6_out))
212 | skipblock1 = downblock1_out + upblock5_out # 20000
213 | downblock2_out = self.downblock2(self.ln(skipblock1))
214 | skipblock2 = downblock2_out + upblock4_out # 13000
215 | downblock3_out = self.downblock3(self.ln5(skipblock2))
216 | skipblock3 = downblock3_out + upblock3_out # 7000
217 | downblock4_out = self.downblock4(self.ln2(skipblock3))
218 | skipblock4 = downblock4_out + upblock2_out # 4000
219 |
220 | downblock5_out = self.downblock5(self.ln3(skipblock4))
221 | skipblock5 = downblock5_out + upblock1_out
222 | downblock6_out = self.downblock6(self.ln4(skipblock5))
223 |
224 | output = self.fclayer(downblock6_out)
225 |
226 | return output
227 |
228 | # Model3
229 | class TestModel3(nn.Module):
230 | """
231 | Model for (20,40)
232 | """
233 | def __init__(self):
234 | super(TestModel3, self).__init__()
235 |
236 | self.ln = LayerNorm(17000)
237 | self.ln1 = LayerNorm(13000)
238 | self.ln2 = LayerNorm(7000)
239 | self.ln3 = LayerNorm(5000)
240 |
241 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
242 | self.upblock2 = nn.Sequential(nn.Linear(1000,3000),nn.ReLU(),nn.BatchNorm1d(3000))
243 | self.upblock3 = nn.Sequential(nn.Linear(3000,5000),nn.ReLU(),nn.BatchNorm1d(5000))
244 | self.upblock4 = nn.Sequential(nn.Linear(5000,7000),nn.ReLU(),nn.BatchNorm1d(7000))
245 | self.upblock5 = nn.Sequential(nn.Linear(7000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
246 | self.upblock6 = nn.Sequential(nn.Linear(13000,17000),nn.ReLU(),nn.BatchNorm1d(17000))
247 |
248 | self.downblock1 = nn.Sequential(nn.Linear(17000,13000),nn.ReLU(),nn.BatchNorm1d(13000))
249 | self.downblock2 = nn.Sequential(nn.Linear(13000, 7000),nn.ReLU(),nn.BatchNorm1d(7000))
250 | self.downblock3 = nn.Sequential(nn.Linear(7000, 5000),nn.ReLU(),nn.BatchNorm1d(5000))
251 | self.downblock4 = nn.Sequential(nn.Linear(5000, 3000),nn.ReLU(),nn.BatchNorm1d(3000))
252 | self.downblock5 = nn.Sequential(nn.Linear(3000, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
253 | self.downblock6 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300))
254 |
255 | self.fclayer = nn.Sequential(nn.Linear(300,4))
256 | self.dropout = nn.Dropout(0.1)
257 |
258 | def forward(self, x):
259 | upblock1_out = self.upblock1(x)
260 | upblock2_out = self.upblock2(upblock1_out)
261 | upblock3_out = self.upblock3(upblock2_out)
262 | upblock4_out = self.upblock4(upblock3_out)
263 | upblock5_out = self.upblock5(upblock4_out)
264 | upblock6_out = self.upblock6(upblock5_out)
265 |
266 | downblock1_out = self.dropout(self.downblock1(self.ln(upblock6_out)))
267 | skipblock1 = downblock1_out + upblock5_out
268 | downblock2_out = self.downblock2(self.ln1(skipblock1))
269 | skipblock2 = downblock2_out + upblock4_out
270 | downblock3_out = self.dropout(self.downblock3(self.ln2(skipblock2)))
271 | skipblock3 = downblock3_out + upblock3_out
272 | downblock4_out = self.downblock4(self.ln3(skipblock3))
273 | skipblock4 = downblock4_out + upblock2_out
274 | downblock5_out = self.downblock5(skipblock4)
275 | skipblock5 = self.dropout(downblock5_out + upblock1_out)
276 | downblock6_out = self.downblock6(skipblock5)
277 |
278 | output = self.fclayer(downblock6_out)
279 |
280 | return output
281 |
282 | class TestModel4(nn.Module):
283 | def __init__(self):
284 | super(TestModel4, self).__init__()
285 |
286 | self.ln = LayerNorm(10000)
287 | self.ln1 = LayerNorm(7000)
288 | self.ln2 = LayerNorm(4000)
289 | self.ln3 = LayerNorm(1000)
290 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
291 | self.upblock2 = nn.Sequential(nn.Linear(1000,10000),nn.ReLU(),nn.BatchNorm1d(10000))
292 | self.upblock3 = nn.Sequential(nn.Linear(10000,7000), nn.ReLU(),nn.BatchNorm1d(7000))
293 | self.upblock4 = nn.Sequential(nn.Linear(7000,4000),nn.ReLU(),nn.BatchNorm1d(4000))
294 |
295 | self.downblock1 = nn.Sequential(nn.Linear(4000, 7000),nn.ReLU(),nn.BatchNorm1d(7000))
296 | self.downblock2 = nn.Sequential(nn.Linear(7000, 10000),nn.ReLU(),nn.BatchNorm1d(10000))
297 | self.downblock3 = nn.Sequential(nn.Linear(10000, 1000),nn.ReLU(),nn.BatchNorm1d(1000))
298 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),nn.ReLU(),nn.BatchNorm1d(300))
299 |
300 | self.fclayer = nn.Sequential(nn.Linear(300,4))
301 | self.dropout = nn.Dropout(0.1)
302 |
303 | def forward(self, x):
304 | upblock1_out = self.upblock1(x)
305 | upblock2_out = self.dropout(self.upblock2(upblock1_out))
306 | upblock3_out = self.dropout(self.upblock3(upblock2_out))
307 | upblock4_out = self.dropout(self.upblock4(upblock3_out))
308 |
309 | downblock1_out = self.downblock1(self.ln2(upblock4_out))
310 | skipblock1 = downblock1_out + upblock3_out # 7000
311 | downblock2_out = self.downblock2(self.ln1(skipblock1))
312 | skipblock2 = downblock2_out + upblock2_out # 10000
313 | downblock3_out = self.downblock3(self.ln(skipblock2))
314 | skipblock3 = downblock3_out + upblock1_out
315 | downblock4_out = self.downblock4(self.ln3(skipblock3))
316 |
317 | output = self.fclayer(downblock4_out)
318 |
319 | return output
320 |
321 | class TestModel5(nn.Module):
322 | def __init__(self):
323 | super(TestModel5, self).__init__()
324 |
325 | self.ln = LayerNorm(13000)
326 | self.ln1 = LayerNorm(11000)
327 | self.ln2 = LayerNorm(7000)
328 | self.ln3 = LayerNorm(4000)
329 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000))
330 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
331 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
332 | self.upblock4 = nn.Sequential(nn.Linear(7000,11000),GELU(),nn.BatchNorm1d(11000))
333 | self.upblock5 = nn.Sequential(nn.Linear(11000,13000),GELU(),nn.BatchNorm1d(13000))
334 |
335 | self.downblock1 = nn.Sequential(nn.Linear(13000, 11000),GELU(),nn.BatchNorm1d(11000))
336 | self.downblock2 = nn.Sequential(nn.Linear(11000, 7000),GELU(),nn.BatchNorm1d(7000))
337 | self.downblock3 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
338 | self.downblock4 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
339 | self.downblock5 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
340 |
341 | self.fclayer = nn.Sequential(nn.Linear(300,4))
342 | self.dropout = nn.Dropout(0.1)
343 |
344 | def forward(self, x):
345 | upblock1_out = self.upblock1(x)
346 | upblock2_out = self.upblock2(upblock1_out)
347 | upblock3_out = self.upblock3(upblock2_out)
348 | upblock4_out = self.upblock4(upblock3_out)
349 | upblock5_out = self.upblock5(upblock4_out)
350 |
351 | downblock1_out = self.downblock1(self.ln(upblock5_out))
352 | skipblock1 = downblock1_out + upblock4_out
353 | downblock2_out = self.downblock2(self.ln1(skipblock1))
354 | skipblock2 = downblock2_out + upblock3_out
355 | downblock3_out = self.downblock3(self.ln2(skipblock2))
356 | skipblock3 = downblock3_out + upblock2_out
357 | downblock4_out = self.dropout(self.downblock4(self.ln3(skipblock3)))
358 | skipblock4 = downblock4_out + upblock1_out
359 | downblock5_out = self.downblock5(skipblock4)
360 |
361 | output = self.fclayer(downblock5_out)
362 |
363 | return output
364 |
365 |
366 | class TestModel6(nn.Module):
367 | def __init__(self):
368 | super(TestModel6, self).__init__()
369 |
370 | # self.ln = LayerNorm(13000)
371 | self.ln = LayerNorm(10000)
372 | self.ln1 = LayerNorm(7000)
373 | self.ln2 = LayerNorm(4000)
374 |
375 | self.upblock1 = nn.Sequential(nn.Linear(226, 1000),GELU(),nn.BatchNorm1d(1000))
376 | self.upblock2 = nn.Sequential(nn.Linear(1000,4000),GELU(),nn.BatchNorm1d(4000))
377 | self.upblock3 = nn.Sequential(nn.Linear(4000,7000), GELU(),nn.BatchNorm1d(7000))
378 | self.upblock4 = nn.Sequential(nn.Linear(7000,10000),GELU(),nn.BatchNorm1d(10000))
379 | #self.upblock5 = nn.Sequential(nn.Linear(10000,13000),GELU(),nn.BatchNorm1d(13000))
380 |
381 | #self.downblock1 = nn.Sequential(nn.Linear(13000, 10000),GELU(),nn.BatchNorm1d(10000))
382 | self.downblock1 = nn.Sequential(nn.Linear(10000, 7000),GELU(),nn.BatchNorm1d(7000))
383 | self.downblock2 = nn.Sequential(nn.Linear(7000, 4000),GELU(),nn.BatchNorm1d(4000))
384 | self.downblock3 = nn.Sequential(nn.Linear(4000, 1000),GELU(),nn.BatchNorm1d(1000))
385 | self.downblock4 = nn.Sequential(nn.Linear(1000, 300),GELU(),nn.BatchNorm1d(300))
386 |
387 | self.fclayer = nn.Sequential(nn.Linear(300,4))
388 | self.dropout = nn.Dropout(0.1)
389 |
390 | def forward(self, x):
391 | upblock1_out = self.upblock1(x)
392 | upblock2_out = self.upblock2(upblock1_out)
393 | upblock3_out = self.upblock3(upblock2_out)
394 | upblock4_out = self.upblock4(upblock3_out)
395 | #upblock5_out = self.upblock5(upblock4_out)
396 |
397 | downblock1_out = self.downblock1(self.ln(upblock4_out))
398 | skipblock1 = downblock1_out + upblock3_out
399 | downblock2_out = self.downblock2(self.ln1(skipblock1))
400 | skipblock2 = downblock2_out + upblock2_out
401 | downblock3_out = self.downblock3(self.ln2(skipblock2))
402 | skipblock3 = downblock3_out + upblock1_out
403 | downblock4_out = self.downblock4(skipblock3)
404 |
405 | output = self.fclayer(downblock4_out)
406 |
407 | return output
408 |
409 |
410 | # ## Load weights and test
411 | #
412 | # - Load the trained .pth files.
413 | # - Read the test file and run each model in evaluation mode.
414 | # - Write the results back out as CSV files.
415 | #
416 |
417 | # In[11]:
418 |
419 |
420 | # Register the models in a dictionary so each one can be looked up by name.
421 | models = {
422 | 'model':TestModel(),
423 | 'model1': TestModel1(),
424 | 'model2': TestModel2(),
425 | 'model3': TestModel3(),
426 | 'model4': TestModel4(),
427 | 'model5': TestModel5(),
428 | 'model6': TestModel6()
429 | }
430 |
431 |
432 | # Path to the test file
433 | path_test = 'test.csv'
434 | # List of the .pth checkpoint files
435 | pth_list = os.listdir('./outputs') # './outputs' is the directory where the .pth files are stored
436 | 
437 | print(pth_list) # 2.pth is scheduled to be renamed to test_(10, 12)_0.0005_200
438 | 
439 | # Create the directory that the CSVs will be saved to in advance.
440 | # ('test' can be renamed to suit your setup.)
441 | if not os.path.exists('test'):
442 | os.mkdir('test')
443 | 
444 |
445 |
446 | # In[10]:
447 |
448 |
449 | os.path.exists('test')
450 |
451 |
452 | # In[4]:
453 |
454 |
455 | # Test
456 | # Define and load the test dataset.
457 | class TestDataset(Dataset):
458 | def __init__(self, path_test):
459 | super(TestDataset, self).__init__()
460 | test = pd.read_csv(path_test)
461 | self.test_X = test.iloc[:,1:]
462 | self.tmp_x = self.test_X.values
463 |
464 | def __len__(self):
465 | return len(self.test_X)
466 |
467 | def __getitem__(self, idx):
468 | return torch.from_numpy(self.tmp_x)[idx]
469 |
470 | test_data = TestDataset(path_test)
471 | test_loader = DataLoader(test_data, batch_size=10000, num_workers=4)
472 |
473 |
474 | # In[5]:
475 |
476 |
477 | # Load the .pth weights, run the model on the test set, and write the result to a CSV file.
478 | def test_model(path_pth, test_loader, model_type:str):
479 | model = models[model_type]
480 | ws = torch.load(f'./outputs/{path_pth}', map_location='cpu') # load the checkpoint weights
481 | model.load_state_dict(ws)
482 | model.eval()
483 |
484 | with torch.no_grad():
485 | for data in test_loader:
486 | outputs = model(data.float()) # run the model on the batch
487 | pred_test = outputs # assumes batch_size=10000 covers the whole test set, so only one batch is produced
488 |
489 | sample_sub = pd.read_csv('sample_submission.csv', index_col=0)
490 | layers = ['layer_1','layer_2','layer_3','layer_4']
491 | submission = sample_sub.values + pred_test.numpy()
492 |
493 | submission = pd.DataFrame(data=submission,columns=layers)
494 | submission.to_csv(f'./test/{path_pth[:-4]}.csv', index_label='id') # save the CSV under the test directory
495 |
496 |
497 | # In[6]:
498 |
499 |
500 | # Write a prediction file for every checkpoint that goes into the ensemble.
501 | for pth in sorted(pth_list):
502 | if pth[-3:] != 'pth':
503 | pass
504 | else:
505 | if int(pth[0]) == 0:
506 | test_model(pth, test_loader, model_type='model')
507 | elif int(pth[0]) == 1:
508 | test_model(pth, test_loader, model_type='model1')
509 | elif int(pth[0]) == 2:
510 | #test_model(pth, test_loader, model_type='model2')
511 | pass
512 | elif int(pth[0]) == 3:
513 | test_model(pth, test_loader, model_type='model4')
514 | elif int(pth[0]) > 3 and int(pth[0]) <7:
515 | test_model(pth, test_loader, model_type='model5')
516 | elif int(pth[0])>= 7:
517 | test_model(pth, test_loader, model_type='model6')
518 |
519 |
520 | # In[7]:
521 |
522 |
523 | def check_state(model):
524 | for val in model.state_dict().keys():
525 | if val[-4:] =='bias':
526 | pass
527 | else:
528 | print(f'{val} : {model.state_dict()[val].shape}')
529 |
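530 | 
531 | # ---- Editor's note: illustrative sketch, not in the original script. ----
532 | # The per-checkpoint CSVs written to ./test above are meant to be ensembled.
533 | # A plain arithmetic-mean combination could look like the helper below; the
534 | # function name and the 'ensemble_mean.csv' output path are hypothetical.
535 | def ensemble_mean(csv_dir='./test', out_path='ensemble_mean.csv'):
536 | frames = [pd.read_csv(os.path.join(csv_dir, f), index_col='id')
537 | for f in os.listdir(csv_dir) if f.endswith('.csv')]
538 | mean_df = sum(frames) / len(frames) # element-wise mean of the predictions
539 | mean_df.to_csv(out_path, index_label='id')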
--------------------------------------------------------------------------------
/ch03/submission.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 3.5. 성능 향상을 위한 방법"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "import numpy as np\n",
18 | "import pandas as pd\n",
19 | "import matplotlib.pyplot as plt\n",
20 | "import seaborn as sns"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "sub_dir = \"C:/dacon/ch03/submission/\"\n",
30 | "os.chdir(sub_dir)"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "### 3.5.1. 앙상블"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "5개의 submission 파일명을 각각 '모델 번호_사용한 모델=임시 스코어.csv' 형식으로 변경한 뒤 다음 과정을 진행합니다.
예를 들어, 'model1_lgbm.csv' 파일의 임시 스코어가 2.29라면 'model1_lgbm=2.29.csv' 형식으로 변경하는 것입니다."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "sub_list = [] # 작업 경로 안에 있는 파일 중 확장자가 .csv인 파일을 담을 리스트 생성\n",
54 | "fname_list = [] # 확장자를 제외한 파일명을 담을 리스트 생성\n",
55 | "\n",
56 | "for filename in os.listdir(): # 작업 경로 안에 있는 모든 파일의 리스트 불러오기\n",
57 | " fname, ext = os.path.splitext(filename) # filename을 파일명과 확장자로 분리\n",
58 | " if ext == '.csv': # 확장자가 .csv인 파일인 경우\n",
59 | " sub_list.append(filename) # filename을 sub_list에 추가\n",
60 | " fname_list.append(fname) # 파일명을 fname_list에 추가"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "#### 3.5.1.1. 결괏값 간 상관계수 확인"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "data": {
77 | "text/html": [
78 | "\n",
79 | "\n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " | \n",
96 | " model1_lgbm=2.29.csv | \n",
97 | " model2_rf=2.34.csv | \n",
98 | " model3_rf=2.38.csv | \n",
99 | " model4_rf=2.36.csv | \n",
100 | " model5_rf=2.31.csv | \n",
101 | "
\n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " | model1_lgbm=2.29.csv | \n",
106 | " 1.000000 | \n",
107 | " 0.977246 | \n",
108 | " 0.972529 | \n",
109 | " 0.973676 | \n",
110 | " 0.981939 | \n",
111 | "
\n",
112 | " \n",
113 | " | model2_rf=2.34.csv | \n",
114 | " 0.977246 | \n",
115 | " 1.000000 | \n",
116 | " 0.995776 | \n",
117 | " 0.992885 | \n",
118 | " 0.990955 | \n",
119 | "
\n",
120 | " \n",
121 | " | model3_rf=2.38.csv | \n",
122 | " 0.972529 | \n",
123 | " 0.995776 | \n",
124 | " 1.000000 | \n",
125 | " 0.995266 | \n",
126 | " 0.986214 | \n",
127 | "
\n",
128 | " \n",
129 | " | model4_rf=2.36.csv | \n",
130 | " 0.973676 | \n",
131 | " 0.992885 | \n",
132 | " 0.995266 | \n",
133 | " 1.000000 | \n",
134 | " 0.983549 | \n",
135 | "
\n",
136 | " \n",
137 | " | model5_rf=2.31.csv | \n",
138 | " 0.981939 | \n",
139 | " 0.990955 | \n",
140 | " 0.986214 | \n",
141 | " 0.983549 | \n",
142 | " 1.000000 | \n",
143 | "
\n",
144 | " \n",
145 | "
\n",
146 | "
"
147 | ],
148 | "text/plain": [
149 | " model1_lgbm=2.29.csv model2_rf=2.34.csv \\\n",
150 | "model1_lgbm=2.29.csv 1.000000 0.977246 \n",
151 | "model2_rf=2.34.csv 0.977246 1.000000 \n",
152 | "model3_rf=2.38.csv 0.972529 0.995776 \n",
153 | "model4_rf=2.36.csv 0.973676 0.992885 \n",
154 | "model5_rf=2.31.csv 0.981939 0.990955 \n",
155 | "\n",
156 | " model3_rf=2.38.csv model4_rf=2.36.csv \\\n",
157 | "model1_lgbm=2.29.csv 0.972529 0.973676 \n",
158 | "model2_rf=2.34.csv 0.995776 0.992885 \n",
159 | "model3_rf=2.38.csv 1.000000 0.995266 \n",
160 | "model4_rf=2.36.csv 0.995266 1.000000 \n",
161 | "model5_rf=2.31.csv 0.986214 0.983549 \n",
162 | "\n",
163 | " model5_rf=2.31.csv \n",
164 | "model1_lgbm=2.29.csv 0.981939 \n",
165 | "model2_rf=2.34.csv 0.990955 \n",
166 | "model3_rf=2.38.csv 0.986214 \n",
167 | "model4_rf=2.36.csv 0.983549 \n",
168 | "model5_rf=2.31.csv 1.000000 "
169 | ]
170 | },
171 | "execution_count": 4,
172 | "metadata": {},
173 | "output_type": "execute_result"
174 | }
175 | ],
176 | "source": [
177 | "# 상관계수 행렬을 저장할 데이터프레임 생성\n",
178 | "corr_df = pd.DataFrame()\n",
179 | "\n",
180 | "for file in sub_list:\n",
181 | " # 각 submission 파일의 18~20_ride 변수를 sub_df에 저장\n",
182 | " sub_df = pd.read_csv(file, engine = 'python').iloc[:,1:]\n",
183 | " # 변수명을 파일의 이름으로 지정\n",
184 | " sub_df.columns = [str(file)]\n",
185 | " # sub_df를 corr_df에 병합\n",
186 | " corr_df = pd.concat([corr_df, sub_df], axis = 1)\n",
187 | "\n",
188 | "# 상관계수 행렬 출력 \n",
189 | "corr_df.corr()"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 5,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "data": {
199 | "text/html": [
200 | "\n",
201 | "\n",
214 | "
\n",
215 | " \n",
216 | " \n",
217 | " | \n",
218 | " model | \n",
219 | " public_rmse | \n",
220 | " cor | \n",
221 | "
\n",
222 | " \n",
223 | " \n",
224 | " \n",
225 | " | 0 | \n",
226 | " model1_lgbm | \n",
227 | " 2.29 | \n",
228 | " 0.981078 | \n",
229 | "
\n",
230 | " \n",
231 | " | 1 | \n",
232 | " model2_rf | \n",
233 | " 2.34 | \n",
234 | " 0.991372 | \n",
235 | "
\n",
236 | " \n",
237 | " | 2 | \n",
238 | " model3_rf | \n",
239 | " 2.38 | \n",
240 | " 0.989957 | \n",
241 | "
\n",
242 | " \n",
243 | " | 3 | \n",
244 | " model4_rf | \n",
245 | " 2.36 | \n",
246 | " 0.989075 | \n",
247 | "
\n",
248 | " \n",
249 | " | 4 | \n",
250 | " model5_rf | \n",
251 | " 2.31 | \n",
252 | " 0.988531 | \n",
253 | "
\n",
254 | " \n",
255 | "
\n",
256 | "
"
257 | ],
258 | "text/plain": [
259 | " model public_rmse cor\n",
260 | "0 model1_lgbm 2.29 0.981078\n",
261 | "1 model2_rf 2.34 0.991372\n",
262 | "2 model3_rf 2.38 0.989957\n",
263 | "3 model4_rf 2.36 0.989075\n",
264 | "4 model5_rf 2.31 0.988531"
265 | ]
266 | },
267 | "execution_count": 5,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "corr_list = np.array(corr_df.corr().mean(axis = 0)) \n",
274 | "\n",
275 | "model_list = [] # 모델명을 담을 리스트 생성\n",
276 | "public_rmse_list = [] # 각 submission 파일의 임시 스코어를 담을 리스트 생성\n",
277 | "\n",
278 | "for fname in fname_list:\n",
279 | " model = fname.split('=')[0] # '=' 기호를 기준으로 모델명을 분리\n",
280 | " model_list.append(model) # 모델명을 model_list에 추가\n",
281 | "\n",
282 | " score = fname.split('=')[-1] # '=' 기호를 기준으로 임시 스코어를 분리\n",
283 | " score = float(score) # 실수형으로 변환\n",
284 | " public_rmse_list.append(score) # 임시 스코어를 public_rmse_list에 추가\n",
285 | " \n",
286 | "# model_list, public_rmse_list, corr_list를 변수로 하는 데이터프레임 생성\n",
287 | "score_df = pd.DataFrame({'model': model_list, 'public_rmse': public_rmse_list,\n",
288 | " 'cor': corr_list})\n",
289 | "score_df"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 6,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "data": {
299 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnUAAAE9CAYAAABtFJTIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAuHklEQVR4nO3de3hV5Zn///fNUSAY0khVbAXqVBFt0TYqI04AxeNQoNBRO0SkTtWZ+hWoQxnijLZMi0M66Ggvtdra1raoDNoDauWrSA0iaks8IFXoWCotZWR+ms5XQRAMeX5/ZIuEYwLZO8ni/bquXOy91vOsfa+bfZEP67B3pJSQJElS+9ahtQuQJEnSgTPUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGdGrtAlrSYYcdlvr169faZWTWO++8Q48ePVq7jMyzz4VjrwvHXheOvS6cA+31c88992ZKqXdL1ZOpUNevXz9qampau4zMqq6uZtiwYa1dRubZ58Kx14VjrwvHXhfOgfY6Iv7QctV4+lWSJCkTDHWSJEkZYKiTJEnKAEOdJElSBhjqJEnSXtXU1BARTJw4cZ9j+/XrR1FREQC/+c1vOP744znkkEPo1asXf/3Xf826deua9dr19fVccsklFBUVERE8/vjj+7MLBwVDnSRJyosOHTpw8cUX853vfIexY8fyyCOPMGPGjCbPr6urY+XKlcyZM4djjz2We+65h0984hN5rLh9M9RJkpQxa9asISIoLy/n/PPPp2fPnlRVVXHjjTdSXFzMSSedxJo1a1i7di1jxoyhpKSEPn36MGXKFLZs2QLAokWL6N+/P3379mXu3LmNtr9u3TrGjRu3fd706dOpr6/fpY6BAwdSWVnJeeedx+mnnw40BL09ufvuu4kILrroIk444QQuvPBCTjzxRABeeOEFxo8fz+bNm1uqTZljqJMkKaOeeeYZzj77bEpLS6msrGTBggVMnDiR5cuXc/PNNzN+/Hgeeughpk2bxrnnnsstt9zCzJkz2bJlCxUVFdTW1jJt2jSWLVvWaLsVFRUsXLiQyZMnM2rUKKqqqrj99tt3W8MjjzzC4YcfzuWXX84JJ5zQpCN1jz76KFdeeSUTJkxg5syZAJSXl3PffffRu3eLfVZv5uQ11EXERyPiiYhYGREvR8Tk3YwZHREvRcSLEVETEWfssO7LuXm/iYj7IuKQfNYrSVKWnHbaaVxzzTUMGTKElBKVlZVMmjQJgBUrVrBkyRIGDx5MZWUld9xxBx06dGDBggWsWrWK9evXM3r0aK666iquv/767dvcuHEjixcvZsOGDcyYMYM777wTgMcee2y3NQwZMoQFCxYwefJkXn755e3j9+ayyy5j0qRJjBkzhnPOOQeA/v37c/HFF/ttGXuR7yN1dcA/ppSOBwYDV0XEwJ3GLAIGpZROAi4D7gKIiKOASUBZSulEoCNwcZ7rlSSp3dm0tY61f97Epq11jZb36tULgM6dOwNQXFxMx44dG42JiH1uP6W0y7JBgwaxcOHC7T/XXXfdbuf27t2b8847jxtvvJEOHTowb968fb5enz599jlGu8rr14SllF4HXs893hARK4GjgFd2GLNxhyk9gB3fOZ2AbhHxHtAd+O981itJUnuz8vW3+frDr/D06lpOP6aUCf12vbZtd7p160Z5eTlLly5l1qxZvPrqq9TX13PBBRcwYMAAjjjiCObPn89tt93WKIgVFRUxdOhQnnzySZYsWcJRRx3FU089xYABAzjllFMavca//du/8fbbb3Pcccfxy1/+kvr6egYO3PnYjlpKwa6pi4h+wMnAr3az7rMRsQr4BQ1H60gprQNmA3+kIRi+lVLa/bFdSZIOQpu21m0PdABPr67l9bc2s3mnI3Z7MmfOHEaOHMmsWbN45JFHmDRpEtdeey1du3Zlzpw5lJaWcsMNN3DqqafuMm/s2LHceuutTJ06ldWrV+8yBhqO0t17771cccUVPProo3z+85/n1ltvPfAd127F7g6ptviLRBQBi4GZKaWf7mVcOXB9SmlERJQAPwEuAv4fcD/wQEppzk5zrgCuADj88MM/vfMdOmo5Gzdu3P7ZQ8of+1w49rpw7HV+vLetnlXrNzRadng3KCnuSeeObfdeyLq6Ot55551Gyzp27Nju3iMH+r4ePnz4cymlspaqJ++hLiI6Aw8Dj6aUbmrC+NeAU4DhwHkppb/LLZ8ADE4pfWlPc8vKylJNTU3LFK5dVFdXM2zYsNYuI/Psc+HY68Kx1/mxaWsdX/xhzfYjdQBfLYOLRp1L9y55vcLqgFRXVzN8+PBGy4YOHUp1dXXrFLSfDvR9HREtGury+jceDVdffg9YuadAFxF/AaxOKaWI+BTQBail4bTr4IjoDmwGzgJMbJIk5XTv0onrRg7cfgp2yDGlHFm8qU0HOvjgJosdlZSUtFI12ZHvv/UhwCXAioh4MbfsWuBogJTSHcA4YELuZojNwEWp4fDhryLiAeB5Gu6ifQH4Tp7rlSSpXTn+yEO569Iyat/ZSmmPLvz66adau6R9KikpYcSIEa1dRubk++7Xp4C93iudUqoCqvaw7qvAV/NQmiRJmdG9S6c2f3RO+dd2r6KUJElSkxnqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJEmSMsBQJ0mSlAGGOkmSpAww1EmSJGWAoU6SJCkDDHWSJEkZYKiTJEnKAEOdJElSBhjqJElqZTU1NUQEEydO3OfYfv36UVRU1GjZqlWr6Nq1KxHB4sWLm/Xa9fX1XHLJJRQVFRERPP74482ar7bDUCdJUjuWUuLyyy+nU6dOzZ5bV1fHypUrmTNnDsceeyz33HMPn/jEJ/JQpQrBUCdJUjOtWbOGiKC8vJzzzz+fnj17UlVVxY033khxcTEnnXQSa9asYe3atYwZM4aSkhL69OnDlClT2LJlCwCLFi2if//+9O3bl7lz5zba/rp16xg3btz2edOnT6e+vn63tXz7299mzZo1XHnllfus++677yYiuOiiizjhhBO48MILOfHEEwF44YUXGD9+PJs3bz7A7qi1GOokSdpPzzzzDGeffTalpaVUVlayYMECJk6cyPLly7n55psZP348Dz30ENOmTePcc8/llltuYebMmWzZsoWKigpqa2uZNm0ay5Yta7TdiooKFi5cyOTJkxk1ahRVVVXcfvvtu7z+unXrqKys5Nvf/jaHHnpok+t+9NFHufLKK5kwYQIzZ84EoLy8nPvuu4/evXsfWFPUavIa6iLioxHxRESsjIiXI2L
ybsaMjoiXIuLFiKiJiDN2WNcrIh6IiFW5bfxlPuuVJKk5TjvtNK655hqGDBlCSonKykomTZoEwIoVK1iyZAmDBw+msrKSO+64gw4dOrBgwQJWrVrF+vXrGT16NFdddRXXX3/99m1u3LiRxYsXs2HDBmbMmMGdd94JwGOPPbbL60+fPp2ysjIGDBjAn//8ZwD+/Oc/s3Hjxr3WfdlllzFp0iTGjBnDOeecA0D//v25+OKL6dGjR4v0RoXX/BPwzVMH/GNK6fmI6Ak8FxELU0qv7DBmEfBgSilFxCeBecCA3LpbgP+bUvpcRHQBuue5XkmSdrFpax21G7dSWtSF7l0++NXZq1cvADp37gxAcXExHTt2bDQ3Iva5/ZTSLssGDRrE7Nmztz8vLi7eZczatWtZvHgxH//4x7cv+9a3vsUpp5xCRUXFHl+vT58++6xJ7U9eQ11K6XXg9dzjDRGxEjgKeGWHMTv+d6IHkAAi4lCgHJiYG7cV2JrPeiVJ2tnK19/m6w+/wtOrazn9mFKuGzmQbk2Y161bN8rLy1m6dCmzZs3i1Vdfpb6+ngsuuIABAwZwxBFHMH/+fG677TbmzZu3fV5RURFDhw7lySefZMmSJRx11FE89dRTDBgwgFNOOaXRa8yYMYM33ngDgHnz5nH//fdz4YUXUl5e3pItUDtRsGvqIqIfcDLwq92s+2xErAJ+AVyWW/wx4A3gBxHxQkTcFREeE5YkFcymrXXbAx3A06tr+cbDr7B5a12T5s+ZM4eRI0cya9YsHnnkESZNmsS1115L165dmTNnDqWlpdxwww2ceuqpu8wbO3Yst956K1OnTmX16tW7jAEYOnQon/vc5/jc5z7HwIEDARg4cCBHH330Ae652qPY3SHfFn+RiCJgMTAzpfTTvYwrB65PKY2IiDLgWWBISulXEXEL8HZK6bqd5lwBXAFw+OGHf3rnO4jUcjZu3LjLZyOp5dnnwrHXhdNee/3etnpWrd+wy/IBR/Skc8e2ea/hxo0bOeSQQ3jnnXcaLe/YsWO7/Dtoyw70fT18+PDnUkplLVVP3kNdRHQGHgYeTSnd1ITxrwGn0HBq+NmUUr/c8r8CpqeU/npPc8vKylJNTU2L1K1dVVdXM2zYsNYuI/Psc+HY68Jpr73etLWOL/6wZvuROoAhx5Ty3UvLGl1b15ZUV1cDMHz48EbLhw4dun2dWsaBvq8jokVDXV7fkdFwdej3gJV7CnQR8RfA6tyNEp8CugC1uedrI+K4lNJvgbPY4Vo8SZLyrXuXTlw3cuD2U7BDjinlX0YObLOB7n2DBg1i4cKFjZaVlJS0UjUqlHy/K4cAlwArIuLF3LJrgaMBUkp3AOOACRHxHrAZuCh9cPjwauCe3J2vvwe+kOd6JUlq5PgjD+WuS8uofWcrpT26tPlABw0BbsSIEa1dhgos33e/PgXs9V7ulFIVULWHdS8CLXZYUpKk/dG9S6d2EeZ0cGubV3lKkiSpWQx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJyoAmh7poUBER1+eeHx0Rp+avNEmSJDVVc47U3Q78JfD53PMNwG0tXpEkSZKarVMzxp6WUvpURLwAkFL634jokqe6JEmS1AzNOVL3XkR0BBJARPQG6vNSlSRJkpqlOaHuW8DPgA9HxEzgKeCGvFQlSZKkZmny6deU0j0R8RxwFhDAmJTSyrxVJkmSpCZrzt2vxwCvpZRuA34DnB0RvfJVmCRJkpquOadffwJsi4i/AO4C+gP35qUqSZKkNqKmpoaIYOLEifsc269fP4qKigB48MEH+dSnPkXPnj057LDDuOyyy9i8eXOzXjsiOkTEjyNiY0SkiBixp7HNCXX1KaU6YCxwS0rpy8CRzapMkiTpILF8+XIGDhzITTfdxKc//Wl+8IMf8M1vfrPJ8yOiE3A8UAH8FzAeWLGn8c29+/XzwATg4dyyzs2YL0mSlHdr1qwhIigvL+f888+nZ8+eVFVVceONN1JcXMxJJ53EmjVrWLt2LWPGjKGkpIQ+ffowZcoUtmzZAsCiRYvo378/ffv2Ze7cuY22v27dOsaNG8dnPvMZ+vTpw/Tp06mv3/UDQf7pn/6JOXPmcPnllzN79mwAXn755T3WHRETc0fj/jMiXgbm0XDJG8DJwD1Atz3Nb06o+wINHz48M6X0WkT0B+Y0Y74kSVLBPPPMM5x99tmUlpZSWVnJggULmDhxIsuXL+fmm29m/PjxPPTQQ0ybNo1zzz2XW265hZkzZ7JlyxYqKiqora1l2rRpLFu2rNF2KyoqWLhwIePGjWPUqFFUVVVx++237/L6Xbp88HG+jz76KADl5eVNKf1c4E7gR8A/55Y9ScMXQLyxx1kppbz9AB8FngBWAi8Dk3czZjTwEvAiUAOcsdP6jsALwMP7er1Pf/rTSfnzxBNPtHYJBwX7XDj2unDsdeEcjL1+Z8t76Y+176R3tryXUkrptddeS0AaMmRISimlv/3bv01Aevzxx9Pvfve7BKQzzzwzAen0009PKaX07rvvpg4dOqSysrL04osvJiBVVFSklFJ6/PHHE5AuvfTStGHDhhQRiYbP7d3+85nPfCallFLfvn1Tjx49GtX3wAMPpM6dO6cLLrgg1dXVbV8O1KTGmWdibns37bCsLLfs7rSPHNTkjzSJiJHA14G+NHwUSjRkwnToXqbVAf+YUno+InoCz0XEwpTSKzuMWQQ8mFJKEfFJGg41Dthh/eRcKNzb60iSpIPQytff5usPv8LTq2s5/ZhSrhs5cPv5yV69egHQuXPD1WLFxcV07Nix0fyI2Odr5MJVI4MGDeKSSy5h0KBB27e9O//5n/9JRUUFZ555Jj/5yU92ef09+O+mDNpZc06/3gxcCpSmlA5NKfXcR6AjpfR6Sun53OMNNISzo3YaszF90K0e5L6xAiAiPgL8NQ1320qSJG23aWvd9kAH8PTqWr7x8Cts3lq3z7ndunWjvLycZ599llmzZvGlL32J+vp6LrjgAgYMGMARRxzB/Pnzue222/j617++fV5RURFDhw5lxYoVvPTSS/z+97/nRz/6EQsXLtzlNX7xi18wfvx4evXqxec//3l+/vOf88tf/rLlGrCT5nz361rgN2l3cbUJIqIfDRf5/Wo36z4L/BvwYRpC3PtuBqYBPfey3SuAKwAOP/xwqqur96c8NcHGjRvtbwHY58Kx14VjrwvnYOr1e9vq+cvuG/jLT+y49H+oWfZ7AGpra6murmb9+vUAPPfcc9uPqNXW1vLlL3+Zbdu28Y1vfIOuXbsyduxYhg
wZwjPPPMPUqVOZPXs2X/va1zjrrLMAWL9+PdXV1XzpS18ipcTPfvYzHnjgAfr378+gQYOorq7m3XffZdu2bVRXV3P//fezbds23nzzTb7whS8ADUf4br755rz0I5qa0SLiFBpOvy4Gtry/PKV0UxPmFuXmzUwp/XQv48qB61NKI3Kney9IKX0pIoYBU1NKI/f2OmVlZammpqYpu6P9UF1dzbBhw1q7jMyzz4VjrwvHXhfOwdTrTVvr+OIPa7YfqQMYckwp3720jO5dmnPcav/sb6/fe+893nrrLXr37r0ceP9z595LKb11IPU05/TrTGATcAgNR87e/9mriOhMwwcX37O3QAeQUnoSOCYiDgOGAKMiYg0wFzgzIrzbVpIkAdC9SyeuGzmQ048pBRoC3b+MHFiQQHcgli5dSu/evQEG0XA36xvA/APdbnP2+kMppXOas/FouPrwe8DKPR3Ry31DxercjRKfAroAtSmlSqAyN2YYDUfqKprz+pIkKduOP/JQ7rq0jNp3tlLao0ubD3TQcAp24cKFnH322f8FXJVb/L8Hut3m7PnjEXFOSumxZswZAlwCrIiIF3PLrgWOBkgp3QGMAyZExHvAZuCi/b1uT5IkHXy6d+nULsLc+0pKShgxYgTAhpTS4y213eZ04CpgWkRsAd6jCR9pklJ6Kjduj1JKVUDVPsZUA9XNqFWSJOmg0qRQFxEdgPNSSkvzXI8kSZL2Q5NulEgp1QOz81yLJEmS9lNz7n59LCLGRVM+elmSJEkF1Zxr6q6h4Rsf6iLiXZr2NWGSJEkqgCaHupTSXj+TLiJOSCm9fOAlSZIkqbmac/p1X37cgtuSJElSM7RkqPNaO0mSpFbSkqHODwyWJElqJS0Z6iRJktRKWjLUbW3BbUmSJKkZmhzqIuKzEVG8w/NeETHm/ecppcEtXJskSZKaqDlH6r6aUnrr/Scppf8HfLXFK5IkSVKzNSfU7W5scz68WJIkSXnSnFBXExE3RcQxEfGxiPgP4Ll8FSZJkqSma06ou5qGmyH+E7gfeBe4Kh9FSZIkqXma8zVh7wDT81iLJEmS9tM+Q11E3JxSmhIRD7GbDxhOKY3KS2WSJElqsqYcqXv/O11n57MQSZIk7b99hrqU0nO5PxfnvxxJkiTtj6acfl3B7r/XNYCUUvpki1clSZKkZmnK6deRea9CkiRJB6Qpp1//8P7jiDgCOJWGI3fLUkrr81ibJEmSmqg53/36ReDXwFjgc8CzEXFZvgqTJElS0zXna76+ApycUqoFiIhS4Gng+/koTJIkSU3XnG+U+BOwYYfnG4C1LVuOJEmS9kdT7n69JvdwHfCriJhPwzV1o2k4HStJkqRW1pTTrz1zf67O/bxvfsuXI0mSpP3RlLtfZxSiEEmSJO2/Jt8oERFPsPvvfj2zRSuSJElSszXn7tepOzw+BBgH1LVsOZIkSdofTQ51738H7A6WRoTfBytJktQGNOf064d2eNoBKAOOaPGKJEmS1GzNOf36HB9cU1cHrAH+rqULkiRJUvM1J9QNBL4EnEFDuFsC1OSjKEmSJDVPc0LdD4G3gW/lnn8e+DHwNy1dlCRJkpqnOaHuuJTSoB2ePxERy1u6IEmSJDVfc7779YWIGPz+k4g4DVja8iVJkiSpuZpzpO40YEJE/DH3/GhgZUSsAFJK6ZMtXp0kSZKapDmh7ry8VSFJkqQD0pwPH/5DPguRJEnS/mvONXWSJElqowx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjIgr6EuIj4aEU9ExMqIeDkiJu9mzOiIeCkiXoyImog4o6lzJUmS1KA5n1O3P+qAf0wpPR8RPYHnImJhSumVHcYsAh5MKaWI+CQwDxjQxLmSJEkiz0fqUkqvp5Sezz3eAKwEjtppzMaUUso97QGkps6VJElSg4JdUxcR/YCTgV/tZt1nI2IV8AvgsubMlSRJEsQHB8ny+CIRRcBiYGZK6ad7GVcOXJ9SGtHUuRFxBXAFwOGHH/7puXPntnT5ytm4cSNFRUWtXUbm2efCsdeFY68Lx14XzoH2evjw4c+llMpaqp68h7qI6Aw8DDyaUrqpCeNfA05JKb3Z3LllZWWppqbmgGvW7lVXVzNs2LDWLiPz7HPh2OvCsdeFY68L50B7HREtGuryffdrAN8DVu4plEXEX+TGERGfAroAtU2ZK0mSpAb5vvt1CHAJsCIiXswtuxY4GiCldAcwDpgQEe8Bm4GLcnfCnrG7uSmlR/JcsyRJUruT11CXUnoKiH2MqQKq9meuJEmSGviNEpIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZK0FzU1NUQEEydO3OfYfv36UVRUBMCaNWuIiEY/U6ZMadZr19fXc8kll1BUVERE8Pjjj+/HHuhg0am1C5AkKcv+/u//nqFDhwJw3HHHNXleXV0dv/3tb5kzZw4nn3wyU6dO5ROf+ES+ylQGeKROkpQp7x8hKy8v5/zzz6dnz55UVVVx4403UlxczEknncSaNWtYu3YtY8aMoaSkhD59+jBlyhS2bNkCwKJFi+jfvz99+/Zl7ty5jba/bt06xo0bt33e9OnTqa+v32M9ZWVljBo1iosvvpiTTz55j+PuvvtuIoKLLrqIE044gQsvvJATTzwRgBdeeIHx48ezefPmFuiQsspQJ0nKpGeeeYazzz6b0tJSKisrWbBgARMnTmT58uXcfPPNjB8/noceeohp06Zx7rnncssttzBz5ky2bNlCRUUFtbW1TJs2jWXLljXabkVFBQsXLmTy5MmMGjWKqqoqbr/99j3Wcfnll9OjRw9OOOEEnn322X3W/eijj3LllVcyYcIEZs6cCUB5eTn33XcfvXv3PrCmKNMMdZKkTDrttNO45pprGDJkCCklKisrmTRpEgArVqxgyZIlDB48mMrKSu644w46dOjAggULWLVqFevXr2f06NFcddVVXH/99du3uXHjRhYvXsyGDRuYMWMGd955JwCPPfbYLq/fo0cPZsyYwc9//nNmz57Nf/3XfzF+/Ph91n3ZZZcxadIkxowZwznnnANA//79ufjii+nRo0dLtEYZ5TV1kqR2bdPWOmo3bqW0qAvdu3zwa61Xr14AdO7cGYDi4mI6duzYaG5E7HP7KaVdlg0aNIjZs2dvf15cXLzLmN69ezcKhPfeey/PP/887777LocccsgeX69Pnz77rEnaHUOdJKndWvn623z94Vd4enUtpx9TynUjB9KtCfO6detGeXk5S5cuZdasWbz66qvU19dzwQUXMGDAAI444gjmz5/Pbbfdxrx587bPKyoqYujQoTz55JMsWbKEo446iqeeeooBAwZwyimnNHqN7373uyxbtozTTjuN1157jRdffJFBgwbtNdBJB8LTr5KkdmnT1rrtgQ7g6dW1fOPhV9i8ta5J8+fMmcPIkSOZNWsWjzzyCJMmTeLaa6+la9euzJkzh
9LSUm644QZOPfXUXeaNHTuWW2+9lalTp7J69epdxgAce+yxvPTSS0yZMoXbbruN8847j/vvv//Ad1zag9jdYeX2qqysLNXU1LR2GZlVXV3NsGHDWruMzLPPhWOvCycfvV7750381Tef2GX5kn8azkdLurfoa7Wk9957j7feeqvRss6dO+/2FO7+8H1dOAfa64h4LqVU1lL1eKROktQulRZ14fRjShstG3JMKaU9urRSRU2zdOlSevfu3ehn9OjRrV2WMsBr6iRJ7VL3Lp24buTA7adghxxTyr+MHNjoZom2aNCgQSxcuLDRspKSklaqRlnStt/5kiTtxfFHHspdl5ZR+85WSnt0afOBDhoC3IgRI1q7DGVQ23/3S5K0F927dGoXYU7KN6+pkyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLAUCdJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgbkNdRFxEcj4omIWBkRL0fE5N2MGR0RL0XEixFRExFn7LDuvIj4bUT8LiKm57NWSZKk9qxTnrdfB/xjSun5iOgJPBcRC1NKr+wwZhHwYEopRcQngXnAgIjoCNwGnA38CVgWEQ/uNFeSJEnk+UhdSun1lNLzuccbgJXAUTuN2ZhSSrmnPYD3H58K/C6l9PuU0lZgLjA6n/VKkiS1VwW7pi4i+gEnA7/azbrPRsQq4BfAZbnFRwFrdxj2J3YKhJIkSWoQHxwky+OLRBQBi4GZKaWf7mVcOXB9SmlERPwNcG5K6Yu5dZcAp6aUrt5pzhXAFQCHH374p+fOnZuv3Tjobdy4kaKiotYuI/Psc+HY68Kx14VjrwvnQHs9fPjw51JKZS1VT76vqSMiOgM/Ae7ZW6ADSCk9GRHHRMRhNByZ++gOqz8C/Pdu5nwH+A5AWVlZGjZsWEuVrp1UV1djf/PPPheOvS4ce1049rpw2lqv8333awDfA1amlG7aw5i/yI0jIj4FdAFqgWXAxyOif0R0AS4GHsxnvZIkSe1Vvo/UDQEuAVZExIu5ZdcCRwOklO4AxgETIuI9YDNwUe7GibqI+D/Ao0BH4PsppZfzXK8kSVK7lNdQl1J6Coh9jKkCqvaw7hHgkTyUJkmSlCl+o4QkSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqMu4mpoaIoKJEyfuc2y/fv0oKioCYPPmzZx11lkUFRUREcyePXuvc+++++4mjQOYOHEiEUFNTU2T9kGSJO2boU67tW3bNj70oQ9x3nnntXYpkiSpCQx1bcyaNWuICMrLyzn//PPp2bMnVVVV3HjjjRQXF3PSSSexZs0a1q5dy5gxYygpKaFPnz5MmTKFLVu2ALBo0SL69+9P3759mTt3bqPtr1u3jnHjxm2fN336dOrr63epo6ioiPvvv5+RI0fu13788pe/5GMf+xh9+/blK1/5ym6PFs6dO5d+/frRv39/Fi1aBHxwxG/ixIkMHDiQD3/4wyxYsIDx48fTo0cPxowZQ11d3X7VJElSlhnq2qhnnnmGs88+m9LSUiorK1mwYAETJ05k+fLl3HzzzYwfP56HHnqIadOmce6553LLLbcwc+ZMtmzZQkVFBbW1tUybNo1ly5Y12m5FRQULFy5k8uTJjBo1iqqqKm6//fYWrX3Lli2MHz+eN954g6lTp/L000/vdtyvf/1rvvKVr1BbW0tFRcX2UAoNofAf/uEfePPNNxk5ciS9evXijDPOYP78+Tz88MMtWq8kSVlgqGtlm7bWsfbPm9i0tfHRp9NOO41rrrmGIUOGkFKisrKSSZMmAbBixQqWLFnC4MGDqays5I477qBDhw4sWLCAVatWsX79ekaPHs1VV13F9ddfv32bGzduZPHixWzYsIEZM2Zw5513AvDYY4+16D7tWMPVV1/N1772td2O++pXv8pVV13F6NGjWb9+Pb/97W+3r5swYQJXX301Rx55JAD/8R//wYUXXgjAa6+91qL1SpKUBZ1au4CD2crX3+brD7/C06trOf2YUq4bOZBuuXW9evUCoHPnzgAUFxfTsWPHRvMjYp+vkVLaZdmgQYMa3dBQXFy8fzuwD02pD3Zf4477361bN7p06bJ9/7dt29ZiNUqSlBWGulayaWvd9kAH8PTqWr7x8Cv8y9DD9jm3W7dulJeXs3TpUmbNmsWrr75KfX09F1xwAQMGDOCII45g/vz53HbbbcybN2/7vKKiIoYOHcqTTz7JkiVLOOqoo3jqqacYMGAAp5xyyi6vc9ddd20/dfrrX/+a//mf/6GsrGz7HbJ7snMN9957727H/eu//iurVq3iwQcf5Mgjj+S4447j+eef3+f+S5KkXXn6tZXUbty6PdC9b+nqWv5383tNmj9nzhxGjhzJrFmzeOSRR5g0aRLXXnstXbt2Zc6cOZSWlnLDDTdw6qmn7jJv7Nix3HrrrUydOpXVq1fvMuZ9l19+OT/4wQ8AuP/++5k9ezZvvvnmPmvr2rUr99xzD6WlpcyaNYvBgwcDHxx9e98ZZ5xBVVUVH/rQh/jxj39M165dm7TvkiRpV7G7U1/tVVlZWWovn322aWsdX/xhTaNgN+SYUr57aRndu7TNA6jV1dUMGTKEt956q9Hyzp0773IKd/78+aSU6NatGzfddBOPPfYYDz300H7fTXswqa6uZtiwYa1dxkHBXheOvS4ce104B9rriHgupVTWUvV4pK6VdO/SietGDuT0Y0qBhkD3LyMHttlA976lS5fSu3fvRj+jR4/eZdwf//hHLr/8ckaPHs0f/vAHbr31VgOdJEl51LYTRMYdf+Sh3HVpGbXvbKW0R5c2H+ig4SaLhQsXNlpWUlKyy7irr76aq6++ulBlSZJ00Gv7KSLjunfp1C7C3PtKSkoYMWJEa5chSZJ24ulXSZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJMkScoAQ50kSVIGGOokSZIywFAnSZKUAYY6SZKkDIiUUmvX0GIi4g3gD61dR4YdBrzZ2kUcBOxz4djrwrHXhWOvC+dAe903pdS7pYrJVKhTfkVETUqprLXryDr7XDj2unDsdeHY68Jpa7329KskSVIGGOokSZIywFCn5vhOaxdwkLDPhWOvC8deF469Lpw21WuvqZMkScoAj9RJkiRlgKHuIBUR50XEbyPidxExfTfrSyLiZxHxUkT8OiJO3GHdlyPi5Yj4TUTcFxGH5Jb/e0Ssys35WUT0KuAutVn56PUO66dGRIqIwwqxL21dvnodEVfntvtyRHyzUPvTVuXp34+TIuLZiHgxImoi4tRC7lNbdYC9npzr88sRMWWH5R+KiIUR8Wruz5IC7U6blqdeF/b3YkrJn4PsB+gIrAY+BnQBlgMDdxrz78BXc48HAItyj48CXgO6
5Z7PAybmHp8DdMo9rgKqWntfW/snX73OPf8o8CgNn814WGvva2v/5PF9PRx4HOiae/7h1t7XjPb5MeD83OMLgOrW3tfW/jnAXp8I/AboDnTKvYc/nlv3TWB67vF0/63Oa68L+nvRI3UHp1OB36WUfp9S2grMBUbvNGYgsAggpbQK6BcRh+fWdQK6RUQnGt7E/50b91hKqS435lngI/ndjXYhL73O+Q9gGuCFsQ3y1et/AGallLbk5v1/+d2NNi9ffU7AobnHxTR+rx+sDqTXxwPPppQ25f5dXgx8NjdnNPDD3OMfAmPyuhftQ156Xejfi4a6g9NRwNodnv8pt2xHy4GxALnTIH2Bj6SU1gGzgT8CrwNvpZQe281rXAYsaOG626O89DoiRgHrUkrL81t+u5Kv9/WxwF9FxK8iYnFEnJLHfWgP8tXnKcC/R8Ta3JjKfO1AO7LfvabhyFF5RJRGRHcajn5+NDfn8JTS6wC5Pz+ctz1oP/LV6x3l/feioe7gFLtZtvPRnllASUS8CFwNvADU5a69GA30B/oAPSKiotHGI/4ZqAPuaeG626MW73XuH41/Bq7PW9XtU77e152AEmAw8BVgXkTs7rUOFvnq8z8AX04pfRT4MvC9PNTe3ux3r1NKK2k43bcQ+L80BJI6tCd57XWhfi92yufG1Wb9icb/i/gIO53qSCm9DXwBIPcL7LXcz7nAaymlN3LrfgqcDszJPb8UGAmclXIXERzk8tHr5TT8UlyeyxYfAZ6PiFNTSuvzujdtW77e138Cfpp7P/86Iupp+L7HN/K6N21Xvvp8KTA5t4n7gbvytwvtxoH0mpTS98iF44i4Ibc9gP+JiCNTSq9HxJHAwX5JAeSv1wX9veiRuoPTMuDjEdE/IroAFwMP7jggInrl1gF8EXgy94b+IzA4Irrn3tRnAStzc84D/gkYlVLaVKB9aetavNcppRUppQ+nlPqllPrR8I/Hpw7yQAd5el8DPwfOzM0/loaLqA/mL0vPV5//Gxiae3wm8Gqe96M9OJBeExEfzv15NA2nDe/LjXuQhhBN7s/5ed2L9iEvvS7070WP1B2EUkp1EfF/aLhzsiPw/ZTSyxHx97n1d9Bw4eePImIb8Arwd7l1v4qIB4DnaTiU/AIffKL2rUBXYGHuCNKzKaW/L9yetT157LV2ksdefx/4fkT8BtgKXHowH4XOY58vB26Jhhso3gWuKOButUkH0uucn0REKfAecFVK6X9zy2fRcBnB39EQtP+mMHvUduWx1wX9veg3SkiSJGWAp18lSZIywFAnSZKUAYY6SZKkDDDUSZIkZYChTpIkKQMMdZIkSRlgqJOkA5D7XDVJanWGOknKiYgJEfFSRCyPiB9HRN+IWJRbtij3afFExN0RcVNEPEHDdz5KUqvzf5iSBETECcA/A0NSSm9GxIeAHwI/Sin9MCIuA74FjMlNORYYkVLa1ioFS9JOPFInSQ3OBB5IKb0JkFL6M/CXwL259T8Gzthh/P0GOkltiaFOkhoEsK/vTdxx/Tt5rEWSms1QJ0kNFgEX5r6Um9zp16eBi3PrxwNPtVJtkrRPXlMnSUBK6eWImAksjohtwAvAJOD7EfEV4A3gC61ZoyTtTaS0r7MNkiRJaus8/SpJkpQBhjpJkqQMMNRJkiRlgKFOkiQpAwx1kiRJGWCokyRJygBDnSRJUgYY6iRJkjLg/wfnHnHFYbLEdwAAAABJRU5ErkJggg==\n",
300 | "text/plain": [
301 | ""
302 | ]
303 | },
304 | "metadata": {
305 | "needs_background": "light"
306 | },
307 | "output_type": "display_data"
308 | }
309 | ],
310 | "source": [
311 | "plt.figure(figsize=(10,5))\n",
312 | "# 점의 사이즈가 30인 산점도 그래프\n",
313 | "graph = sns.scatterplot(x=\"cor\", y=\"public_rmse\", data=score_df, s=30)\n",
314 | "# 각 점에 대한 모델명 표시\n",
315 | "for idx in range(0, score_df.shape[0]):\n",
316 | " graph.text(score_df.cor[idx]+0.00005 , score_df.public_rmse[idx]-0.00003, \n",
317 | " score_df.model[idx], horizontalalignment='left', \n",
318 | " size='medium', color='black', weight='semibold')\n",
319 | "\n",
320 | "# x축 범위 지정\n",
321 | "plt.xlim((score_df.cor.min()-0.001, score_df.cor.max()+0.001))\n",
322 | "# y축 범위 지정\n",
323 | "plt.ylim((score_df.public_rmse.min()-0.005, score_df.public_rmse.max()+0.005))\n",
324 | "plt.grid() # 격자 무늬\n",
325 | "plt.show()"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "metadata": {},
331 | "source": [
332 | "#### 3.5.1.2. 여러 가지 앙상블 기법"
333 | ]
334 | },
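{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "For reference, the power mean applied in the next two cells (restated here exactly as it is coded below) blends $n$ predictions $x_1, \\dots, x_n$ with exponent $p$ as\n",
  "\n",
  "$$M_p(x_1, \\dots, x_n) = \\left( \\frac{1}{n} \\sum_{i=1}^{n} x_i^{p} \\right)^{1/p}.$$\n",
  "\n",
  "At $p = 1$ this is the ordinary arithmetic mean, and for the non-negative ride counts predicted here it approaches the element-wise maximum as $p$ grows, so a large exponent such as $p = 21$ pulls the blend toward the larger of the two predictions."
 ]
},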
335 | {
336 | "cell_type": "code",
337 | "execution_count": 7,
338 | "metadata": {
339 | "scrolled": true
340 | },
341 | "outputs": [
342 | {
343 | "name": "stdout",
344 | "output_type": "stream",
345 | "text": [
346 | "ensemble_model1+model5.csv가 저장되었습니다!\n"
347 | ]
348 | }
349 | ],
350 | "source": [
351 | "ensemble_dir = \"C:/dacon/ch03/submission/Ensemble1/\"\n",
352 | "\n",
353 | "# Ensemble1 폴더가 없다면 Ensemble1 폴더를 생성\n",
354 | "if not os.path.isdir(ensemble_dir):\n",
355 | " os.mkdir(ensemble_dir)\n",
356 | "\n",
357 | "# Ensemble1 폴더로 파일 이동\n",
358 | "import shutil\n",
359 | "shutil.move(sub_dir + 'model1_lgbm=2.29.csv', ensemble_dir + 'model1_lgbm=2.29.csv')\n",
360 | "shutil.move(sub_dir + 'model5_rf=2.31.csv', ensemble_dir + 'model5_rf=2.31.csv')\n",
361 | "\n",
362 | "# model1_lgbm=2.29.csv, model5_rf=2.31.csv 멱 평균\n",
363 | "nf = 0\n",
364 | "for f in os.listdir(ensemble_dir):\n",
365 | " ext = os.path.splitext(f)[-1] # 'Ensemble1' 폴더에 있는 파일의 확장자를 분리\n",
366 | "\n",
367 | " if ext == '.csv':\n",
368 | " sub = pd.read_csv(ensemble_dir + f) # 확장자명이 .csv라면 해당 데이터를 로드\n",
369 | " else:\n",
370 | " continue # 확장자명이 .csv가 아니라면 for문을 종료\n",
371 | "\n",
372 | " if len(sub.columns) !=2:\n",
373 | " continue # 로드한 데이터의 변수의 수가 2개가 아니라면 for문을 종료\n",
374 | "\n",
375 | " # 파일의 확장자명이 .csv이고, 변수의 수가 2개인 경우에만 밑의 조건문이 실행됩니다.\n",
376 | " if nf == 0:\n",
377 | " sub_df = sub # nf가 0이면 해당 데이터(sub)를 sub_df 에 저장\n",
378 | " else:\n",
379 | " sub_df = pd.merge(sub_df, sub, on=\"id\") # nf가 0이 아니면 해당 데이터를 sub_df와 병합\n",
380 | " \n",
381 | " nf += 1\n",
382 | "\n",
383 | "p = 21 # 하이퍼파라미터이므로 최적의 값을 찾아야 합니다.\n",
384 | "\n",
385 | "# 파일의 개수(nf)가 2개 이상인 경우에만 밑의 조건문이 실행됩니다.\n",
386 | "if nf >= 2: \n",
387 | " pred = 0\n",
388 | " \n",
389 | " # 앞서 소개한 멱 평균의 식을 나타낸 코드입니다.\n",
390 | " for j in range(nf):\n",
391 | " pred = pred + sub_df.iloc[:,j+1]**p\n",
392 | " pred = pred / nf \n",
393 | " pred = pred**(1/p)\n",
394 | " \n",
395 | " # 멱 평균 결과를 대입하여 데이터프레임 submit을 생성하고, csv파일로 저장 \n",
396 | " submit = pd.DataFrame({'id': sub_df.id, '18~20_ride': pred})\n",
397 | " fname = \"ensemble_model1+model5.csv\"\n",
398 | " submit.to_csv(fname, index=False)\n",
399 | " \n",
400 | " print(fname + '가 저장되었습니다!')"
401 | ]
402 | },
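{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The next cell repeats the same procedure for model2 and model4. As an aside, the whole per-folder routine can be wrapped in one helper; the sketch below mirrors the blending logic of the loop above under the same assumptions (2-column submission files keyed by `id`). The name `power_mean_ensemble` is introduced here purely for illustration and is not used elsewhere in the chapter."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "import os\n",
  "\n",
  "def power_mean_ensemble(ensemble_dir, p=21):\n",
  "    \"\"\"Power-mean blend of every 2-column submission CSV in ensemble_dir.\"\"\"\n",
  "    sub_df, nf = None, 0\n",
  "    for f in os.listdir(ensemble_dir):\n",
  "        if os.path.splitext(f)[-1] != '.csv':\n",
  "            continue  # skip non-CSV files\n",
  "        sub = pd.read_csv(os.path.join(ensemble_dir, f))\n",
  "        if len(sub.columns) != 2:\n",
  "            continue  # skip malformed submissions\n",
  "        # the first file becomes the base frame; later ones are merged on id\n",
  "        sub_df = sub if nf == 0 else pd.merge(sub_df, sub, on='id')\n",
  "        nf += 1\n",
  "    if nf < 2:\n",
  "        raise ValueError('need at least two submission files to blend')\n",
  "    # power mean: ((x_1^p + ... + x_n^p) / n) ** (1/p)\n",
  "    pred = sum(sub_df.iloc[:, j+1]**p for j in range(nf)) / nf\n",
  "    return pd.DataFrame({'id': sub_df.id, '18~20_ride': pred**(1/p)})\n",
  "\n",
  "# Example call (hypothetical path): power_mean_ensemble('C:/dacon/ch03/submission/Ensemble1/', p=21)"
 ]
},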
403 | {
404 | "cell_type": "code",
405 | "execution_count": 8,
406 | "metadata": {},
407 | "outputs": [
408 | {
409 | "name": "stdout",
410 | "output_type": "stream",
411 | "text": [
412 | "ensemble_model2+model4.csv가 저장되었습니다!\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "ensemble_dir = \"C:/dacon/ch03/submission/Ensemble2/\"\n",
418 | "\n",
419 | "# Ensemble2 폴더가 없다면 Ensemble2 폴더를 생성\n",
420 | "if not os.path.isdir(ensemble_dir):\n",
421 | " os.mkdir(ensemble_dir)\n",
422 | "\n",
423 | "# Ensemble2 폴더로 파일 이동\n",
424 | "import shutil\n",
425 | "shutil.move(sub_dir + 'model2_rf=2.34.csv', ensemble_dir + 'model2_rf=2.34.csv')\n",
426 | "shutil.move(sub_dir + 'model4_rf=2.36.csv', ensemble_dir + 'model4_rf=2.36.csv')\n",
427 | "\n",
428 | "# model2_rf=2.34.csv, model4_rf=2.36.csv 멱 평균\n",
429 | "nf = 0\n",
430 | "for f in os.listdir(ensemble_dir):\n",
431 | " # 'Ensemble2' 폴더에 있는 파일의 확장자를 분리\n",
432 | " ext = os.path.splitext(f)[-1] \n",
433 | "\n",
434 | " if ext == '.csv': \n",
435 | " # 확장자명이 .csv라면 해당 데이터를 로드\n",
436 | " sub = pd.read_csv(ensemble_dir + f) \n",
437 | " else: \n",
438 | " # 확장자명이 .csv가 아니라면 for문을 종료\n",
439 | " continue \n",
440 | "\n",
441 | " if len(sub.columns) !=2:\n",
442 | " # 로드한 데이터의 변수의 수가 2개가 아니라면 for 문을 종료\n",
443 | " continue \n",
444 | "\n",
445 | " # 파일의 확장자명이 .csv이고, 변수의 수가 2개인 경우에만\n",
446 | " # 밑의 조건문이 실행됩니다.\n",
447 | " if nf == 0:\n",
448 | " sub_df = sub # nf가 0이면 해당 데이터(sub)를 sub_df 에 저장\n",
449 | " else: \n",
450 | " # nf가 0이 아니면 해당 데이터를 sub_df와 병합\n",
451 | " sub_df = pd.merge(sub_df, sub, on=\"id\") \n",
452 | " \n",
453 | " nf += 1\n",
454 | "\n",
455 | "p = 21 # 하이퍼파라미터이므로 최적의 값을 찾아야 합니다.\n",
456 | "\n",
457 | "# 파일의 개수(nf)가 2개 이상인 경우에만 밑의 조건문이 실행됩니다.\n",
458 | "if nf >= 2: \n",
459 | " pred = 0\n",
460 | " \n",
461 | " # 앞서 소개한 멱 평균의 식을 나타낸 코드입니다.\n",
462 | " for j in range(nf):\n",
463 | " pred = pred + sub_df.iloc[:,j+1]**p\n",
464 | " pred = pred / nf \n",
465 | " pred = pred**(1/p)\n",
466 | "\n",
467 | " # 멱 평균 결과를 대입하여 데이터프레임 submit을 생성하고, csv 파일로 저장\n",
468 | " submit = pd.DataFrame({'id': sub_df.id, '18~20_ride': pred})\n",
469 | " fname = \"ensemble_model2+model4.csv\"\n",
470 | " submit.to_csv(fname, index=False)\n",
471 | " \n",
472 | " print(fname + '가 저장되었습니다!')"
473 | ]
474 | },
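{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The final cell combines the two power-mean blends with model3 using a weighted arithmetic mean,\n",
  "\n",
  "$$\\bar{y} = \\sum_{i} w_i \\, y_i \\quad \\text{with} \\quad \\sum_{i} w_i = 1,$$\n",
  "\n",
  "where the weights below are $w = (0.22, 0.30, 0.48)$; because they sum to 1, the blend stays on the same scale as the individual predictions."
 ]
},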
475 | {
476 | "cell_type": "code",
477 | "execution_count": 9,
478 | "metadata": {},
479 | "outputs": [],
480 | "source": [
481 | "# 앙상블 파일 로드\n",
482 | "first = pd.read_csv('ensemble_model1+model5.csv')\n",
483 | "second = pd.read_csv('ensemble_model2+model4.csv')\n",
484 | "third = pd.read_csv('model3_rf=2.38.csv')\n",
485 | "\n",
486 | "# 가중산술평균\n",
487 | "w_mean = 0.22*first['18~20_ride'] + 0.30*second['18~20_ride'] + 0.48*third['18~20_ride']\n",
488 | "\n",
489 | "# 최종 submission 파일 생성\n",
490 | "sub['18~20_ride'] = w_mean\n",
491 | "sub.to_csv('final_submission.csv', index=False)"
492 | ]
493 | }
494 | ],
495 | "metadata": {
496 | "kernelspec": {
497 | "display_name": "Python 3",
498 | "language": "python",
499 | "name": "python3"
500 | },
501 | "language_info": {
502 | "codemirror_mode": {
503 | "name": "ipython",
504 | "version": 3
505 | },
506 | "file_extension": ".py",
507 | "mimetype": "text/x-python",
508 | "name": "python",
509 | "nbconvert_exporter": "python",
510 | "pygments_lexer": "ipython3",
511 | "version": "3.8.5"
512 | },
513 | "toc": {
514 | "base_numbering": 1,
515 | "nav_menu": {},
516 | "number_sections": true,
517 | "sideBar": true,
518 | "skip_h1_title": false,
519 | "title_cell": "Table of Contents",
520 | "title_sidebar": "Contents",
521 | "toc_cell": false,
522 | "toc_position": {},
523 | "toc_section_display": true,
524 | "toc_window_display": false
525 | }
526 | },
527 | "nbformat": 4,
528 | "nbformat_minor": 2
529 | }
530 |
--------------------------------------------------------------------------------