├── .idea
├── DataMining.iml
├── encodings.xml
├── inspectionProfiles
│ └── Project_Default.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── PULL_REQUEST_TEMPLATE.md
├── README.md
├── dust_weather
├── GRU.ipynb
├── callAPI.py
├── correlation.py
├── dust_weather.ipynb
└── raw_data
│ ├── dust
│ └── dust_20090101.csv
│ ├── weather
│ └── weather_20090101.csv
│ └── weather_column.txt
├── elasticsearch
└── elasticsearch_lyrics.ipynb
├── kaggle
└── Tabular Playground Series - Jan 2021
│ └── xgboost-lgbm-optuna.ipynb
└── regex_nlp_kko
├── clustering.py
├── font
├── NanumBrush.ttf
├── NanumGothic.ttf
└── denne.png
├── kkma_token.py
├── mwordcloud.py
├── raw_data
├── except_similar.txt
├── except_word.txt
└── kko.txt
├── regex.py
├── result
├── DBSCAN_eps0.7
│ ├── cluster_eps_3.xlsx
│ ├── cluster_eps_4.xlsx
│ ├── cluster_eps_5.xlsx
│ ├── cluster_eps_6.xlsx
│ ├── cluster_eps_7.xlsx
│ └── cluster_eps_8.xlsx
├── DBSCAN_eps0.75
│ ├── cluster_eps_3.xlsx
│ ├── cluster_eps_4.xlsx
│ ├── cluster_eps_5.xlsx
│ ├── cluster_eps_6.xlsx
│ ├── cluster_eps_7.xlsx
│ └── cluster_eps_8.xlsx
├── all_token.csv
├── cluster.xlsx
├── embedding.model
├── kko_regex.csv
└── noun_token.csv
├── similar_day.py
└── word2vector.py
/.idea/DataMining.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
161 |
162 |
163 |
164 | 2
165 | 꼬끄꼬끄
166 | 이니무가 뭐 클럽간다고 미룬것도아니고
167 | 사줄게여..
168 | 쩝ㅋㅋㅋㅋ 그래영^~^~^~^
169 | 🤔
170 | 사줄게여
171 | 생각좀해볼게여...
172 | 첫글자가대문자가
173 | 쭐
174 | 졍이
175 | 쭈니
176 | load
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 | 1533306958822
443 |
444 |
445 | 1533306958822
446 |
447 |
448 | 1544187773520
449 |
450 |
451 |
452 | 1544187773520
453 |
454 |
455 | 1544189856014
456 |
457 |
458 |
459 | 1544189856014
460 |
461 |
462 | 1544197029957
463 |
464 |
465 |
466 | 1544197029957
467 |
468 |
469 | 1544340339858
470 |
471 |
472 |
473 | 1544340339858
474 |
475 |
476 | 1544341799656
477 |
478 |
479 |
480 | 1544341799656
481 |
482 |
483 | 1544857966276
484 |
485 |
486 |
487 | 1544857966276
488 |
489 |
490 | 1545229841837
491 |
492 |
493 |
494 | 1545229841837
495 |
496 |
497 | 1545230437706
498 |
499 |
500 |
501 | 1545230437706
502 |
503 |
504 | 1545404276451
505 |
506 |
507 |
508 | 1545404276451
509 |
510 |
511 | 1545544390582
512 |
513 |
514 |
515 | 1545544390582
516 |
517 |
518 | 1546773210200
519 |
520 |
521 |
522 | 1546773210201
523 |
524 |
525 | 1547192958375
526 |
527 |
528 |
529 | 1547192958375
530 |
531 |
532 | 1547193166881
533 |
534 |
535 |
536 | 1547193166881
537 |
538 |
539 | 1547701017157
540 |
541 |
542 |
543 | 1547701017157
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 |
680 |
681 |
682 |
683 |
684 |
685 |
686 |
687 |
688 |
689 |
690 |
691 |
692 |
693 |
694 |
695 |
696 |
697 |
698 |
699 |
700 |
701 |
702 |
703 |
704 |
705 |
706 |
707 |
708 |
709 |
710 |
711 |
712 |
713 |
714 |
715 |
716 |
717 |
718 |
719 |
720 |
721 |
722 |
723 |
724 |
725 |
726 |
727 |
728 |
729 |
730 |
731 |
732 |
733 |
734 |
735 |
736 |
737 |
738 |
739 |
740 |
741 |
742 |
743 |
744 |
745 |
746 |
747 |
748 |
749 |
750 |
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
762 |
763 |
764 |
765 |
766 |
767 |
768 |
769 |
770 |
771 |
772 |
773 |
774 |
775 |
776 |
777 |
778 |
779 |
780 |
781 |
782 |
783 |
784 |
785 |
786 |
787 |
788 |
789 |
790 |
791 |
792 |
793 |
794 |
795 |
796 |
797 |
798 |
799 |
800 |
801 |
802 |
803 |
804 |
805 |
806 |
807 |
808 |
809 |
810 |
811 |
812 |
813 |
814 |
815 |
816 |
817 |
818 |
819 |
820 |
821 |
822 |
823 |
824 |
825 |
826 |
827 |
828 |
829 |
830 |
831 |
832 |
833 |
834 |
835 |
836 |
837 |
838 |
839 |
840 |
841 |
842 |
843 |
844 |
845 |
846 |
847 |
848 |
849 |
850 |
851 |
852 |
853 |
854 |
855 |
856 |
857 |
858 |
859 |
860 |
861 |
862 |
863 |
864 |
865 |
866 |
867 |
868 |
869 |
870 |
871 |
872 |
873 |
874 |
875 |
876 |
877 |
878 |
879 |
880 |
881 |
882 |
883 |
884 |
885 |
886 |
887 |
888 |
889 |
890 |
891 |
892 |
893 |
894 |
895 |
896 |
897 |
898 |
899 |
900 |
901 |
902 |
903 |
904 |
905 |
906 |
907 |
908 |
909 |
910 |
911 |
912 |
913 |
914 |
915 |
916 |
917 |
918 |
919 |
920 |
921 |
922 |
923 |
924 |
925 |
926 |
927 |
928 |
929 |
930 |
931 |
932 |
933 |
934 |
935 |
936 |
937 |
--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Mining Practice
2 | 일상 속의 데이터를 이용해서 데이터 분석하는 연습장입니다.
3 |
4 |
5 | > 블로그 : http://ssoonidev.tistory.com
6 |
7 | ## regex_nlp_kko
8 | 카카오톡 대화 내용을 NLP에 적용해 보았습니다.
9 | > 실제 대화내용을 기반으로 작성되어 DataSet 공개가 어렵습니다.
10 |
11 | 1. 한글 형태소 분석기 **Kkma**로 Tokenize 적용
12 | 2. Token 들의 빈도를 기준으로 **WordCloud** 생성
13 | 3. **Word2Vec**로 Word Embedding 적용
14 | 4. Word Vector로 **DBSCAN** 클러스터링 적용
15 | 5. Cosine Similarity으로 유사도 측정
16 |
17 | ## dust_weather
18 | T시의 서울시 강남구의 미세먼지 농도를 T-20 ~ T-1 구간의 기상과 미세먼지 농도를 이용해서 모델링
19 |
20 | > **데이터 출처**
21 | > 서울시 열린데이터광장 (http://data.seoul.go.kr)
22 | > 공공데이터포털 (https://www.data.go.kr/)
23 |
24 | 1. API Call을 하여 Raw Data 구성하기 : https://ssoonidev.tistory.com/102
25 | 2. **GRU** 모델 적용 : https://ssoonidev.tistory.com/105
26 |
27 | Layer 1
28 | 
29 |
30 | Layer 3
31 | 
32 |
33 |
34 |
35 | ## Kaggle : Tabular Playground Series - Jan 2021
36 | 캐글에서 자주 사용하는 XGBoost를 사용해보았습니다.
37 |
38 | 1. XGBoost Regression 사용 : https://ssoonidev.tistory.com/106
39 | 2. Optuna F/W 로 하이퍼 파라미터 최적화 : https://ssoonidev.tistory.com/107
40 |
--------------------------------------------------------------------------------
/dust_weather/callAPI.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pandas as pd
3 | import os
4 | import datetime as dt
5 | import matplotlib.pyplot as plt
6 | import urllib3
7 |
8 | from time import sleep
9 |
10 |
def call_api(api_name, start_date, end_date, dir_name):
    """Fetch daily air-quality rows from the Seoul open API, one CSV per day.

    Args:
        api_name: API endpoint name (e.g. "TimeAverageAirQuality").
        start_date: first date (inclusive), any format pd.date_range accepts.
        end_date: last date (inclusive).
        dir_name: sub-directory of ./raw_data to write the CSV files into.
    """
    # The API key is kept out of the repository; read it from a local file.
    # Bug fix: the original open(...).readlines() leaked the file handle —
    # use a context manager so it is closed deterministically.
    with open("./raw_data/api_key") as key_file:
        api_key = key_file.readline().strip()
    url_format = 'http://openAPI.seoul.go.kr:8088/{api_key}/json/{api_name}/1/{end_index}/{date}'
    headers = {'content-type': 'application/json;charset=utf-8'}

    for date in pd.date_range(start_date, end_date).strftime("%Y%m%d"):
        # First call with end_index=1 only to learn how many rows exist that day.
        url = url_format.format(api_name=api_name, api_key=api_key, end_index=1, date=date)
        response = requests.get(url, headers=headers)
        end_index = response.json()[api_name]["list_total_count"]
        print("Max Count(%s): %s" % (date, end_index))

        # Second call fetches every row for the day.
        url = url_format.format(api_name=api_name, api_key=api_key, end_index=end_index, date=date)
        response = requests.get(url, headers=headers)
        result = pd.DataFrame(response.json()[api_name]["row"])
        # NOTE(review): the file prefix is hard-coded to "dust_" even though
        # dir_name is a parameter — confirm this is intended before reusing
        # this function for non-dust endpoints.
        result.to_csv("./raw_data/%s/dust_%s.csv" % (dir_name, date), index=False, encoding="utf-8")

        # Be kind to the API: pause between daily requests.
        sleep(0.5)
33 |
def call_weather_api(start_date, end_date):
    """Fetch hourly ASOS weather observations (station 108) and save one CSV per day.

    Args:
        start_date: first date (inclusive), any format pd.date_range accepts.
        end_date: last date (inclusive).
    """
    # The API key is kept out of the repository; read it from a local file.
    # Bug fix: the original open(...).readlines() leaked the file handle —
    # use a context manager so it is closed deterministically.
    with open("./raw_data/weather_api") as key_file:
        api_key = key_file.readline().strip()
    url_format = 'https://data.kma.go.kr/apiData/getData?type=json&dataCd=ASOS&dateCd=HR&startDt={date}&startHh=00&endDt={date}&endHh=23&stnIds={snt_id}&schListCnt=100&pageIndex=1&apiKey={api_key}'

    headers = {'content-type': 'application/json;charset=utf-8'}
    # The request below uses verify=False; silence the resulting TLS warnings.
    urllib3.disable_warnings()

    for date in pd.date_range(start_date, end_date).strftime("%Y%m%d"):
        print("%s Weather" % date)
        url = url_format.format(api_key=api_key, date=date, snt_id="108")
        response = requests.get(url, headers=headers, verify=False)

        # Only write a file for a successful (HTTP 200) response.
        print(response.status_code)
        if response.status_code == 200:
            result = pd.DataFrame(response.json()[-1]["info"])
            print(result.head())
            result.to_csv("./raw_data/weather/weather_%s.csv" % date, index=False, encoding="utf-8")

        # Be kind to the API: pause between daily requests.
        sleep(0.5)
57 |
def concat_data():
    """Merge all per-day dust and weather CSVs into a single HDF5 store.

    Reads every CSV under ./raw_data/dust and ./raw_data/weather, indexes both
    frames by their measurement datetime, inner-joins them on that index, and
    persists the "master", "dust" and "weather" frames to ./raw_data/data.hdf.
    """
    df_list = list()

    # Read every file under ./raw_data/dust.
    for root, dirs, files in os.walk("./raw_data/dust", topdown=False):
        for name in files:
            df_list.append(pd.read_csv(os.path.join(root, name)))

    # pd.concat already returns a DataFrame; the extra pd.DataFrame(...)
    # wrapper in the original was a no-op and has been removed.
    dust = pd.concat(df_list, sort=False)

    # Convert the measurement timestamp (e.g. 200901010000) into a datetime index.
    dust["MSRDT"] = dust["MSRDT"].apply(lambda x: dt.datetime.strptime(str(x), "%Y%m%d%H%M"))
    dust = dust.set_index("MSRDT")

    df_list.clear()

    # Read every file under ./raw_data/weather.
    for root, dirs, files in os.walk("./raw_data/weather", topdown=False):
        for name in files:
            df_list.append(pd.read_csv(os.path.join(root, name)))
    weather = pd.concat(df_list, sort=False)

    # Convert the observation time ("YYYY-MM-DD HH:MM") into a datetime index.
    weather["TM"] = weather["TM"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d %H:%M"))
    weather = weather.set_index("TM")

    # join() aligns on the shared datetime index; keep only timestamps
    # present in both frames.
    master = weather.join(dust, how="inner")

    master.to_hdf("./raw_data/data.hdf", "master")
    dust.to_hdf("./raw_data/data.hdf", "dust")
    weather.to_hdf("./raw_data/data.hdf", "weather")
91 |
def describe_dust_data():
    """Print the set of monitoring-station names and draw a PM25 boxplot per station."""
    master = pd.read_hdf("./raw_data/data.hdf", "master")

    # Unique station names, NaNs dropped.
    station_names = set(master["MSRSTE_NM"].dropna().tolist())
    print(station_names)

    # Use a Korean-capable font so station names render in the plot labels.
    plt.rcParams["font.family"] = 'D2Coding'
    plt.rcParams["font.size"] = 10

    figure, axis = plt.subplots()
    master.boxplot(column='PM25', by='MSRSTE_NM', ax=axis)
    plt.show()
106 |
if __name__ == '__main__':
    # Pipeline entry point: fetch raw data via the open APIs, merge the daily
    # CSVs into one HDF5 store, then visualize the dust distribution.
    # The dust fetch is commented out — presumably it was already run once.
    # call_api("TimeAverageAirQuality", "2009-01-01", "2019-01-01", "dust")
    call_weather_api("2009-01-01", "2019-01-01")
    concat_data()
    describe_dust_data()
--------------------------------------------------------------------------------
/dust_weather/correlation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 |
5 |
def load_data():
    """Load the pre-built "master" frame from the HDF5 store."""
    master = pd.read_hdf("./raw_data/data.hdf", "master")
    return master
9 |
def correlation(master):
    """Plot a Spearman correlation heatmap of the weather/dust features.

    Args:
        master: joined weather+dust DataFrame indexed by datetime
            (as produced by concat_data / load_data).
    """
    # Forward-fill gaps (ffill() is the non-deprecated spelling of
    # fillna(method='ffill')), drop implausible PM25 outliers, and average
    # duplicate timestamps (one row per monitoring station per hour).
    preprocess = master.ffill()
    preprocess = preprocess[preprocess["PM25"] <= 500]
    preprocess = preprocess.groupby(preprocess.index).mean()

    corr = preprocess.corr(method='spearman')
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Heatmap of the correlation matrix, fixed to the [-1, 1] range.
    cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(0, len(corr.index), 1)

    ax.set_xticks(ticks)
    plt.xticks(rotation=90)

    ax.set_yticks(ticks)
    ax.set_xticklabels(corr.columns)
    ax.set_yticklabels(corr.index)
    # Bug fix: plt.show() was called twice in the original; once is enough.
    plt.show()
32 |
if __name__ == '__main__':
    # Load the merged weather/dust dataset and draw the correlation heatmap.
    master = load_data()
    correlation(master)
--------------------------------------------------------------------------------
/dust_weather/raw_data/dust/dust_20090101.csv:
--------------------------------------------------------------------------------
1 | CO,MSRDT,MSRSTE_NM,NO2,O3,PM10,PM25,SO2
2 | 0.4,200901010000,강남구,0.023,0.015,48.0,14.0,0.006
3 | 0.4,200901010100,강남구,0.027,0.012,35.0,17.0,0.006
4 | 0.4,200901010200,강남구,0.024,0.014,33.0,13.0,0.006
5 | 0.4,200901010300,강남구,0.018,0.016,52.0,14.0,0.006
6 | 0.3,200901010400,강남구,0.017,0.015,26.0,17.0,0.006
7 | 0.3,200901010500,강남구,0.014,0.017,38.0,13.0,0.005
8 | 0.3,200901010600,강남구,0.014,0.016,38.0,20.0,0.005
9 | 0.3,200901010700,강남구,0.017,0.014,34.0,22.0,0.006
10 | 0.4,200901010800,강남구,0.018,0.014,52.0,28.0,0.005
11 | 0.4,200901010900,강남구,0.028,0.008,23.0,18.0,0.005
12 | 0.4,200901011000,강남구,0.026,0.011,27.0,13.0,0.006
13 | 0.5,200901011100,강남구,0.026,0.011,30.0,12.0,0.006
14 | 0.5,200901011200,강남구,0.024,0.013,33.0,21.0,0.007
15 | 0.5,200901011300,강남구,0.026,0.013,35.0,15.0,0.008
16 | 0.5,200901011400,강남구,0.021,0.017,52.0,17.0,0.006
17 | 0.4,200901011500,강남구,0.018,0.019,46.0,19.0,0.007
18 | 0.4,200901011600,강남구,0.023,0.017,52.0,18.0,0.007
19 | 0.4,200901011700,강남구,0.025,0.016,59.0,17.0,0.006
20 | 0.4,200901011800,강남구,0.031,0.012,41.0,15.0,0.006
21 | 0.4,200901011900,강남구,0.03,0.012,34.0,19.0,0.005
22 | 0.4,200901012000,강남구,0.03,0.01,47.0,24.0,0.006
23 | 0.4,200901012100,강남구,0.043,0.003,37.0,16.0,0.006
24 | 0.6,200901012200,강남구,0.049,0.001,53.0,25.0,0.006
25 | 0.6,200901012300,강남구,0.048,0.0,36.0,13.0,0.006
26 | 0.3,200901010000,강동구,0.024,0.011,39.0,13.0,0.007
27 | 0.3,200901010100,강동구,0.022,0.012,38.0,16.0,0.007
28 | 0.3,200901010200,강동구,0.02,0.012,38.0,13.0,0.007
29 | 0.3,200901010300,강동구,0.019,0.012,41.0,7.0,0.006
30 | 0.3,200901010400,강동구,0.022,0.009,47.0,15.0,0.006
31 | 0.3,200901010500,강동구,0.019,0.011,48.0,10.0,0.006
32 | 0.3,200901010600,강동구,0.019,0.011,41.0,16.0,0.006
33 | 0.3,200901010700,강동구,0.023,0.009,42.0,14.0,0.006
34 | 0.3,200901010800,강동구,0.022,0.008,34.0,11.0,0.006
35 | 0.3,200901010900,강동구,0.021,0.01,50.0,26.0,0.007
36 | 0.3,200901011000,강동구,0.018,0.012,45.0,11.0,0.006
37 | 0.3,200901011100,강동구,0.011,0.015,35.0,21.0,0.006
38 | 0.3,200901011200,강동구,0.01,0.016,55.0,14.0,0.007
39 | 0.3,200901011300,강동구,0.013,0.016,50.0,9.0,0.006
40 | 0.4,200901011400,강동구,0.024,0.013,58.0,19.0,0.007
41 | 0.4,200901011500,강동구,0.022,0.013,47.0,22.0,0.007
42 | 0.4,200901011600,강동구,0.026,0.012,43.0,16.0,0.007
43 | 0.3,200901011700,강동구,0.027,0.011,52.0,23.0,0.007
44 | 0.3,200901011800,강동구,0.033,0.008,50.0,13.0,0.006
45 | 0.4,200901011900,강동구,0.035,0.008,50.0,15.0,0.006
46 | 0.3,200901012000,강동구,0.035,0.007,39.0,15.0,0.007
47 | 0.4,200901012100,강동구,0.044,0.003,46.0,21.0,0.006
48 | 0.5,200901012200,강동구,0.05,0.001,50.0,26.0,0.007
49 | 0.6,200901012300,강동구,0.052,0.002,55.0,15.0,0.008
50 | 0.5,200901010000,강북구,0.028,0.021,46.0,12.0,0.008
51 | 0.5,200901010100,강북구,0.027,0.018,49.0,38.0,0.007
52 | 0.5,200901010200,강북구,0.025,0.02,35.0,26.0,0.006
53 | 0.5,200901010300,강북구,0.023,0.02,43.0,18.0,0.007
54 | 0.5,200901010400,강북구,0.022,0.021,42.0,24.0,0.006
55 | 0.5,200901010500,강북구,0.021,0.02,46.0,10.0,0.006
56 | 0.5,200901010600,강북구,0.022,0.02,34.0,27.0,0.007
57 | 0.5,200901010700,강북구,0.024,0.013,34.0,24.0,0.007
58 | 0.6,200901010800,강북구,0.029,0.007,48.0,13.0,0.008
59 | 0.6,200901010900,강북구,0.029,0.009,45.0,19.0,0.008
60 | 0.6,200901011000,강북구,0.029,0.009,42.0,14.0,0.008
61 | 0.5,200901011100,강북구,0.027,0.02,42.0,28.0,0.007
62 | 0.5,200901011200,강북구,0.025,0.02,31.0,33.0,0.007
63 | 0.5,200901011300,강북구,0.025,0.024,42.0,29.0,0.007
64 | 0.5,200901011400,강북구,0.025,0.026,50.0,25.0,0.006
65 | 0.5,200901011500,강북구,0.025,0.028,35.0,23.0,0.006
66 | 0.4,200901011600,강북구,0.023,0.03,40.0,34.0,0.006
67 | 0.5,200901011700,강북구,0.024,0.024,34.0,30.0,0.006
68 | 0.5,200901011800,강북구,0.027,0.023,0.0,25.0,0.006
69 | 0.6,200901011900,강북구,0.028,0.017,38.0,31.0,0.008
70 | 0.8,200901012000,강북구,0.033,0.007,49.0,37.0,0.008
71 | 0.8,200901012100,강북구,0.034,0.007,48.0,21.0,0.008
72 | 0.9,200901012200,강북구,0.037,0.004,38.0,36.0,0.009
73 | 0.8,200901012300,강북구,0.037,0.005,46.0,21.0,0.009
74 | 0.5,200901010000,강서구,0.026,0.013,43.0,18.0,0.006
75 | 0.5,200901010100,강서구,0.019,0.019,40.0,19.0,0.005
76 | 0.5,200901010200,강서구,0.018,0.018,41.0,18.0,0.005
77 | 0.5,200901010300,강서구,0.016,0.019,37.0,18.0,0.005
78 | 0.5,200901010400,강서구,0.013,0.021,39.0,18.0,0.005
79 | 0.5,200901010500,강서구,0.013,0.018,39.0,20.0,0.005
80 | 0.5,200901010600,강서구,0.013,0.019,38.0,20.0,0.006
81 | 0.5,200901010700,강서구,0.015,0.017,43.0,18.0,0.006
82 | 0.9,200901010800,강서구,0.029,0.005,46.0,24.0,0.007
83 | 0.9,200901010900,강서구,0.023,0.01,51.0,24.0,0.007
84 | 0.9,200901011000,강서구,0.024,0.012,48.0,23.0,0.008
85 | 0.8,200901011100,강서구,0.016,0.021,47.0,27.0,0.007
86 | 0.8,200901011200,강서구,0.013,0.023,40.0,23.0,0.007
87 | 0.7,200901011300,강서구,0.015,0.024,46.0,25.0,0.007
88 | 0.8,200901011400,강서구,0.013,0.027,45.0,21.0,0.006
89 | 0.7,200901011500,강서구,0.013,0.028,39.0,24.0,0.005
90 | 0.7,200901011600,강서구,0.012,0.03,42.0,23.0,0.005
91 | 0.7,200901011700,강서구,0.016,0.027,42.0,25.0,0.005
92 | 0.7,200901011800,강서구,0.023,0.019,38.0,24.0,0.005
93 | 0.8,200901011900,강서구,0.04,0.006,43.0,19.0,0.005
94 | 0.8,200901012000,강서구,0.042,0.004,39.0,23.0,0.006
95 | 0.8,200901012100,강서구,0.045,0.002,52.0,25.0,0.006
96 | 0.9,200901012200,강서구,0.045,0.002,49.0,29.0,0.005
97 | 0.8,200901012300,강서구,0.045,0.003,45.0,21.0,0.006
98 | 0.5,200901010000,관악구,0.024,0.0,70.0,19.0,0.007
99 | 0.4,200901010100,관악구,0.022,0.0,62.0,13.0,0.007
100 | 0.4,200901010200,관악구,0.019,0.012,61.0,11.0,0.008
101 | 0.4,200901010300,관악구,0.016,0.044,37.0,11.0,0.006
102 | 0.4,200901010400,관악구,0.019,0.029,44.0,10.0,0.006
103 | 0.4,200901010500,관악구,0.02,0.023,43.0,11.0,0.007
104 | 0.4,200901010600,관악구,0.018,0.023,32.0,14.0,0.008
105 | 0.5,200901010700,관악구,0.024,0.013,64.0,15.0,0.003
106 | 0.5,200901010800,관악구,0.029,0.008,37.0,19.0,0.004
107 | 0.5,200901010900,관악구,0.022,0.016,45.0,15.0,0.003
108 | 0.5,200901011000,관악구,0.02,0.022,51.0,14.0,0.006
109 | 0.4,200901011100,관악구,0.015,0.036,33.0,13.0,0.01
110 | 0.4,200901011200,관악구,0.014,0.0,36.0,15.0,0.007
111 | 0.4,200901011300,관악구,0.015,0.0,44.0,19.0,0.01
112 | 0.5,200901011400,관악구,0.016,0.0,41.0,16.0,0.011
113 | 0.4,200901011500,관악구,0.017,0.0,39.0,19.0,0.008
114 | 0.4,200901011600,관악구,0.017,0.013,42.0,16.0,0.007
115 | 0.4,200901011700,관악구,0.018,0.017,50.0,17.0,0.005
116 | 0.5,200901011800,관악구,0.023,0.013,44.0,13.0,0.007
117 | 0.6,200901011900,관악구,0.032,0.0,40.0,20.0,0.011
118 | 0.6,200901012000,관악구,0.038,0.0,51.0,13.0,0.006
119 | 0.6,200901012100,관악구,0.038,0.0,37.0,17.0,0.003
120 | 0.6,200901012200,관악구,0.034,0.0,36.0,12.0,0.003
121 | 0.6,200901012300,관악구,0.037,0.001,41.0,16.0,0.006
122 | 0.6,200901010000,광진구,0.023,0.017,41.0,22.0,0.005
123 | 0.5,200901010100,광진구,0.015,0.023,41.0,19.0,0.004
124 | 0.4,200901010200,광진구,0.015,0.022,41.0,17.0,0.004
125 | 0.4,200901010300,광진구,0.014,0.023,40.0,18.0,0.004
126 | 0.4,200901010400,광진구,0.015,0.021,40.0,21.0,0.003
127 | 0.4,200901010500,광진구,0.011,0.025,38.0,13.0,0.003
128 | 0.5,200901010600,광진구,0.019,0.018,41.0,21.0,0.003
129 | 0.5,200901010700,광진구,0.024,0.012,39.0,19.0,0.003
130 | 0.6,200901010800,광진구,0.029,0.008,44.0,19.0,0.003
131 | 0.5,200901010900,광진구,0.021,0.013,41.0,19.0,0.004
132 | 0.5,200901011000,광진구,0.012,0.022,40.0,21.0,0.005
133 | 0.5,200901011100,광진구,0.01,0.026,39.0,16.0,0.005
134 | 0.5,200901011200,광진구,0.012,0.026,37.0,19.0,0.005
135 | 0.6,200901011300,광진구,0.018,0.023,37.0,23.0,0.005
136 | 0.6,200901011400,광진구,0.019,0.022,43.0,27.0,0.005
137 | 0.5,200901011500,광진구,0.018,0.024,44.0,26.0,0.004
138 | 0.5,200901011600,광진구,0.019,0.023,42.0,27.0,0.004
139 | 0.5,200901011700,광진구,0.022,0.021,41.0,35.0,0.004
140 | 0.5,200901011800,광진구,0.022,0.02,39.0,28.0,0.003
141 | 0.5,200901011900,광진구,0.025,0.019,41.0,24.0,0.003
142 | 0.6,200901012000,광진구,0.033,0.011,39.0,24.0,0.004
143 | 0.8,200901012100,광진구,0.032,0.011,0.0,20.0,0.004
144 | 0.6,200901012200,광진구,0.034,0.009,49.0,28.0,0.004
145 | 0.5,200901012300,광진구,0.032,0.008,44.0,23.0,0.003
146 | 0.5,200901010000,구로구,0.026,0.015,36.0,10.0,0.005
147 | 0.5,200901010100,구로구,0.022,0.016,36.0,16.0,0.005
148 | 0.4,200901010200,구로구,0.015,0.022,34.0,14.0,0.004
149 | 0.5,200901010300,구로구,0.018,0.019,33.0,16.0,0.004
150 | 0.4,200901010400,구로구,0.018,0.018,34.0,12.0,0.004
151 | 0.5,200901010500,구로구,0.019,0.017,32.0,15.0,0.005
152 | 0.5,200901010600,구로구,0.018,0.016,41.0,17.0,0.005
153 | 0.5,200901010700,구로구,0.023,0.012,37.0,19.0,0.005
154 | 0.5,200901010800,구로구,0.022,0.012,43.0,17.0,0.005
155 | 0.6,200901010900,구로구,0.023,0.012,42.0,23.0,0.005
156 | 0.6,200901011000,구로구,0.019,0.016,46.0,18.0,0.006
157 | 0.5,200901011100,구로구,0.012,0.022,41.0,6.0,0.006
158 | 0.5,200901011200,구로구,0.011,0.025,37.0,23.0,0.006
159 | 0.5,200901011300,구로구,0.014,0.025,44.0,19.0,0.006
160 | 0.5,200901011400,구로구,0.015,0.025,44.0,26.0,0.006
161 | 0.5,200901011500,구로구,0.015,0.027,42.0,31.0,0.005
162 | 0.5,200901011600,구로구,0.015,0.028,49.0,34.0,0.005
163 | 0.5,200901011700,구로구,0.019,0.025,49.0,43.0,0.005
164 | 0.5,200901011800,구로구,0.024,0.02,45.0,29.0,0.005
165 | 0.6,200901011900,구로구,0.036,0.01,41.0,17.0,0.004
166 | 0.6,200901012000,구로구,0.038,0.007,36.0,28.0,0.004
167 | 0.6,200901012100,구로구,0.037,0.007,42.0,16.0,0.004
168 | 0.6,200901012200,구로구,0.038,0.006,39.0,20.0,0.004
169 | 0.5,200901012300,구로구,0.035,0.008,41.0,19.0,0.004
170 | 0.7,200901010000,구로구2,0.019,0.013,39.0,0.0,0.006
171 | 0.6,200901010000,금천구,0.027,0.015,26.0,13.0,0.006
172 | 0.5,200901010100,금천구,0.022,0.019,51.0,8.0,0.005
173 | 0.5,200901010200,금천구,0.017,0.023,41.0,10.0,0.004
174 | 0.4,200901010300,금천구,0.018,0.021,25.0,7.0,0.004
175 | 0.4,200901010400,금천구,0.016,0.021,32.0,15.0,0.005
176 | 0.5,200901010500,금천구,0.017,0.018,41.0,4.0,0.004
177 | 0.5,200901010600,금천구,0.017,0.018,35.0,16.0,0.005
178 | 0.6,200901010700,금천구,0.023,0.012,27.0,11.0,0.005
179 | 0.6,200901010800,금천구,0.027,0.008,53.0,7.0,0.005
180 | 0.6,200901010900,금천구,0.026,0.009,43.0,16.0,0.005
181 | 0.6,200901011000,금천구,0.02,0.016,29.0,17.0,0.007
182 | 0.6,200901011100,금천구,0.015,0.022,48.0,15.0,0.006
183 | 0.6,200901011200,금천구,0.014,0.025,25.0,23.0,0.006
184 | 0.6,200901011300,금천구,0.016,0.025,38.0,32.0,0.006
185 | 0.7,200901011400,금천구,0.019,0.024,46.0,24.0,0.007
186 | 0.6,200901011500,금천구,0.019,0.026,35.0,33.0,0.006
187 | 0.5,200901011600,금천구,0.018,0.028,38.0,27.0,0.006
188 | 0.5,200901011700,금천구,0.02,0.027,51.0,28.0,0.006
189 | 0.5,200901011800,금천구,0.026,0.021,57.0,18.0,0.005
190 | 0.6,200901011900,금천구,0.03,0.017,27.0,10.0,0.005
191 | 0.7,200901012000,금천구,0.039,0.009,37.0,8.0,0.005
192 | 0.9,200901012100,금천구,0.045,0.004,47.0,7.0,0.007
193 | 0.7,200901012200,금천구,0.038,0.007,34.0,11.0,0.005
194 | 0.7,200901012300,금천구,0.039,0.006,34.0,14.0,0.004
195 | 0.4,200901010000,노원구,0.02,0.022,45.0,22.0,0.007
196 | 0.4,200901010100,노원구,0.015,0.026,50.0,17.0,0.007
197 | 0.4,200901010200,노원구,0.019,0.022,40.0,15.0,0.006
198 | 0.5,200901010300,노원구,0.017,0.023,43.0,11.0,0.006
199 | 0.4,200901010400,노원구,0.014,0.025,51.0,26.0,0.006
200 | 0.4,200901010500,노원구,0.012,0.025,47.0,21.0,0.006
201 | 0.4,200901010600,노원구,0.013,0.025,53.0,17.0,0.006
202 | 0.4,200901010700,노원구,0.013,0.024,49.0,12.0,0.006
203 | 0.4,200901010800,노원구,0.012,0.025,50.0,14.0,0.006
204 | 0.5,200901010900,노원구,0.013,0.023,63.0,12.0,0.006
205 | 0.5,200901011000,노원구,0.012,0.025,53.0,10.0,0.007
206 | 0.5,200901011100,노원구,0.012,0.027,59.0,15.0,0.006
207 | 0.5,200901011200,노원구,0.014,0.026,0.0,18.0,0.008
208 | 0.4,200901011300,노원구,0.012,0.03,56.0,15.0,0.007
209 | 0.4,200901011400,노원구,0.011,0.032,56.0,14.0,0.006
210 | 0.4,200901011500,노원구,0.011,0.033,47.0,16.0,0.006
211 | 0.4,200901011600,노원구,0.012,0.032,47.0,19.0,0.005
212 | 0.4,200901011700,노원구,0.019,0.027,53.0,19.0,0.005
213 | 0.5,200901011800,노원구,0.026,0.023,41.0,18.0,0.006
214 | 0.7,200901011900,노원구,0.033,0.015,55.0,18.0,0.007
215 | 0.7,200901012000,노원구,0.041,0.009,51.0,28.0,0.008
216 | 1.3,200901012100,노원구,0.056,0.003,0.0,26.0,0.013
217 | 0.9,200901012200,노원구,0.051,0.003,57.0,18.0,0.011
218 | 1.0,200901012300,노원구,0.052,0.003,58.0,17.0,0.011
219 | 0.4,200901010000,도봉구,0.02,0.023,45.0,16.0,0.006
220 | 0.4,200901010100,도봉구,0.016,0.024,43.0,20.0,0.006
221 | 0.4,200901010200,도봉구,0.016,0.024,42.0,19.0,0.005
222 | 0.4,200901010300,도봉구,0.018,0.022,43.0,18.0,0.005
223 | 0.4,200901010400,도봉구,0.017,0.022,44.0,18.0,0.005
224 | 0.5,200901010500,도봉구,0.027,0.014,43.0,21.0,0.006
225 | 0.6,200901010600,도봉구,0.031,0.01,44.0,22.0,0.007
226 | 0.6,200901010700,도봉구,0.033,0.008,44.0,19.0,0.006
227 | 0.6,200901010800,도봉구,0.036,0.006,47.0,19.0,0.007
228 | 0.6,200901010900,도봉구,0.033,0.008,44.0,22.0,0.007
229 | 0.6,200901011000,도봉구,0.025,0.014,36.0,15.0,0.008
230 | 0.5,200901011100,도봉구,0.017,0.021,45.0,19.0,0.007
231 | 0.5,200901011200,도봉구,0.016,0.023,43.0,23.0,0.007
232 | 0.5,200901011300,도봉구,0.016,0.025,48.0,24.0,0.007
233 | 0.5,200901011400,도봉구,0.015,0.027,46.0,23.0,0.006
234 | 0.4,200901011500,도봉구,0.015,0.028,44.0,22.0,0.005
235 | 0.4,200901011600,도봉구,0.015,0.028,45.0,17.0,0.005
236 | 0.4,200901011700,도봉구,0.017,0.028,46.0,28.0,0.004
237 | 0.5,200901011800,도봉구,0.021,0.026,47.0,24.0,0.005
238 | 0.5,200901011900,도봉구,0.021,0.026,46.0,25.0,0.004
239 | 0.8,200901012000,도봉구,0.042,0.013,54.0,30.0,0.006
240 | 1.3,200901012100,도봉구,0.059,0.004,53.0,31.0,0.009
241 | 1.2,200901012200,도봉구,0.058,0.002,66.0,41.0,0.011
242 | 1.4,200901012300,도봉구,0.06,0.002,62.0,42.0,0.011
243 | 0.4,200901010000,동대문구,0.021,0.022,32.0,24.0,0.006
244 | 0.4,200901010100,동대문구,0.016,0.026,35.0,18.0,0.008
245 | 0.4,200901010200,동대문구,0.015,0.025,26.0,27.0,0.004
246 | 0.4,200901010300,동대문구,0.014,0.024,23.0,15.0,0.003
247 | 0.4,200901010400,동대문구,0.015,0.023,35.0,16.0,0.01
248 | 0.4,200901010500,동대문구,0.013,0.024,21.0,15.0,0.009
249 | 0.4,200901010600,동대문구,0.018,0.019,26.0,11.0,0.004
250 | 0.5,200901010700,동대문구,0.023,0.013,32.0,22.0,0.004
251 | 0.5,200901010800,동대문구,0.027,0.012,30.0,15.0,0.004
252 | 0.6,200901010900,동대문구,0.031,0.009,15.0,22.0,0.008
253 | 0.6,200901011000,동대문구,0.026,0.014,20.0,15.0,0.008
254 | 0.5,200901011100,동대문구,0.02,0.02,21.0,13.0,0.009
255 | 0.6,200901011200,동대문구,0.019,0.021,25.0,25.0,0.008
256 | 0.7,200901011300,동대문구,0.021,0.021,27.0,27.0,0.012
257 | 0.5,200901011400,동대문구,0.013,0.029,31.0,31.0,0.006
258 | 0.5,200901011500,동대문구,0.016,0.028,28.0,21.0,0.008
259 | 0.6,200901011600,동대문구,0.017,0.028,32.0,19.0,0.005
260 | 0.6,200901011700,동대문구,0.02,0.026,43.0,21.0,0.005
261 | 0.7,200901011800,동대문구,0.022,0.025,44.0,14.0,0.004
262 | 0.7,200901011900,동대문구,0.027,0.019,29.0,22.0,0.012
263 | 0.8,200901012000,동대문구,0.035,0.012,38.0,23.0,0.006
264 | 0.7,200901012100,동대문구,0.038,0.009,35.0,28.0,0.007
265 | 0.8,200901012200,동대문구,0.04,0.007,30.0,19.0,0.005
266 | 0.7,200901012300,동대문구,0.039,0.007,31.0,23.0,0.005
267 | 0.4,200901010000,동작구,0.027,0.015,51.0,14.0,0.006
268 | 0.3,200901010100,동작구,0.021,0.019,46.0,36.0,0.005
269 | 0.3,200901010200,동작구,0.02,0.019,42.0,23.0,0.004
270 | 0.3,200901010300,동작구,0.02,0.019,44.0,33.0,0.005
271 | 0.3,200901010400,동작구,0.016,0.02,39.0,26.0,0.005
272 | 0.5,200901010500,동작구,0.019,0.016,48.0,5.0,0.005
273 | 0.5,200901010600,동작구,0.022,0.014,46.0,34.0,0.005
274 | 0.4,200901010700,동작구,0.026,0.011,54.0,20.0,0.006
275 | 0.4,200901010800,동작구,0.027,0.01,52.0,16.0,0.005
276 | 0.4,200901010900,동작구,0.021,0.015,56.0,32.0,0.006
277 | 0.4,200901011000,동작구,0.018,0.018,52.0,40.0,0.006
278 | 0.4,200901011100,동작구,0.017,0.021,0.0,28.0,0.007
279 | 0.4,200901011200,동작구,0.017,0.022,51.0,31.0,0.007
280 | 0.4,200901011300,동작구,0.017,0.023,55.0,25.0,0.007
281 | 0.4,200901011400,동작구,0.017,0.025,52.0,19.0,0.006
282 | 0.4,200901011500,동작구,0.018,0.025,50.0,20.0,0.006
283 | 0.4,200901011600,동작구,0.02,0.024,50.0,26.0,0.005
284 | 0.3,200901011700,동작구,0.02,0.025,65.0,34.0,0.005
285 | 0.4,200901011800,동작구,0.023,0.021,0.0,14.0,0.005
286 | 0.4,200901011900,동작구,0.027,0.019,50.0,18.0,0.004
287 | 0.4,200901012000,동작구,0.04,0.008,62.0,30.0,0.005
288 | 0.5,200901012100,동작구,0.038,0.007,73.0,37.0,0.005
289 | 0.6,200901012200,동작구,0.044,0.004,49.0,23.0,0.005
290 | 0.5,200901012300,동작구,0.041,0.005,45.0,28.0,0.004
291 | 0.4,200901010000,마포구,0.024,0.013,46.0,13.0,0.01
292 | 0.4,200901010100,마포구,0.026,0.011,40.0,14.0,0.006
293 | 0.4,200901010200,마포구,0.021,0.013,32.0,16.0,0.003
294 | 0.3,200901010300,마포구,0.016,0.018,35.0,16.0,0.007
295 | 0.3,200901010400,마포구,0.014,0.019,48.0,17.0,0.0
296 | 0.3,200901010500,마포구,0.013,0.018,47.0,10.0,0.003
297 | 0.4,200901010600,마포구,0.018,0.014,46.0,15.0,0.009
298 | 0.4,200901010700,마포구,0.018,0.014,46.0,17.0,0.007
299 | 0.4,200901010800,마포구,0.021,0.011,33.0,21.0,0.004
300 | 0.5,200901010900,마포구,0.021,0.011,32.0,14.0,0.006
301 | 0.4,200901011000,마포구,0.017,0.016,45.0,15.0,0.004
302 | 0.4,200901011100,마포구,0.012,0.021,42.0,10.0,0.007
303 | 0.4,200901011200,마포구,0.014,0.02,40.0,9.0,0.007
304 | 0.4,200901011300,마포구,0.012,0.023,48.0,19.0,0.006
305 | 0.4,200901011400,마포구,0.011,0.025,50.0,12.0,0.005
306 | 0.4,200901011500,마포구,0.014,0.024,52.0,17.0,0.009
307 | 0.4,200901011600,마포구,0.015,0.023,48.0,17.0,0.007
308 | 0.4,200901011700,마포구,0.02,0.021,55.0,15.0,0.007
309 | 0.4,200901011800,마포구,0.024,0.018,46.0,20.0,0.005
310 | 0.4,200901011900,마포구,0.028,0.012,42.0,19.0,0.009
311 | 0.5,200901012000,마포구,0.038,0.004,53.0,15.0,0.0
312 | 0.5,200901012100,마포구,0.036,0.004,49.0,18.0,0.006
313 | 0.6,200901012200,마포구,0.04,0.002,44.0,14.0,0.005
314 | 0.6,200901012300,마포구,0.04,0.002,40.0,15.0,0.01
315 | 0.4,200901010000,서대문구,0.021,0.013,40.0,3.0,0.005
316 | 0.3,200901010100,서대문구,0.022,0.013,22.0,13.0,0.005
317 | 0.3,200901010200,서대문구,0.013,0.019,27.0,8.0,0.004
318 | 0.3,200901010300,서대문구,0.012,0.019,32.0,10.0,0.004
319 | 0.3,200901010400,서대문구,0.011,0.019,28.0,20.0,0.005
320 | 0.4,200901010500,서대문구,0.012,0.017,35.0,18.0,0.004
321 | 0.4,200901010600,서대문구,0.015,0.014,35.0,12.0,0.005
322 | 0.3,200901010700,서대문구,0.015,0.014,42.0,7.0,0.005
323 | 0.4,200901010800,서대문구,0.016,0.013,28.0,16.0,0.005
324 | 0.4,200901010900,서대문구,0.016,0.013,33.0,18.0,0.005
325 | 0.3,200901011000,서대문구,0.012,0.017,26.0,27.0,0.005
326 | 0.4,200901011100,서대문구,0.011,0.023,38.0,21.0,0.005
327 | 0.4,200901011200,서대문구,0.012,0.019,57.0,31.0,0.005
328 | 0.3,200901011300,서대문구,0.011,0.021,30.0,28.0,0.004
329 | 0.3,200901011400,서대문구,0.013,0.02,28.0,17.0,0.005
330 | 0.3,200901011500,서대문구,0.015,0.02,24.0,18.0,0.005
331 | 0.3,200901011600,서대문구,0.016,0.02,25.0,13.0,0.004
332 | 0.3,200901011700,서대문구,0.019,0.02,23.0,28.0,0.005
333 | 0.3,200901011800,서대문구,0.028,0.011,37.0,32.0,0.004
334 | 0.4,200901011900,서대문구,0.032,0.008,39.0,18.0,0.005
335 | 0.6,200901012000,서대문구,0.046,0.002,56.0,10.0,0.005
336 | 0.7,200901012100,서대문구,0.0,0.001,59.0,9.0,0.005
337 | 0.6,200901012200,서대문구,0.0,0.001,48.0,27.0,0.005
338 | 0.6,200901012300,서대문구,0.0,0.001,61.0,29.0,0.005
339 | 0.5,200901010000,서초구,0.025,0.013,46.0,18.0,0.006
340 | 0.4,200901010100,서초구,0.022,0.014,44.0,23.0,0.005
341 | 0.4,200901010200,서초구,0.017,0.018,41.0,16.0,0.004
342 | 0.4,200901010300,서초구,0.017,0.017,40.0,20.0,0.005
343 | 0.4,200901010400,서초구,0.016,0.017,42.0,22.0,0.005
344 | 0.4,200901010500,서초구,0.016,0.016,44.0,20.0,0.005
345 | 0.4,200901010600,서초구,0.018,0.014,38.0,19.0,0.005
346 | 0.4,200901010700,서초구,0.02,0.013,46.0,23.0,0.005
347 | 0.5,200901010800,서초구,0.025,0.009,47.0,21.0,0.005
348 | 0.5,200901010900,서초구,0.022,0.011,40.0,23.0,0.005
349 | 0.5,200901011000,서초구,0.019,0.014,47.0,24.0,0.006
350 | 0.5,200901011100,서초구,0.015,0.018,36.0,16.0,0.006
351 | 0.5,200901011200,서초구,0.017,0.018,41.0,24.0,0.006
352 | 0.6,200901011300,서초구,0.017,0.019,48.0,26.0,0.007
353 | 0.5,200901011400,서초구,0.014,0.022,48.0,27.0,0.006
354 | 0.5,200901011500,서초구,0.017,0.022,48.0,27.0,0.005
355 | 0.5,200901011600,서초구,0.02,0.02,48.0,26.0,0.005
356 | 0.5,200901011700,서초구,0.022,0.019,53.0,32.0,0.005
357 | 0.5,200901011800,서초구,0.025,0.017,47.0,26.0,0.004
358 | 0.5,200901011900,서초구,0.031,0.011,39.0,16.0,0.004
359 | 0.5,200901012000,서초구,0.035,0.008,46.0,19.0,0.005
360 | 0.5,200901012100,서초구,0.035,0.006,40.0,29.0,0.004
361 | 0.5,200901012200,서초구,0.034,0.006,45.0,26.0,0.004
362 | 0.5,200901012300,서초구,0.037,0.004,48.0,23.0,0.004
363 | 0.3,200901010000,성동구,0.017,0.021,40.0,17.0,0.005
364 | 0.3,200901010100,성동구,0.015,0.021,48.0,22.0,0.005
365 | 0.3,200901010200,성동구,0.015,0.02,44.0,20.0,0.005
366 | 0.3,200901010300,성동구,0.009,0.023,38.0,17.0,0.004
367 | 0.3,200901010400,성동구,0.013,0.021,44.0,22.0,0.004
368 | 0.3,200901010500,성동구,0.012,0.02,41.0,18.0,0.004
369 | 0.3,200901010600,성동구,0.014,0.019,41.0,21.0,0.004
370 | 0.3,200901010700,성동구,0.023,0.009,41.0,16.0,0.004
371 | 0.3,200901010800,성동구,0.022,0.012,39.0,18.0,0.004
372 | 0.4,200901010900,성동구,0.023,0.011,42.0,22.0,0.004
373 | 0.4,200901011000,성동구,0.018,0.017,36.0,15.0,0.006
374 | 0.4,200901011100,성동구,0.013,0.022,37.0,18.0,0.006
375 | 0.4,200901011200,성동구,0.015,0.022,39.0,23.0,0.006
376 | 0.4,200901011300,성동구,0.018,0.021,42.0,20.0,0.005
377 | 0.4,200901011400,성동구,0.015,0.025,46.0,23.0,0.005
378 | 0.3,200901011500,성동구,0.013,0.027,43.0,23.0,0.004
379 | 0.3,200901011600,성동구,0.015,0.025,44.0,22.0,0.004
380 | 0.3,200901011700,성동구,0.02,0.021,48.0,29.0,0.004
381 | 0.4,200901011800,성동구,0.024,0.018,43.0,23.0,0.004
382 | 0.3,200901011900,성동구,0.024,0.017,48.0,28.0,0.004
383 | 0.3,200901012000,성동구,0.022,0.018,50.0,20.0,0.004
384 | 0.4,200901012100,성동구,0.034,0.007,46.0,30.0,0.004
385 | 0.4,200901012200,성동구,0.032,0.008,50.0,25.0,0.003
386 | 0.4,200901012300,성동구,0.034,0.005,41.0,24.0,0.003
387 | 0.5,200901010000,성북구,0.021,0.019,39.0,19.0,0.005
388 | 0.5,200901010100,성북구,0.018,0.019,42.0,16.0,0.005
389 | 0.5,200901010200,성북구,0.016,0.021,39.0,22.0,0.004
390 | 0.5,200901010300,성북구,0.014,0.021,36.0,18.0,0.004
391 | 0.5,200901010400,성북구,0.015,0.02,39.0,17.0,0.004
392 | 0.5,200901010500,성북구,0.017,0.018,39.0,18.0,0.004
393 | 0.5,200901010600,성북구,0.018,0.017,37.0,19.0,0.004
394 | 0.6,200901010700,성북구,0.026,0.014,38.0,16.0,0.004
395 | 0.6,200901010800,성북구,0.029,0.01,34.0,18.0,0.005
396 | 0.6,200901010900,성북구,0.027,0.01,36.0,19.0,0.005
397 | 0.6,200901011000,성북구,0.023,0.013,37.0,17.0,0.006
398 | 0.6,200901011100,성북구,0.019,0.016,37.0,19.0,0.006
399 | 0.6,200901011200,성북구,0.017,0.019,41.0,18.0,0.006
400 | 0.6,200901011300,성북구,0.02,0.02,44.0,24.0,0.006
401 | 0.6,200901011400,성북구,0.02,0.02,43.0,24.0,0.006
402 | 0.6,200901011500,성북구,0.017,0.022,43.0,23.0,0.006
403 | 0.5,200901011600,성북구,0.015,0.025,40.0,20.0,0.005
404 | 0.6,200901011700,성북구,0.021,0.022,41.0,24.0,0.005
405 | 0.6,200901011800,성북구,0.026,0.018,40.0,26.0,0.005
406 | 0.6,200901011900,성북구,0.028,0.017,35.0,17.0,0.004
407 | 0.9,200901012000,성북구,0.045,0.005,36.0,21.0,0.005
408 | 0.8,200901012100,성북구,0.045,0.003,45.0,29.0,0.006
409 | 0.7,200901012200,성북구,0.044,0.004,41.0,24.0,0.006
410 | 0.7,200901012300,성북구,0.045,0.003,45.0,29.0,0.006
411 | 0.4,200901010000,송파구,0.022,0.018,43.0,15.0,0.005
412 | 0.3,200901010100,송파구,0.015,0.024,40.0,19.0,0.005
413 | 0.3,200901010200,송파구,0.013,0.025,37.0,18.0,0.005
414 | 0.3,200901010300,송파구,0.013,0.024,41.0,15.0,0.005
415 | 0.3,200901010400,송파구,0.014,0.022,36.0,20.0,0.005
416 | 0.3,200901010500,송파구,0.014,0.022,40.0,15.0,0.004
417 | 0.3,200901010600,송파구,0.016,0.019,41.0,15.0,0.004
418 | 0.4,200901010700,송파구,0.023,0.014,40.0,20.0,0.004
419 | 0.6,200901010800,송파구,0.028,0.008,39.0,20.0,0.005
420 | 0.4,200901010900,송파구,0.021,0.014,43.0,19.0,0.005
421 | 0.4,200901011000,송파구,0.014,0.022,41.0,18.0,0.007
422 | 0.4,200901011100,송파구,0.013,0.024,44.0,27.0,0.007
423 | 0.3,200901011200,송파구,0.012,0.026,45.0,23.0,0.007
424 | 0.3,200901011300,송파구,0.016,0.026,42.0,22.0,0.007
425 | 0.4,200901011400,송파구,0.021,0.022,48.0,24.0,0.008
426 | 0.3,200901011500,송파구,0.018,0.025,43.0,26.0,0.007
427 | 0.3,200901011600,송파구,0.021,0.023,40.0,30.0,0.006
428 | 0.3,200901011700,송파구,0.025,0.02,43.0,19.0,0.006
429 | 0.3,200901011800,송파구,0.025,0.02,38.0,19.0,0.005
430 | 0.4,200901011900,송파구,0.027,0.017,38.0,18.0,0.005
431 | 0.4,200901012000,송파구,0.03,0.013,32.0,20.0,0.004
432 | 0.4,200901012100,송파구,0.033,0.011,38.0,30.0,0.004
433 | 0.4,200901012200,송파구,0.036,0.006,45.0,29.0,0.004
434 | 0.5,200901012300,송파구,0.04,0.003,44.0,21.0,0.004
435 | 0.4,200901010000,송파구2,0.025,0.016,0.0,0.0,0.008
436 | 0.4,200901010100,송파구2,0.025,0.016,48.0,0.0,0.007
437 | 0.4,200901010200,송파구2,0.026,0.015,44.0,0.0,0.009
438 | 0.3,200901010300,송파구2,0.023,0.018,39.0,0.0,0.008
439 | 0.3,200901010400,송파구2,0.025,0.017,44.0,0.0,0.007
440 | 0.3,200901010500,송파구2,0.025,0.017,42.0,0.0,0.008
441 | 0.4,200901010600,송파구2,0.029,0.015,41.0,0.0,0.004
442 | 0.3,200901010700,송파구2,0.032,0.013,31.0,0.0,0.009
443 | 0.4,200901010800,송파구2,0.036,0.008,29.0,0.0,0.007
444 | 0.4,200901010900,송파구2,0.035,0.01,23.0,0.0,0.004
445 | 0.4,200901011000,송파구2,0.031,0.014,26.0,0.0,0.007
446 | 0.4,200901011100,송파구2,0.026,0.019,35.0,0.0,0.006
447 | 0.4,200901011200,송파구2,0.026,0.02,31.0,0.0,0.006
448 | 0.5,200901011300,송파구2,0.029,0.019,34.0,0.0,0.006
449 | 0.4,200901011400,송파구2,0.028,0.02,46.0,0.0,0.005
450 | 0.4,200901011500,송파구2,0.027,0.022,38.0,0.0,0.004
451 | 0.4,200901011600,송파구2,0.03,0.019,37.0,0.0,0.01
452 | 0.4,200901011700,송파구2,0.025,0.017,45.0,0.0,0.005
453 | 0.4,200901011800,송파구2,0.032,0.012,57.0,0.0,0.008
454 | 0.4,200901011900,송파구2,0.031,0.011,40.0,0.0,0.005
455 | 0.4,200901012000,송파구2,0.027,0.016,30.0,0.0,0.009
456 | 0.4,200901012100,송파구2,0.036,0.006,46.0,0.0,0.008
457 | 0.4,200901012200,송파구2,0.037,0.005,40.0,0.0,0.005
458 | 0.5,200901012300,송파구2,0.038,0.004,32.0,0.0,0.007
459 | 0.5,200901010000,양천구,0.031,0.012,24.0,9.0,0.005
460 | 0.4,200901010100,양천구,0.025,0.015,45.0,17.0,0.005
461 | 0.4,200901010200,양천구,0.024,0.015,31.0,16.0,0.004
462 | 0.4,200901010300,양천구,0.023,0.015,35.0,19.0,0.004
463 | 0.4,200901010400,양천구,0.017,0.019,24.0,11.0,0.004
464 | 0.4,200901010500,양천구,0.017,0.018,40.0,9.0,0.004
465 | 0.4,200901010600,양천구,0.018,0.017,35.0,22.0,0.004
466 | 0.4,200901010700,양천구,0.019,0.015,41.0,11.0,0.005
467 | 0.5,200901010800,양천구,0.025,0.01,38.0,21.0,0.005
468 | 0.5,200901010900,양천구,0.025,0.01,56.0,25.0,0.005
469 | 0.5,200901011000,양천구,0.02,0.015,51.0,22.0,0.007
470 | 0.4,200901011100,양천구,0.014,0.021,32.0,28.0,0.006
471 | 0.5,200901011200,양천구,0.015,0.023,49.0,33.0,0.006
472 | 0.5,200901011300,양천구,0.014,0.024,40.0,26.0,0.005
473 | 0.5,200901011400,양천구,0.016,0.026,0.0,24.0,0.006
474 | 0.4,200901011500,양천구,0.014,0.027,47.0,22.0,0.005
475 | 0.4,200901011600,양천구,0.015,0.028,36.0,21.0,0.005
476 | 0.5,200901011700,양천구,0.022,0.022,36.0,16.0,0.005
477 | 0.4,200901011800,양천구,0.026,0.019,31.0,15.0,0.004
478 | 0.7,200901011900,양천구,0.043,0.006,32.0,13.0,0.005
479 | 0.7,200901012000,양천구,0.043,0.005,47.0,13.0,0.005
480 | 0.6,200901012100,양천구,0.043,0.004,24.0,18.0,0.005
481 | 0.6,200901012200,양천구,0.042,0.005,31.0,15.0,0.004
482 | 0.7,200901012300,양천구,0.041,0.005,38.0,20.0,0.004
483 | 0.7,200901010000,영등포구,0.024,0.009,24.0,18.0,0.004
484 | 0.6,200901010100,영등포구,0.02,0.014,22.0,9.0,0.007
485 | 0.6,200901010200,영등포구,0.019,0.014,53.0,11.0,0.005
486 | 0.6,200901010300,영등포구,0.016,0.015,43.0,15.0,0.004
487 | 0.6,200901010400,영등포구,0.016,0.013,32.0,14.0,0.005
488 | 0.5,200901010500,영등포구,0.015,0.014,20.0,15.0,0.005
489 | 0.6,200901010600,영등포구,0.019,0.009,30.0,13.0,0.002
490 | 0.6,200901010700,영등포구,0.021,0.007,36.0,27.0,0.004
491 | 0.6,200901010800,영등포구,0.021,0.007,50.0,19.0,0.003
492 | 0.7,200901010900,영등포구,0.023,0.007,37.0,33.0,0.003
493 | 0.7,200901011000,영등포구,0.021,0.011,24.0,14.0,0.004
494 | 0.6,200901011100,영등포구,0.017,0.018,23.0,13.0,0.007
495 | 0.6,200901011200,영등포구,0.017,0.018,26.0,11.0,0.009
496 | 0.6,200901011300,영등포구,0.017,0.019,37.0,18.0,0.006
497 | 0.6,200901011400,영등포구,0.018,0.02,27.0,17.0,0.009
498 | 0.6,200901011500,영등포구,0.021,0.018,31.0,16.0,0.004
499 | 0.6,200901011600,영등포구,0.019,0.022,19.0,26.0,0.002
500 | 0.6,200901011700,영등포구,0.016,0.022,39.0,15.0,0.006
501 | 0.6,200901011800,영등포구,0.021,0.016,26.0,12.0,0.006
502 | 0.6,200901011900,영등포구,0.026,0.009,24.0,20.0,0.003
503 | 0.7,200901012000,영등포구,0.031,0.006,45.0,23.0,0.002
504 | 0.8,200901012100,영등포구,0.031,0.003,39.0,13.0,0.004
505 | 0.8,200901012200,영등포구,0.029,0.004,27.0,13.0,0.003
506 | 0.9,200901012300,영등포구,0.032,0.001,35.0,25.0,0.004
507 | 0.6,200901010000,용산구,0.035,0.008,49.0,15.0,0.006
508 | 0.5,200901010100,용산구,0.024,0.011,43.0,20.0,0.004
509 | 0.5,200901010200,용산구,0.02,0.011,46.0,19.0,0.004
510 | 0.5,200901010300,용산구,0.018,0.012,38.0,17.0,0.004
511 | 0.5,200901010400,용산구,0.018,0.012,47.0,16.0,0.005
512 | 0.5,200901010500,용산구,0.018,0.012,41.0,19.0,0.004
513 | 0.5,200901010600,용산구,0.023,0.009,41.0,17.0,0.004
514 | 0.6,200901010700,용산구,0.031,0.006,48.0,17.0,0.005
515 | 0.6,200901010800,용산구,0.033,0.006,42.0,17.0,0.004
516 | 0.6,200901010900,용산구,0.036,0.006,47.0,21.0,0.005
517 | 0.6,200901011000,용산구,0.024,0.01,34.0,11.0,0.006
518 | 0.6,200901011100,용산구,0.023,0.011,42.0,23.0,0.006
519 | 0.6,200901011200,용산구,0.023,0.012,46.0,27.0,0.006
520 | 0.6,200901011300,용산구,0.02,0.013,51.0,25.0,0.007
521 | 0.6,200901011400,용산구,0.02,0.014,51.0,27.0,0.005
522 | 0.6,200901011500,용산구,0.019,0.015,47.0,26.0,0.006
523 | 0.6,200901011600,용산구,0.024,0.014,52.0,28.0,0.005
524 | 0.5,200901011700,용산구,0.029,0.013,49.0,30.0,0.005
525 | 0.5,200901011800,용산구,0.029,0.012,44.0,23.0,0.005
526 | 0.6,200901011900,용산구,0.037,0.008,46.0,20.0,0.004
527 | 0.6,200901012000,용산구,0.042,0.006,46.0,19.0,0.005
528 | 0.6,200901012100,용산구,0.04,0.006,53.0,22.0,0.004
529 | 0.7,200901012200,용산구,0.047,0.004,49.0,24.0,0.004
530 | 0.7,200901012300,용산구,0.046,0.003,47.0,20.0,0.005
531 | 0.2,200901010000,은평구,0.017,0.017,47.0,19.0,0.008
532 | 0.3,200901010100,은평구,0.018,0.017,43.0,16.0,0.007
533 | 0.3,200901010200,은평구,0.02,0.016,48.0,20.0,0.008
534 | 0.2,200901010300,은평구,0.015,0.013,45.0,14.0,0.007
535 | 0.2,200901010400,은평구,0.012,0.016,45.0,18.0,0.006
536 | 0.2,200901010500,은평구,0.011,0.019,41.0,12.0,0.005
537 | 0.2,200901010600,은평구,0.013,0.018,39.0,18.0,0.006
538 | 0.2,200901010700,은평구,0.016,0.016,41.0,16.0,0.006
539 | 0.3,200901010800,은평구,0.021,0.011,43.0,17.0,0.006
540 | 0.3,200901010900,은평구,0.018,0.008,41.0,16.0,0.006
541 | 0.3,200901011000,은평구,0.018,0.013,40.0,19.0,0.006
542 | 0.3,200901011100,은평구,0.012,0.017,37.0,21.0,0.007
543 | 0.2,200901011200,은평구,0.011,0.023,32.0,18.0,0.007
544 | 0.2,200901011300,은평구,0.011,0.024,38.0,24.0,0.007
545 | 0.2,200901011400,은평구,0.01,0.025,44.0,27.0,0.006
546 | 0.2,200901011500,은평구,0.011,0.028,44.0,26.0,0.005
547 | 0.2,200901011600,은평구,0.013,0.028,45.0,24.0,0.005
548 | 0.2,200901011700,은평구,0.015,0.026,0.0,33.0,0.005
549 | 0.2,200901011800,은평구,0.017,0.025,36.0,23.0,0.005
550 | 0.4,200901011900,은평구,0.026,0.021,53.0,22.0,0.006
551 | 0.4,200901012000,은평구,0.032,0.011,49.0,20.0,0.006
552 | 0.4,200901012100,은평구,0.031,0.006,58.0,28.0,0.006
553 | 0.4,200901012200,은평구,0.029,0.003,51.0,23.0,0.006
554 | 0.4,200901012300,은평구,0.029,0.004,49.0,26.0,0.005
555 | 0.4,200901010000,종로구,0.015,0.024,33.0,10.0,0.007
556 | 0.5,200901010100,종로구,0.016,0.022,38.0,8.0,0.006
557 | 0.5,200901010200,종로구,0.015,0.021,44.0,10.0,0.005
558 | 0.4,200901010300,종로구,0.013,0.023,29.0,24.0,0.005
559 | 0.4,200901010400,종로구,0.012,0.023,31.0,17.0,0.005
560 | 0.4,200901010500,종로구,0.018,0.015,34.0,15.0,0.005
561 | 0.4,200901010600,종로구,0.02,0.015,38.0,8.0,0.005
562 | 0.4,200901010700,종로구,0.019,0.016,33.0,26.0,0.005
563 | 0.5,200901010800,종로구,0.024,0.012,42.0,18.0,0.005
564 | 0.5,200901010900,종로구,0.026,0.01,48.0,17.0,0.006
565 | 0.6,200901011000,종로구,0.021,0.016,32.0,13.0,0.006
566 | 0.5,200901011100,종로구,0.018,0.02,36.0,16.0,0.007
567 | 0.6,200901011200,종로구,0.021,0.019,35.0,19.0,0.007
568 | 0.6,200901011300,종로구,0.02,0.021,41.0,11.0,0.007
569 | 0.5,200901011400,종로구,0.011,0.029,38.0,18.0,0.007
570 | 0.4,200901011500,종로구,0.011,0.03,31.0,19.0,0.006
571 | 0.4,200901011600,종로구,0.016,0.027,46.0,22.0,0.006
572 | 0.4,200901011700,종로구,0.018,0.025,48.0,21.0,0.006
573 | 0.5,200901011800,종로구,0.024,0.021,32.0,19.0,0.006
574 | 0.5,200901011900,종로구,0.022,0.022,44.0,16.0,0.006
575 | 0.5,200901012000,종로구,0.031,0.013,40.0,14.0,0.006
576 | 0.7,200901012100,종로구,0.037,0.008,44.0,23.0,0.006
577 | 0.5,200901012200,종로구,0.029,0.013,38.0,17.0,0.006
578 | 0.5,200901012300,종로구,0.025,0.015,31.0,13.0,0.005
579 | 1.1,200901010000,중구,0.025,0.015,38.0,16.0,0.005
580 | 1.1,200901010100,중구,0.021,0.015,47.0,19.0,0.005
581 | 1.1,200901010200,중구,0.016,0.019,36.0,16.0,0.004
582 | 1.0,200901010300,중구,0.007,0.025,42.0,17.0,0.004
583 | 1.0,200901010400,중구,0.005,0.026,34.0,17.0,0.004
584 | 1.1,200901010500,중구,0.01,0.021,41.0,15.0,0.004
585 | 1.1,200901010600,중구,0.009,0.022,38.0,20.0,0.004
586 | 1.1,200901010700,중구,0.017,0.015,42.0,20.0,0.004
587 | 1.1,200901010800,중구,0.019,0.015,41.0,16.0,0.004
588 | 1.2,200901010900,중구,0.021,0.013,37.0,17.0,0.004
589 | 1.2,200901011000,중구,0.017,0.018,36.0,18.0,0.005
590 | 1.2,200901011100,중구,0.011,0.022,38.0,17.0,0.005
591 | 1.2,200901011200,중구,0.014,0.022,36.0,24.0,0.005
592 | 1.2,200901011300,중구,0.009,0.026,36.0,16.0,0.005
593 | 1.1,200901011400,중구,0.005,0.031,45.0,26.0,0.004
594 | 1.1,200901011500,중구,0.007,0.031,48.0,26.0,0.004
595 | 1.1,200901011600,중구,0.008,0.03,38.0,23.0,0.004
596 | 1.1,200901011700,중구,0.011,0.029,40.0,18.0,0.004
597 | 1.1,200901011800,중구,0.019,0.022,44.0,28.0,0.004
598 | 1.1,200901011900,중구,0.024,0.017,42.0,18.0,0.004
599 | 1.1,200901012000,중구,0.031,0.011,38.0,26.0,0.004
600 | 1.1,200901012100,중구,0.031,0.01,42.0,24.0,0.004
601 | 1.2,200901012200,중구,0.041,0.004,47.0,29.0,0.004
602 | 1.2,200901012300,중구,0.039,0.004,44.0,22.0,0.004
603 | 0.7,200901010000,중랑구,0.03,0.014,38.0,13.0,0.014
604 | 0.6,200901010100,중랑구,0.02,0.02,25.0,8.0,0.014
605 | 0.6,200901010200,중랑구,0.024,0.017,36.0,26.0,0.008
606 | 0.5,200901010300,중랑구,0.02,0.019,30.0,11.0,0.006
607 | 0.5,200901010400,중랑구,0.021,0.018,39.0,15.0,0.012
608 | 0.5,200901010500,중랑구,0.018,0.02,34.0,19.0,0.011
609 | 0.6,200901010600,중랑구,0.022,0.016,35.0,17.0,0.012
610 | 0.6,200901010700,중랑구,0.028,0.01,45.0,22.0,0.007
611 | 0.7,200901010800,중랑구,0.028,0.01,31.0,23.0,0.008
612 | 0.6,200901010900,중랑구,0.025,0.013,43.0,14.0,0.007
613 | 0.6,200901011000,중랑구,0.021,0.017,33.0,23.0,0.006
614 | 0.6,200901011100,중랑구,0.016,0.022,24.0,10.0,0.009
615 | 0.6,200901011200,중랑구,0.014,0.023,27.0,20.0,0.006
616 | 0.7,200901011300,중랑구,0.021,0.022,42.0,23.0,0.014
617 | 0.8,200901011400,중랑구,0.028,0.018,40.0,15.0,0.01
618 | 1.0,200901011500,중랑구,0.027,0.019,37.0,23.0,0.008
619 | 0.8,200901011600,중랑구,0.028,0.019,44.0,21.0,0.008
620 | 0.8,200901011700,중랑구,0.033,0.014,51.0,18.0,0.012
621 | 1.0,200901011800,중랑구,0.045,0.01,56.0,19.0,0.012
622 | 0.7,200901011900,중랑구,0.035,0.012,36.0,19.0,0.011
623 | 0.9,200901012000,중랑구,0.041,0.008,40.0,10.0,0.008
624 | 1.2,200901012100,중랑구,0.054,0.003,39.0,20.0,0.01
625 | 1.6,200901012200,중랑구,0.053,0.002,39.0,14.0,0.014
626 | 1.8,200901012300,중랑구,0.051,0.002,45.0,21.0,0.008
627 |
--------------------------------------------------------------------------------
/dust_weather/raw_data/weather/weather_20090101.csv:
--------------------------------------------------------------------------------
1 | DC10_LMCS_CA,DC10_TCA,GND_STT_CD,HM,ICSR,M0_05_TE,M0_1_TE,M0_2_TE,M0_3_TE,PA,PS,PV,RNUM,SS,STN_ID,STN_NM,TA,TD,TM,TS,VS,WD,WS
2 | 0.0,0.0,,55,,,,,,1013.7,1024.9,1.9,1,,108,서울,-7.6,-15.0,2009-01-01 00:00,-10.1,1600.0,290,2.1
3 | ,,,54,,,,,,1013.7,1024.9,1.8,2,,108,서울,-7.8,-15.5,2009-01-01 01:00,-10.3,,320,2.3
4 | ,,,52,,,,,,1013.8,1025.1,1.7,3,,108,서울,-8.1,-16.2,2009-01-01 02:00,-10.8,,340,2.0
5 | 0.0,0.0,4.0,52,,-4.0,-2.3,0.0,1.2,1014.2,1025.5,1.7,4,,108,서울,-8.5,-16.6,2009-01-01 03:00,-10.7,1500.0,290,2.4
6 | 0.0,0.0,,53,,,,,,1014.3,1025.6,1.7,5,,108,서울,-8.8,-16.6,2009-01-01 04:00,-11.0,1500.0,320,2.1
7 | 0.0,0.0,,55,,,,,,1014.6,1025.9,1.7,6,,108,서울,-9.0,-16.4,2009-01-01 05:00,-11.2,1500.0,340,1.7
8 | 0.0,0.0,,54,,,,,,1014.7,1026.0,1.7,7,,108,서울,-9.0,-16.6,2009-01-01 06:00,-10.9,1500.0,290,3.0
9 | 0.0,0.0,,54,,,,,,1015.3,1026.6,1.6,8,,108,서울,-9.4,-17.0,2009-01-01 07:00,-11.7,1500.0,360,0.7
10 | 0.0,0.0,,53,0.0,,,,,1015.7,1027.0,1.6,9,0.0,108,서울,-9.3,-17.1,2009-01-01 08:00,-11.1,1600.0,290,2.2
11 | 0.0,0.0,4.0,49,0.2,-4.7,-3.1,-0.3,0.9,1016.2,1027.5,1.6,10,0.8,108,서울,-8.6,-17.4,2009-01-01 09:00,-8.9,1600.0,320,3.0
12 | 0.0,0.0,,45,0.75,,,,,1016.8,1028.1,1.5,11,1.0,108,서울,-7.8,-17.6,2009-01-01 10:00,-1.0,1200.0,290,3.7
13 | 0.0,0.0,,42,1.29,,,,,1016.6,1027.8,1.6,12,1.0,108,서울,-6.7,-17.5,2009-01-01 11:00,3.8,1100.0,290,2.7
14 | 0.0,0.0,,39,1.6,,,,,1015.9,1027.1,1.6,13,1.0,108,서울,-5.3,-17.1,2009-01-01 12:00,-2.4,1100.0,270,2.6
15 | 0.0,0.0,,38,1.72,,,,,1015.6,1026.7,1.7,14,1.0,108,서울,-4.6,-16.7,2009-01-01 13:00,-1.2,1100.0,320,2.8
16 | 0.0,0.0,,36,1.63,,,,,1015.2,1026.3,1.7,15,1.0,108,서울,-3.8,-16.7,2009-01-01 14:00,6.3,1100.0,290,3.6
17 | 0.0,0.0,1.0,35,1.31,-0.5,-1.3,-0.3,0.9,1015.2,1026.3,1.7,16,1.0,108,서울,-3.1,-16.4,2009-01-01 15:00,2.9,1200.0,290,3.9
18 | 0.0,0.0,,34,0.76,,,,,1015.4,1026.5,1.7,17,1.0,108,서울,-2.9,-16.6,2009-01-01 16:00,-1.5,1200.0,290,2.9
19 | 0.0,0.0,,36,0.23,,,,,1015.7,1026.8,1.8,18,1.0,108,서울,-3.1,-16.0,2009-01-01 17:00,-3.6,1400.0,290,2.9
20 | 0.0,0.0,,38,0.0,,,,,1016.0,1027.1,1.8,19,0.0,108,서울,-3.4,-15.7,2009-01-01 18:00,-5.8,1300.0,290,2.1
21 | ,,,40,,,,,,1016.5,1027.6,1.8,20,,108,서울,-3.9,-15.5,2009-01-01 19:00,-7.7,,340,0.8
22 | ,,,40,,,,,,1016.9,1028.0,1.8,21,,108,서울,-3.8,-15.4,2009-01-01 20:00,-7.9,,290,1.6
23 | 0.0,0.0,4.0,45,,-2.5,-1.3,-0.1,0.9,1017.0,1028.1,2.0,22,,108,서울,-4.0,-14.2,2009-01-01 21:00,-8.3,1300.0,270,1.4
24 | ,,,49,,,,,,1017.1,1028.2,2.2,23,,108,서울,-4.3,-13.4,2009-01-01 22:00,-8.8,,270,1.6
25 | ,,,52,,,,,,1017.4,1028.5,2.3,24,,108,서울,-4.4,-12.7,2009-01-01 23:00,-9.0,,290,1.0
26 |
--------------------------------------------------------------------------------
/dust_weather/raw_data/weather_column.txt:
--------------------------------------------------------------------------------
1 | tm 시간
2 | stn_id 지점 번호
3 | rnum 목록 순서
4 | m0_3_te 30cm 지중온도
5 | m0_2_te 20cm 지중온도
6 | m0_1_te 10cm 지중온도
7 | m0_05_te 5cm 지중온도
8 | ts 지면온도
9 | dmst_mtph_no 현상번호
10 | gnd_stt_cd 지면상태
11 | vs 시정
12 | lcs_ch 최저운고
13 | clfm_abbr_cd 운형
14 | dc10_lmcs_ca 중하층운량
15 | dc10_tca 전운량
16 | hr3_fhsc 3시간신적설
17 | dsnw 적설
18 | icsr 일사
19 | ss 일조
20 | ps 해면기압
21 | pa 현지기압
22 | td 이슬점온도
23 | pv 증기압
24 | hm 습도
25 | wd 풍향
26 | ws 풍속
27 | rn 강수량
28 | ta 기온
29 |
--------------------------------------------------------------------------------
/kaggle/Tabular Playground Series - Jan 2021/xgboost-lgbm-optuna.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
9 | "execution": {
10 | "iopub.execute_input": "2021-01-31T11:38:39.720911Z",
11 | "iopub.status.busy": "2021-01-31T11:38:39.717832Z",
12 | "iopub.status.idle": "2021-01-31T11:38:39.815654Z",
13 | "shell.execute_reply": "2021-01-31T11:38:39.815002Z"
14 | },
15 | "papermill": {
16 | "duration": 0.117137,
17 | "end_time": "2021-01-31T11:38:39.815770",
18 | "exception": false,
19 | "start_time": "2021-01-31T11:38:39.698633",
20 | "status": "completed"
21 | },
22 | "tags": []
23 | },
24 | "outputs": [
25 | {
26 | "name": "stdout",
27 | "output_type": "stream",
28 | "text": [
29 | "/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv\n",
30 | "/kaggle/input/tabular-playground-series-jan-2021/train.csv\n",
31 | "/kaggle/input/tabular-playground-series-jan-2021/test.csv\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "# This Python 3 environment comes with many helpful analytics libraries installed\n",
37 | "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
38 | "# For example, here's several helpful packages to load\n",
39 | "\n",
40 | "import numpy as np # linear algebra\n",
41 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
42 | "import matplotlib.pyplot as plt \n",
43 | "import joblib\n",
44 | "\n",
45 | "%matplotlib inline\n",
46 | "# Input data files are available in the read-only \"../input/\" directory\n",
47 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
48 | "\n",
49 | "import os\n",
50 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
51 | " for filename in filenames:\n",
52 | " print(os.path.join(dirname, filename))\n",
53 | "\n",
54 | "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
55 | "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 2,
61 | "metadata": {
62 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
63 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
64 | "execution": {
65 | "iopub.execute_input": "2021-01-31T11:38:39.845836Z",
66 | "iopub.status.busy": "2021-01-31T11:38:39.845182Z",
67 | "iopub.status.idle": "2021-01-31T11:38:43.226278Z",
68 | "shell.execute_reply": "2021-01-31T11:38:43.224950Z"
69 | },
70 | "papermill": {
71 | "duration": 3.399747,
72 | "end_time": "2021-01-31T11:38:43.226402",
73 | "exception": false,
74 | "start_time": "2021-01-31T11:38:39.826655",
75 | "status": "completed"
76 | },
77 | "tags": []
78 | },
79 | "outputs": [],
80 | "source": [
81 | "train_df = pd.read_csv(\"/kaggle/input/tabular-playground-series-jan-2021/train.csv\", index_col=[\"id\"])\n",
82 | "test_df = pd.read_csv(\"/kaggle/input/tabular-playground-series-jan-2021/test.csv\", index_col=[\"id\"])\n",
83 | "\n",
84 | "X = train_df.iloc[:, :-1].to_numpy()\n",
85 | "y = train_df.iloc[:, -1].to_numpy()\n",
86 | "X_test = test_df.to_numpy()"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 3,
92 | "metadata": {
93 | "execution": {
94 | "iopub.execute_input": "2021-01-31T11:38:43.253010Z",
95 | "iopub.status.busy": "2021-01-31T11:38:43.251767Z",
96 | "iopub.status.idle": "2021-01-31T11:38:43.409830Z",
97 | "shell.execute_reply": "2021-01-31T11:38:43.410306Z"
98 | },
99 | "papermill": {
100 | "duration": 0.173962,
101 | "end_time": "2021-01-31T11:38:43.410454",
102 | "exception": false,
103 | "start_time": "2021-01-31T11:38:43.236492",
104 | "status": "completed"
105 | },
106 | "tags": []
107 | },
108 | "outputs": [
109 | {
110 | "data": {
111 | "text/plain": [
112 | "cont1 -0.032994\n",
113 | "cont10 -0.021143\n",
114 | "cont9 -0.013029\n",
115 | "cont13 -0.006642\n",
116 | "cont14 -0.006609\n",
117 | "cont5 -0.005358\n",
118 | "cont4 0.005522\n",
119 | "cont8 0.014698\n",
120 | "cont6 0.027955\n",
121 | "cont12 0.047809\n",
122 | "cont11 0.050996\n",
123 | "cont3 0.058936\n",
124 | "cont2 0.067102\n",
125 | "cont7 0.067234\n",
126 | "Name: target, dtype: float64"
127 | ]
128 | },
129 | "execution_count": 3,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "corr=train_df.corr()[\"target\"]\n",
136 | "corr[np.argsort(corr, axis=0)[:-1]]"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 4,
142 | "metadata": {
143 | "execution": {
144 | "iopub.execute_input": "2021-01-31T11:38:43.442840Z",
145 | "iopub.status.busy": "2021-01-31T11:38:43.442099Z",
146 | "iopub.status.idle": "2021-01-31T11:38:49.555086Z",
147 | "shell.execute_reply": "2021-01-31T11:38:49.554439Z"
148 | },
149 | "papermill": {
150 | "duration": 6.13411,
151 | "end_time": "2021-01-31T11:38:49.555198",
152 | "exception": false,
153 | "start_time": "2021-01-31T11:38:43.421088",
154 | "status": "completed"
155 | },
156 | "tags": []
157 | },
158 | "outputs": [
159 | {
160 | "data": {
161 | "text/html": [
162 | "\n"
180 | ],
181 | "text/plain": [
182 | ""
183 | ]
184 | },
185 | "metadata": {},
186 | "output_type": "display_data"
187 | }
188 | ],
189 | "source": [
190 | "from sklearn.model_selection import train_test_split\n",
191 | "from sklearn.experimental import enable_hist_gradient_boosting\n",
192 | "from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingRegressor, StackingRegressor, RandomForestRegressor\n",
193 | "from sklearn.metrics import mean_squared_error\n",
194 | "from sklearn.model_selection import KFold\n",
195 | "from sklearn.neural_network import MLPRegressor\n",
196 | "from sklearn.linear_model import SGDRegressor\n",
197 | "\n",
198 | "import xgboost as xgb\n",
199 | "\n",
200 | "from lightgbm import LGBMRegressor\n",
201 | "from xgboost import XGBRegressor\n",
202 | "from catboost import CatBoostRegressor\n",
203 | "\n",
204 | "import optuna \n",
205 | "from optuna import Trial, visualization\n",
206 | "from optuna.samplers import TPESampler\n",
207 | "\n",
208 | "# optuna.logging.set_verbosity(optuna.logging.WARNING)\n",
209 | "\n",
210 | "def train(model):\n",
211 | " X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size=0.1, random_state=156)\n",
212 | " y_train = y_train.reshape(-1, 1)\n",
213 | " y_test = y_test.reshape(-1, 1)\n",
214 | " \n",
215 | " model = model.fit(X_train, y_train, early_stopping_rounds=100, verbose=False, eval_set=[(X_test, y_test)])\n",
216 | " score = mean_squared_error(model.predict(X_train), y_train, squared=False)\n",
217 | " print(score)\n",
218 | " return model"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {
224 | "papermill": {
225 | "duration": 0.010938,
226 | "end_time": "2021-01-31T11:38:49.577831",
227 | "exception": false,
228 | "start_time": "2021-01-31T11:38:49.566893",
229 | "status": "completed"
230 | },
231 | "tags": []
232 | },
233 | "source": [
234 | "Integer parameter : A uniform distribution on integers. \n",
235 | "```n_estimators = trial.suggest_int('n_estimators',100,500)```\n",
236 | "\n",
237 | "Categorical parameter : A categorical distribution. \n",
238 | "```criterion = trial.suggest_categorical('criterion' ,['gini', 'entropy'])```\n",
239 | "\n",
240 | "Uniform parameter : A uniform distribution in linear domain. \n",
241 | "```subsample = trial.suggest_uniform('subsample' ,0.2,0.8)```\n",
242 | "\n",
243 | "Discrete-uniform parameter : A discretized uniform distribution in linear domain. \n",
244 | "```max_features = trial.suggest_discrete_uniform('max_features', 0.05,1,0.05)```\n",
245 | "\n",
246 |     "Loguniform parameter : A uniform distribution in log domain. \n",
247 |     "```learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-3)```"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 5,
253 | "metadata": {
254 | "execution": {
255 | "iopub.execute_input": "2021-01-31T11:38:49.618630Z",
256 | "iopub.status.busy": "2021-01-31T11:38:49.617775Z",
257 | "iopub.status.idle": "2021-01-31T11:38:49.621042Z",
258 | "shell.execute_reply": "2021-01-31T11:38:49.620511Z"
259 | },
260 | "papermill": {
261 | "duration": 0.032071,
262 | "end_time": "2021-01-31T11:38:49.621149",
263 | "exception": false,
264 | "start_time": "2021-01-31T11:38:49.589078",
265 | "status": "completed"
266 | },
267 | "tags": []
268 | },
269 | "outputs": [],
270 | "source": [
271 | "def objectiveXGB(trial: Trial, X, y, test):\n",
272 | " param = {\n",
273 | " \"n_estimators\" : trial.suggest_int('n_estimators', 500, 4000),\n",
274 | " 'max_depth':trial.suggest_int('max_depth', 8, 16),\n",
275 | " 'min_child_weight':trial.suggest_int('min_child_weight', 1, 300),\n",
276 | " 'gamma':trial.suggest_int('gamma', 1, 3),\n",
277 | " 'learning_rate': 0.01,\n",
278 | " 'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5, 1, 0.1),\n",
279 | " 'nthread' : -1,\n",
280 | " 'tree_method': 'gpu_hist',\n",
281 | " 'predictor': 'gpu_predictor',\n",
282 | " 'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),\n",
283 | " 'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),\n",
284 | " 'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0] ),\n",
285 | " 'random_state': 42\n",
286 | " }\n",
287 | " X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size=0.1)\n",
288 | " \n",
289 | " y_train = y_train.reshape(-1, 1)\n",
290 | " y_test = y_test.reshape(-1, 1)\n",
291 | "\n",
292 | " model = xgb.XGBRegressor(**param)\n",
293 | " xgb_model = model.fit(X_train, y_train, verbose=False, eval_set=[(X_test, y_test)])\n",
294 | " score = mean_squared_error(xgb_model.predict(X_test), y_test, squared=False)\n",
295 | "\n",
296 | " return score"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 6,
302 | "metadata": {
303 | "execution": {
304 | "iopub.execute_input": "2021-01-31T11:38:49.655352Z",
305 | "iopub.status.busy": "2021-01-31T11:38:49.654468Z",
306 | "iopub.status.idle": "2021-01-31T12:00:55.536650Z",
307 | "shell.execute_reply": "2021-01-31T12:00:55.535642Z"
308 | },
309 | "papermill": {
310 | "duration": 1325.903429,
311 | "end_time": "2021-01-31T12:00:55.536815",
312 | "exception": false,
313 | "start_time": "2021-01-31T11:38:49.633386",
314 | "status": "completed"
315 | },
316 | "tags": []
317 | },
318 | "outputs": [
319 | {
320 | "name": "stderr",
321 | "output_type": "stream",
322 | "text": [
323 | "\u001b[32m[I 2021-01-31 11:38:49,652]\u001b[0m A new study created in memory with name: no-name-be7f46a1-b6aa-4024-8669-e3fdccbcfcf6\u001b[0m\n",
324 | "\u001b[32m[I 2021-01-31 11:39:14,633]\u001b[0m Trial 0 finished with value: 0.6989588738361633 and parameters: {'n_estimators': 1838, 'max_depth': 14, 'min_child_weight': 215, 'gamma': 2, 'colsample_bytree': 0.9, 'lambda': 4.961641935114362, 'alpha': 6.979446411760841, 'subsample': 0.7}. Best is trial 0 with value: 0.6989588738361633.\u001b[0m\n",
325 | "\u001b[32m[I 2021-01-31 11:39:34,924]\u001b[0m Trial 1 finished with value: 0.6984395858421006 and parameters: {'n_estimators': 2054, 'max_depth': 16, 'min_child_weight': 11, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 0.533191120541982, 'alpha': 0.03322032484290466, 'subsample': 0.7}. Best is trial 1 with value: 0.6984395858421006.\u001b[0m\n",
326 | "\u001b[32m[I 2021-01-31 11:39:40,401]\u001b[0m Trial 2 finished with value: 0.698131899643169 and parameters: {'n_estimators': 770, 'max_depth': 8, 'min_child_weight': 189, 'gamma': 3, 'colsample_bytree': 0.5, 'lambda': 0.11775793473741883, 'alpha': 0.0014192139140771667, 'subsample': 0.8}. Best is trial 2 with value: 0.698131899643169.\u001b[0m\n",
327 | "\u001b[32m[I 2021-01-31 11:40:02,525]\u001b[0m Trial 3 finished with value: 0.6976421069550431 and parameters: {'n_estimators': 1517, 'max_depth': 10, 'min_child_weight': 25, 'gamma': 1, 'colsample_bytree': 0.5, 'lambda': 0.0179140957072102, 'alpha': 0.10088209496147234, 'subsample': 1.0}. Best is trial 3 with value: 0.6976421069550431.\u001b[0m\n",
328 | "\u001b[32m[I 2021-01-31 11:40:32,692]\u001b[0m Trial 4 finished with value: 0.6970622324852312 and parameters: {'n_estimators': 1186, 'max_depth': 13, 'min_child_weight': 37, 'gamma': 2, 'colsample_bytree': 1.0, 'lambda': 0.16439255591530072, 'alpha': 0.002676775566056092, 'subsample': 0.8}. Best is trial 4 with value: 0.6970622324852312.\u001b[0m\n",
329 | "\u001b[32m[I 2021-01-31 11:40:50,809]\u001b[0m Trial 5 finished with value: 0.6959992404366083 and parameters: {'n_estimators': 1173, 'max_depth': 12, 'min_child_weight': 125, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.0056280113754363075, 'alpha': 0.004261306343552877, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
330 | "\u001b[32m[I 2021-01-31 11:41:08,943]\u001b[0m Trial 6 finished with value: 0.7012721367069924 and parameters: {'n_estimators': 844, 'max_depth': 11, 'min_child_weight': 180, 'gamma': 1, 'colsample_bytree': 0.6, 'lambda': 0.01469958925815164, 'alpha': 0.16666820802055135, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
331 | "\u001b[32m[I 2021-01-31 11:41:27,106]\u001b[0m Trial 7 finished with value: 0.7000299232783013 and parameters: {'n_estimators': 2865, 'max_depth': 14, 'min_child_weight': 184, 'gamma': 3, 'colsample_bytree': 0.7, 'lambda': 0.0012689902981628213, 'alpha': 0.33020982219946193, 'subsample': 0.6}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
332 | "\u001b[32m[I 2021-01-31 11:41:40,826]\u001b[0m Trial 8 finished with value: 0.6963422853114274 and parameters: {'n_estimators': 869, 'max_depth': 12, 'min_child_weight': 279, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.0018300357587094495, 'alpha': 0.003920930532885174, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
333 | "\u001b[32m[I 2021-01-31 11:41:51,189]\u001b[0m Trial 9 finished with value: 0.7043952543184514 and parameters: {'n_estimators': 1904, 'max_depth': 9, 'min_child_weight': 10, 'gamma': 3, 'colsample_bytree': 0.9, 'lambda': 0.7606818379656084, 'alpha': 3.2064855207156686, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
334 | "\u001b[32m[I 2021-01-31 11:43:25,577]\u001b[0m Trial 10 finished with value: 0.6994857199111865 and parameters: {'n_estimators': 3345, 'max_depth': 16, 'min_child_weight': 94, 'gamma': 1, 'colsample_bytree': 0.8, 'lambda': 0.007949518781138589, 'alpha': 0.01942796410046955, 'subsample': 0.6}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
335 | "\u001b[32m[I 2021-01-31 11:43:38,832]\u001b[0m Trial 11 finished with value: 0.6996774362642536 and parameters: {'n_estimators': 629, 'max_depth': 12, 'min_child_weight': 296, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.0012065750853555825, 'alpha': 0.006010770112203588, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
336 | "\u001b[32m[I 2021-01-31 11:43:57,211]\u001b[0m Trial 12 finished with value: 0.6998228378192336 and parameters: {'n_estimators': 1253, 'max_depth': 12, 'min_child_weight': 112, 'gamma': 2, 'colsample_bytree': 0.8, 'lambda': 0.003856062285053633, 'alpha': 0.009835430523117073, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
337 | "\u001b[32m[I 2021-01-31 11:44:10,629]\u001b[0m Trial 13 finished with value: 0.6991680666391518 and parameters: {'n_estimators': 2734, 'max_depth': 11, 'min_child_weight': 284, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.0011011796506697198, 'alpha': 0.0012053059451050568, 'subsample': 1.0}. Best is trial 5 with value: 0.6959992404366083.\u001b[0m\n",
338 | "\u001b[32m[I 2021-01-31 11:44:28,119]\u001b[0m Trial 14 finished with value: 0.6944786821568005 and parameters: {'n_estimators': 549, 'max_depth': 14, 'min_child_weight': 248, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.0038593653692490706, 'alpha': 0.004072296246572295, 'subsample': 1.0}. Best is trial 14 with value: 0.6944786821568005.\u001b[0m\n",
339 | "\u001b[32m[I 2021-01-31 11:44:47,746]\u001b[0m Trial 15 finished with value: 0.7011655402642222 and parameters: {'n_estimators': 513, 'max_depth': 14, 'min_child_weight': 248, 'gamma': 1, 'colsample_bytree': 0.8, 'lambda': 0.02967701138824667, 'alpha': 0.034210823090693315, 'subsample': 1.0}. Best is trial 14 with value: 0.6944786821568005.\u001b[0m\n",
340 | "\u001b[32m[I 2021-01-31 11:45:20,751]\u001b[0m Trial 16 finished with value: 0.6926691405717094 and parameters: {'n_estimators': 1415, 'max_depth': 15, 'min_child_weight': 124, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.039217785684337855, 'alpha': 0.7187906148566993, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
341 | "\u001b[32m[I 2021-01-31 11:46:05,917]\u001b[0m Trial 17 finished with value: 0.6987081675834381 and parameters: {'n_estimators': 2456, 'max_depth': 15, 'min_child_weight': 64, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.07679362218099238, 'alpha': 0.9838407835917466, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
342 | "\u001b[32m[I 2021-01-31 11:46:52,913]\u001b[0m Trial 18 finished with value: 0.695832748072481 and parameters: {'n_estimators': 1566, 'max_depth': 15, 'min_child_weight': 152, 'gamma': 1, 'colsample_bytree': 0.5, 'lambda': 0.04785966526907583, 'alpha': 0.6144863474731813, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
343 | "\u001b[32m[I 2021-01-31 11:47:09,267]\u001b[0m Trial 19 finished with value: 0.6954112418276062 and parameters: {'n_estimators': 1570, 'max_depth': 15, 'min_child_weight': 245, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 0.616174248626708, 'alpha': 2.419761514693048, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
344 | "\u001b[32m[I 2021-01-31 11:47:50,399]\u001b[0m Trial 20 finished with value: 0.6967594009165083 and parameters: {'n_estimators': 3768, 'max_depth': 16, 'min_child_weight': 143, 'gamma': 2, 'colsample_bytree': 0.9, 'lambda': 0.003320167603978342, 'alpha': 1.3856466626651178, 'subsample': 0.6}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
345 | "\u001b[32m[I 2021-01-31 11:48:06,643]\u001b[0m Trial 21 finished with value: 0.6967656770849345 and parameters: {'n_estimators': 1582, 'max_depth': 15, 'min_child_weight': 239, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 1.326660816861036, 'alpha': 3.5801936078505197, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
346 | "\u001b[32m[I 2021-01-31 11:48:20,041]\u001b[0m Trial 22 finished with value: 0.6959556160712814 and parameters: {'n_estimators': 2183, 'max_depth': 14, 'min_child_weight': 255, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 5.614984672574503, 'alpha': 2.455311348095379, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
347 | "\u001b[32m[I 2021-01-31 11:48:36,784]\u001b[0m Trial 23 finished with value: 0.6991658072707637 and parameters: {'n_estimators': 1122, 'max_depth': 13, 'min_child_weight': 232, 'gamma': 3, 'colsample_bytree': 0.5, 'lambda': 0.2866170323195122, 'alpha': 9.500883488978364, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
348 | "\u001b[32m[I 2021-01-31 11:49:10,368]\u001b[0m Trial 24 finished with value: 0.6929209881851918 and parameters: {'n_estimators': 1660, 'max_depth': 15, 'min_child_weight': 84, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.23380403303040995, 'alpha': 0.3917914748442161, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
349 | "\u001b[32m[I 2021-01-31 11:49:34,616]\u001b[0m Trial 25 finished with value: 0.6999434622933833 and parameters: {'n_estimators': 1781, 'max_depth': 13, 'min_child_weight': 69, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 2.0437074746652075, 'alpha': 0.3107461036389493, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
350 | "\u001b[32m[I 2021-01-31 11:50:11,994]\u001b[0m Trial 26 finished with value: 0.6961935137887199 and parameters: {'n_estimators': 2390, 'max_depth': 16, 'min_child_weight': 88, 'gamma': 2, 'colsample_bytree': 0.8, 'lambda': 0.214027628463951, 'alpha': 0.2933530584276648, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
351 | "\u001b[32m[I 2021-01-31 11:50:37,034]\u001b[0m Trial 27 finished with value: 0.6960195430644573 and parameters: {'n_estimators': 978, 'max_depth': 15, 'min_child_weight': 154, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.038784512855256914, 'alpha': 0.0743132346670548, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
352 | "\u001b[32m[I 2021-01-31 11:51:18,692]\u001b[0m Trial 28 finished with value: 0.6982975048453451 and parameters: {'n_estimators': 1408, 'max_depth': 14, 'min_child_weight': 53, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.011488898042448679, 'alpha': 0.823661735652166, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
353 | "\u001b[32m[I 2021-01-31 11:51:38,621]\u001b[0m Trial 29 finished with value: 0.6984882426735585 and parameters: {'n_estimators': 1845, 'max_depth': 14, 'min_child_weight': 208, 'gamma': 2, 'colsample_bytree': 0.8, 'lambda': 2.6072511839761576, 'alpha': 0.12746489939239564, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
354 | "\u001b[32m[I 2021-01-31 11:52:26,905]\u001b[0m Trial 30 finished with value: 0.6958668383219296 and parameters: {'n_estimators': 2655, 'max_depth': 13, 'min_child_weight': 113, 'gamma': 1, 'colsample_bytree': 0.7, 'lambda': 0.06352391330215072, 'alpha': 0.47367628390755273, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
355 | "\u001b[32m[I 2021-01-31 11:52:53,240]\u001b[0m Trial 31 finished with value: 0.6997493719550192 and parameters: {'n_estimators': 1727, 'max_depth': 15, 'min_child_weight': 210, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.5285503275209107, 'alpha': 6.143835933994607, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
356 | "\u001b[32m[I 2021-01-31 11:53:17,661]\u001b[0m Trial 32 finished with value: 0.697336754960971 and parameters: {'n_estimators': 2058, 'max_depth': 16, 'min_child_weight': 127, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 0.3679817474859144, 'alpha': 1.5394156182344625, 'subsample': 0.8}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
357 | "\u001b[32m[I 2021-01-31 11:53:31,512]\u001b[0m Trial 33 finished with value: 0.69469161655782 and parameters: {'n_estimators': 521, 'max_depth': 15, 'min_child_weight': 268, 'gamma': 3, 'colsample_bytree': 0.6, 'lambda': 0.1346329951918241, 'alpha': 1.665781838012356, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
358 | "\u001b[32m[I 2021-01-31 11:53:48,783]\u001b[0m Trial 34 finished with value: 0.6972009557953684 and parameters: {'n_estimators': 528, 'max_depth': 16, 'min_child_weight': 268, 'gamma': 2, 'colsample_bytree': 0.5, 'lambda': 0.12608097821961942, 'alpha': 1.4774868791635178, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
359 | "\u001b[32m[I 2021-01-31 11:54:12,627]\u001b[0m Trial 35 finished with value: 0.6954789875504872 and parameters: {'n_estimators': 737, 'max_depth': 15, 'min_child_weight': 96, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.10381263531265492, 'alpha': 0.05789210910050885, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
360 | "\u001b[32m[I 2021-01-31 11:54:24,647]\u001b[0m Trial 36 finished with value: 0.6995115871891587 and parameters: {'n_estimators': 1313, 'max_depth': 14, 'min_child_weight': 299, 'gamma': 3, 'colsample_bytree': 0.5, 'lambda': 0.023770334408292255, 'alpha': 0.15156263305627066, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
361 | "\u001b[32m[I 2021-01-31 11:55:00,130]\u001b[0m Trial 37 finished with value: 0.6988825306851013 and parameters: {'n_estimators': 1007, 'max_depth': 13, 'min_child_weight': 36, 'gamma': 1, 'colsample_bytree': 0.7, 'lambda': 9.913090057708677, 'alpha': 5.88567511773672, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
362 | "\u001b[32m[I 2021-01-31 11:55:20,568]\u001b[0m Trial 38 finished with value: 0.6974217099483263 and parameters: {'n_estimators': 507, 'max_depth': 16, 'min_child_weight': 162, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.18460237451595957, 'alpha': 0.548091059692696, 'subsample': 0.7}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
363 | "\u001b[32m[I 2021-01-31 11:55:36,074]\u001b[0m Trial 39 finished with value: 0.7016544157718939 and parameters: {'n_estimators': 1025, 'max_depth': 14, 'min_child_weight': 76, 'gamma': 3, 'colsample_bytree': 0.7, 'lambda': 1.0605389613945055, 'alpha': 0.9444484996983884, 'subsample': 0.6}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
364 | "\u001b[32m[I 2021-01-31 11:55:57,944]\u001b[0m Trial 40 finished with value: 0.6948280596792296 and parameters: {'n_estimators': 708, 'max_depth': 15, 'min_child_weight': 170, 'gamma': 2, 'colsample_bytree': 0.5, 'lambda': 0.3685535386246737, 'alpha': 0.49341874254207085, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
365 | "\u001b[32m[I 2021-01-31 11:56:16,570]\u001b[0m Trial 41 finished with value: 0.6978372575268349 and parameters: {'n_estimators': 762, 'max_depth': 15, 'min_child_weight': 225, 'gamma': 2, 'colsample_bytree': 0.5, 'lambda': 0.3233169600125592, 'alpha': 0.24830272906741135, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
366 | "\u001b[32m[I 2021-01-31 11:56:40,340]\u001b[0m Trial 42 finished with value: 0.6987604133851342 and parameters: {'n_estimators': 678, 'max_depth': 15, 'min_child_weight': 181, 'gamma': 2, 'colsample_bytree': 0.5, 'lambda': 0.1451683783710794, 'alpha': 0.450406993935908, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
367 | "\u001b[32m[I 2021-01-31 11:57:09,058]\u001b[0m Trial 43 finished with value: 0.6991807486552817 and parameters: {'n_estimators': 882, 'max_depth': 16, 'min_child_weight': 133, 'gamma': 2, 'colsample_bytree': 0.6, 'lambda': 0.08180373929345874, 'alpha': 0.18165603394017166, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
368 | "\u001b[32m[I 2021-01-31 11:57:27,379]\u001b[0m Trial 44 finished with value: 0.6995858107340279 and parameters: {'n_estimators': 1381, 'max_depth': 14, 'min_child_weight': 263, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.403657118742876, 'alpha': 1.935992161460453, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
369 | "\u001b[32m[I 2021-01-31 11:58:04,615]\u001b[0m Trial 45 finished with value: 0.6972660189085221 and parameters: {'n_estimators': 1193, 'max_depth': 15, 'min_child_weight': 108, 'gamma': 2, 'colsample_bytree': 0.5, 'lambda': 0.22843599832641476, 'alpha': 4.056377062345579, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
370 | "\u001b[32m[I 2021-01-31 11:58:37,146]\u001b[0m Trial 46 finished with value: 0.6979395982702321 and parameters: {'n_estimators': 610, 'max_depth': 16, 'min_child_weight': 195, 'gamma': 1, 'colsample_bytree': 0.6, 'lambda': 0.0022876987494719, 'alpha': 0.0019647973585286975, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
371 | "\u001b[32m[I 2021-01-31 11:58:56,035]\u001b[0m Trial 47 finished with value: 0.700737607090333 and parameters: {'n_estimators': 853, 'max_depth': 14, 'min_child_weight': 277, 'gamma': 2, 'colsample_bytree': 0.8, 'lambda': 0.016129260522084573, 'alpha': 0.7114338705788977, 'subsample': 1.0}. Best is trial 16 with value: 0.6926691405717094.\u001b[0m\n",
372 | "\u001b[32m[I 2021-01-31 11:59:24,697]\u001b[0m Trial 48 finished with value: 0.6918131483269861 and parameters: {'n_estimators': 1988, 'max_depth': 15, 'min_child_weight': 171, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.009025835256777458, 'alpha': 1.0684343216052703, 'subsample': 0.6}. Best is trial 48 with value: 0.6918131483269861.\u001b[0m\n",
373 | "\u001b[32m[I 2021-01-31 12:00:34,445]\u001b[0m Trial 49 finished with value: 0.6973523901244232 and parameters: {'n_estimators': 2138, 'max_depth': 13, 'min_child_weight': 54, 'gamma': 1, 'colsample_bytree': 0.7, 'lambda': 0.0041450614876976455, 'alpha': 1.2584887220213783, 'subsample': 0.6}. Best is trial 48 with value: 0.6918131483269861.\u001b[0m\n"
374 | ]
375 | },
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "Best trial: score 0.6918131483269861,\n",
381 | "params {'n_estimators': 1988, 'max_depth': 15, 'min_child_weight': 171, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.009025835256777458, 'alpha': 1.0684343216052703, 'subsample': 0.6}\n",
382 | "0.6635022057160923\n"
383 | ]
384 | }
385 | ],
386 | "source": [
387 | "study = optuna.create_study(direction='minimize',sampler=TPESampler())\n",
388 | "study.optimize(lambda trial : objectiveXGB(trial, X, y, X_test), n_trials=50)\n",
389 | "print('Best trial: score {},\\nparams {}'.format(study.best_trial.value,study.best_trial.params))\n",
390 | "\n",
391 | "#best_param = study.best_trial.params\n",
392 | "#xgbReg = train(xgb.XGBRegressor(**best_param, tree_method='gpu_hist', random_state=42, predictor='gpu_predictor', learning_rate=0.01, nthread=-1))\n",
393 | "\n",
394 | "params = {'n_estimators': 3520, 'max_depth': 11, 'min_child_weight': 231, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.014950936465569798, 'alpha': 0.28520156840812494, 'subsample': 0.6}\n",
395 | "xgbReg = train(xgb.XGBRegressor(**params, tree_method='gpu_hist', random_state=42, predictor='gpu_predictor', learning_rate=0.01, nthread=-1))\n",
396 | "\n",
397 | "# 0.6744648190960726"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 7,
403 | "metadata": {
404 | "execution": {
405 | "iopub.execute_input": "2021-01-31T12:00:55.643600Z",
406 | "iopub.status.busy": "2021-01-31T12:00:55.642518Z",
407 | "iopub.status.idle": "2021-01-31T12:00:55.652566Z",
408 | "shell.execute_reply": "2021-01-31T12:00:55.650357Z"
409 | },
410 | "papermill": {
411 | "duration": 0.070897,
412 | "end_time": "2021-01-31T12:00:55.652729",
413 | "exception": false,
414 | "start_time": "2021-01-31T12:00:55.581832",
415 | "status": "completed"
416 | },
417 | "tags": []
418 | },
419 | "outputs": [],
420 | "source": [
421 | "def objectiveLGBM(trial: Trial, X, y, test):\n",
422 | " param = {\n",
423 | " 'objective': 'regression',\n",
424 | " 'metric': 'root_mean_squared_error',\n",
425 | " 'verbosity': -1,\n",
426 | " 'boosting_type': 'gbdt',\n",
427 | " 'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),\n",
428 | " 'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),\n",
429 | " 'num_leaves': trial.suggest_int('num_leaves', 2, 512),\n",
430 | " 'learning_rate': 0.01,\n",
431 | " 'n_estimators': trial.suggest_int('n_estimators', 700, 3000),\n",
432 | " 'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),\n",
433 | " 'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),\n",
434 | " 'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),\n",
435 | " 'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),\n",
436 | " 'device':\"gpu\",\n",
437 | " 'gpu_use_dp':True\n",
438 | " }\n",
439 | " X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size=0.1)\n",
440 | " \n",
441 | " lgbm_regr = LGBMRegressor(**param)\n",
442 | " lgbm_regr = lgbm_regr.fit(X_train, y_train, verbose=False)\n",
443 | " \n",
444 | " score = mean_squared_error(lgbm_regr.predict(X_test), y_test, squared=False)\n",
445 | " return score"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 8,
451 | "metadata": {
452 | "execution": {
453 | "iopub.execute_input": "2021-01-31T12:00:55.729840Z",
454 | "iopub.status.busy": "2021-01-31T12:00:55.729008Z",
455 | "iopub.status.idle": "2021-01-31T12:50:00.611181Z",
456 | "shell.execute_reply": "2021-01-31T12:50:00.610578Z"
457 | },
458 | "papermill": {
459 | "duration": 2944.920069,
460 | "end_time": "2021-01-31T12:50:00.611306",
461 | "exception": false,
462 | "start_time": "2021-01-31T12:00:55.691237",
463 | "status": "completed"
464 | },
465 | "tags": []
466 | },
467 | "outputs": [
468 | {
469 | "name": "stderr",
470 | "output_type": "stream",
471 | "text": [
472 | "\u001b[32m[I 2021-01-31 12:00:55,727]\u001b[0m A new study created in memory with name: no-name-905cd327-cea7-4cb3-8357-197a3832860b\u001b[0m\n",
473 | "\u001b[32m[I 2021-01-31 12:03:06,069]\u001b[0m Trial 0 finished with value: 0.6966205326623366 and parameters: {'lambda_l1': 5.849892300490624e-06, 'lambda_l2': 1.0985904987808198e-08, 'num_leaves': 146, 'n_estimators': 1673, 'feature_fraction': 0.8305103691181817, 'bagging_fraction': 0.6317746830544261, 'bagging_freq': 2, 'min_child_samples': 84}. Best is trial 0 with value: 0.6966205326623366.\u001b[0m\n",
474 | "\u001b[32m[I 2021-01-31 12:04:01,637]\u001b[0m Trial 1 finished with value: 0.7001111417396245 and parameters: {'lambda_l1': 4.533754839995915e-08, 'lambda_l2': 7.069857056821609e-07, 'num_leaves': 35, 'n_estimators': 1488, 'feature_fraction': 0.5020424533902141, 'bagging_fraction': 0.6726453486940405, 'bagging_freq': 6, 'min_child_samples': 98}. Best is trial 0 with value: 0.6966205326623366.\u001b[0m\n",
475 | "\u001b[32m[I 2021-01-31 12:05:43,097]\u001b[0m Trial 2 finished with value: 0.6994264817598198 and parameters: {'lambda_l1': 0.01918343702312786, 'lambda_l2': 0.06061507145964506, 'num_leaves': 160, 'n_estimators': 1375, 'feature_fraction': 0.9726333823896037, 'bagging_fraction': 0.6317438857784323, 'bagging_freq': 4, 'min_child_samples': 33}. Best is trial 0 with value: 0.6966205326623366.\u001b[0m\n",
476 | "\u001b[32m[I 2021-01-31 12:09:55,840]\u001b[0m Trial 3 finished with value: 0.6962075832414456 and parameters: {'lambda_l1': 1.1783663255075734e-06, 'lambda_l2': 1.5587806773470125, 'num_leaves': 286, 'n_estimators': 2942, 'feature_fraction': 0.7193705637082513, 'bagging_fraction': 0.4790113483246446, 'bagging_freq': 4, 'min_child_samples': 5}. Best is trial 3 with value: 0.6962075832414456.\u001b[0m\n",
477 | "\u001b[32m[I 2021-01-31 12:13:14,628]\u001b[0m Trial 4 finished with value: 0.6949611615253513 and parameters: {'lambda_l1': 0.04151984387118269, 'lambda_l2': 1.0397327592044077, 'num_leaves': 219, 'n_estimators': 2862, 'feature_fraction': 0.6199913602490396, 'bagging_fraction': 0.9563375356864136, 'bagging_freq': 5, 'min_child_samples': 56}. Best is trial 4 with value: 0.6949611615253513.\u001b[0m\n",
478 | "\u001b[32m[I 2021-01-31 12:16:21,772]\u001b[0m Trial 5 finished with value: 0.6967644495675683 and parameters: {'lambda_l1': 0.3686036629504521, 'lambda_l2': 0.019552488101672896, 'num_leaves': 450, 'n_estimators': 1394, 'feature_fraction': 0.9947624280621166, 'bagging_fraction': 0.4801812192227328, 'bagging_freq': 7, 'min_child_samples': 17}. Best is trial 4 with value: 0.6949611615253513.\u001b[0m\n",
479 | "\u001b[32m[I 2021-01-31 12:20:26,429]\u001b[0m Trial 6 finished with value: 0.7003945586310989 and parameters: {'lambda_l1': 0.036639372901604635, 'lambda_l2': 3.031838881595904, 'num_leaves': 398, 'n_estimators': 1868, 'feature_fraction': 0.6182426388229039, 'bagging_fraction': 0.890820933378503, 'bagging_freq': 1, 'min_child_samples': 91}. Best is trial 4 with value: 0.6949611615253513.\u001b[0m\n",
480 | "\u001b[32m[I 2021-01-31 12:22:41,520]\u001b[0m Trial 7 finished with value: 0.693323470317844 and parameters: {'lambda_l1': 7.503057673677594e-05, 'lambda_l2': 6.149782129041217e-05, 'num_leaves': 348, 'n_estimators': 1148, 'feature_fraction': 0.5294521356189122, 'bagging_fraction': 0.4921220765095396, 'bagging_freq': 1, 'min_child_samples': 82}. Best is trial 7 with value: 0.693323470317844.\u001b[0m\n",
481 | "\u001b[32m[I 2021-01-31 12:25:32,001]\u001b[0m Trial 8 finished with value: 0.6980029109150448 and parameters: {'lambda_l1': 6.869055456909293e-05, 'lambda_l2': 0.0009513864054031107, 'num_leaves': 211, 'n_estimators': 2113, 'feature_fraction': 0.9771291252694632, 'bagging_fraction': 0.5536644090417298, 'bagging_freq': 7, 'min_child_samples': 84}. Best is trial 7 with value: 0.693323470317844.\u001b[0m\n",
482 | "\u001b[32m[I 2021-01-31 12:30:26,033]\u001b[0m Trial 9 finished with value: 0.6937886976373929 and parameters: {'lambda_l1': 1.1400391669641804e-06, 'lambda_l2': 2.7196101544118676e-06, 'num_leaves': 474, 'n_estimators': 2573, 'feature_fraction': 0.6012234190148116, 'bagging_fraction': 0.8348947646092013, 'bagging_freq': 4, 'min_child_samples': 67}. Best is trial 7 with value: 0.693323470317844.\u001b[0m\n",
483 | "\u001b[32m[I 2021-01-31 12:32:07,627]\u001b[0m Trial 10 finished with value: 0.6908095699684338 and parameters: {'lambda_l1': 9.13896277171559, 'lambda_l2': 1.1206649989321402e-05, 'num_leaves': 338, 'n_estimators': 825, 'feature_fraction': 0.41317513422837376, 'bagging_fraction': 0.7766613375334941, 'bagging_freq': 2, 'min_child_samples': 69}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
484 | "\u001b[32m[I 2021-01-31 12:33:35,711]\u001b[0m Trial 11 finished with value: 0.7011811670099828 and parameters: {'lambda_l1': 0.0005723285101720917, 'lambda_l2': 1.8504941049272386e-05, 'num_leaves': 332, 'n_estimators': 766, 'feature_fraction': 0.4191456797763409, 'bagging_fraction': 0.7744257787545956, 'bagging_freq': 2, 'min_child_samples': 68}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
485 | "\u001b[32m[I 2021-01-31 12:35:04,745]\u001b[0m Trial 12 finished with value: 0.6952629169206984 and parameters: {'lambda_l1': 0.00026429476636171604, 'lambda_l2': 0.00012895805241730098, 'num_leaves': 348, 'n_estimators': 707, 'feature_fraction': 0.4077967120279897, 'bagging_fraction': 0.7394928089370464, 'bagging_freq': 1, 'min_child_samples': 44}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
486 | "\u001b[32m[I 2021-01-31 12:37:06,511]\u001b[0m Trial 13 finished with value: 0.7002502451323074 and parameters: {'lambda_l1': 5.5415533189696715, 'lambda_l2': 1.213420375199967e-07, 'num_leaves': 379, 'n_estimators': 1033, 'feature_fraction': 0.48546474800554584, 'bagging_fraction': 0.4081490818860416, 'bagging_freq': 2, 'min_child_samples': 72}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
487 | "\u001b[32m[I 2021-01-31 12:38:54,020]\u001b[0m Trial 14 finished with value: 0.6978771863599228 and parameters: {'lambda_l1': 1.6409410481605596e-08, 'lambda_l2': 0.0007527744290259774, 'num_leaves': 297, 'n_estimators': 1001, 'feature_fraction': 0.5028352505804287, 'bagging_fraction': 0.9973269614147429, 'bagging_freq': 3, 'min_child_samples': 54}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
488 | "\u001b[32m[I 2021-01-31 12:41:33,808]\u001b[0m Trial 15 finished with value: 0.6964051644254869 and parameters: {'lambda_l1': 0.002370108140172043, 'lambda_l2': 2.5249891905321112e-05, 'num_leaves': 507, 'n_estimators': 1062, 'feature_fraction': 0.42339673757771895, 'bagging_fraction': 0.833235329523894, 'bagging_freq': 1, 'min_child_samples': 100}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
489 | "\u001b[32m[I 2021-01-31 12:43:16,423]\u001b[0m Trial 16 finished with value: 0.6969590720277937 and parameters: {'lambda_l1': 6.038797451151379, 'lambda_l2': 4.8200311199363066e-08, 'num_leaves': 419, 'n_estimators': 779, 'feature_fraction': 0.5545450823212372, 'bagging_fraction': 0.5658564109863532, 'bagging_freq': 3, 'min_child_samples': 77}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
490 | "\u001b[32m[I 2021-01-31 12:45:04,638]\u001b[0m Trial 17 finished with value: 0.698717690308695 and parameters: {'lambda_l1': 2.4342000496075205e-05, 'lambda_l2': 5.079845168845365e-06, 'num_leaves': 231, 'n_estimators': 1224, 'feature_fraction': 0.7692217524283714, 'bagging_fraction': 0.4151884100501966, 'bagging_freq': 3, 'min_child_samples': 61}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
491 | "\u001b[32m[I 2021-01-31 12:48:39,206]\u001b[0m Trial 18 finished with value: 0.6977293083727247 and parameters: {'lambda_l1': 1.7980352493868115e-07, 'lambda_l2': 0.008597599988321873, 'num_leaves': 341, 'n_estimators': 2032, 'feature_fraction': 0.4546736326244339, 'bagging_fraction': 0.7438018579123211, 'bagging_freq': 1, 'min_child_samples': 41}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n",
492 | "\u001b[32m[I 2021-01-31 12:50:00,562]\u001b[0m Trial 19 finished with value: 0.6983050029069896 and parameters: {'lambda_l1': 0.0024664927046054654, 'lambda_l2': 9.872111806265727e-05, 'num_leaves': 285, 'n_estimators': 706, 'feature_fraction': 0.5583690366256002, 'bagging_fraction': 0.8230364910420525, 'bagging_freq': 2, 'min_child_samples': 79}. Best is trial 10 with value: 0.6908095699684338.\u001b[0m\n"
493 | ]
494 | },
495 | {
496 | "name": "stdout",
497 | "output_type": "stream",
498 | "text": [
499 | "Best trial: score 0.6908095699684338,\n",
500 | "params {'lambda_l1': 9.13896277171559, 'lambda_l2': 1.1206649989321402e-05, 'num_leaves': 338, 'n_estimators': 825, 'feature_fraction': 0.41317513422837376, 'bagging_fraction': 0.7766613375334941, 'bagging_freq': 2, 'min_child_samples': 69}\n"
501 | ]
502 | }
503 | ],
504 | "source": [
505 | "study = optuna.create_study(direction='minimize',sampler=TPESampler())\n",
506 | "study.optimize(lambda trial : objectiveLGBM(trial, X, y, X_test), n_trials=20)\n",
507 | "print('Best trial: score {},\\nparams {}'.format(study.best_trial.value,study.best_trial.params))\n",
508 | "\n",
509 | "best_param2 = study.best_trial.params\n",
510 | "lgbm = LGBMRegressor(**best_param2, device=\"gpu\",gpu_use_dp=True, objective='regression', metric='root_mean_squared_error', learning_rate= 0.01, boosting_type='gbdt')\n",
511 | "\n",
512 | "# Best trial: score 0.6934602592622415,\n",
513 | "# params {'lambda_l1': 4.168316306871167e-05, 'lambda_l2': 1.1288557039193647e-05, 'num_leaves': 98, 'n_estimators': 2280, 'feature_fraction': 0.7977209715952681, 'bagging_fraction': 0.4353577523638581, 'bagging_freq': 4, 'min_child_samples': 69}"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 9,
519 | "metadata": {
520 | "execution": {
521 | "iopub.execute_input": "2021-01-31T12:50:00.703892Z",
522 | "iopub.status.busy": "2021-01-31T12:50:00.703101Z",
523 | "iopub.status.idle": "2021-01-31T13:10:58.909329Z",
524 | "shell.execute_reply": "2021-01-31T13:10:58.909842Z"
525 | },
526 | "papermill": {
527 | "duration": 1258.257938,
528 | "end_time": "2021-01-31T13:10:58.910027",
529 | "exception": false,
530 | "start_time": "2021-01-31T12:50:00.652089",
531 | "status": "completed"
532 | },
533 | "tags": []
534 | },
535 | "outputs": [
536 | {
537 | "name": "stderr",
538 | "output_type": "stream",
539 | "text": [
540 | "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_split.py:297: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.\n",
541 | " FutureWarning\n"
542 | ]
543 | },
544 | {
545 | "name": "stdout",
546 | "output_type": "stream",
547 | "text": [
548 | "task: [regression]\n",
549 | "metric: [mean_squared_error]\n",
550 | "mode: [oof_pred_bag]\n",
551 | "n_models: [5]\n",
552 | "\n",
553 | "model 0: [LGBMRegressor]\n",
554 | " fold 0: [0.48895248]\n",
555 | " fold 1: [0.48742877]\n",
556 | " fold 2: [0.48392999]\n",
557 | " fold 3: [0.48360045]\n",
558 | " fold 4: [0.48409103]\n",
559 | " ----\n",
560 | " MEAN: [0.48560055] + [0.00217475]\n",
561 | " FULL: [0.48560055]\n",
562 | "\n",
563 | "model 1: [CatBoostRegressor]\n",
564 | " fold 0: [0.51095145]\n",
565 | " fold 1: [0.50854355]\n",
566 | " fold 2: [0.50425050]\n",
567 | " fold 3: [0.50109258]\n",
568 | " fold 4: [0.50131361]\n",
569 | " ----\n",
570 | " MEAN: [0.50523034] + [0.00392760]\n",
571 | " FULL: [0.50523034]\n",
572 | "\n",
573 | "model 2: [SGDRegressor]\n",
574 | " fold 0: [0.53226601]\n",
575 | " fold 1: [0.52932436]\n",
576 | " fold 2: [0.52919457]\n",
577 | " fold 3: [0.52505756]\n",
578 | " fold 4: [0.52478973]\n",
579 | " ----\n",
580 | " MEAN: [0.52812644] + [0.00283773]\n",
581 | " FULL: [0.52812644]\n",
582 | "\n",
583 | "model 3: [HistGradientBoostingRegressor]\n",
584 | " fold 0: [0.51027910]\n",
585 | " fold 1: [0.50796183]\n",
586 | " fold 2: [0.50614704]\n",
587 | " fold 3: [0.50462725]\n",
588 | " fold 4: [0.50417062]\n",
589 | " ----\n",
590 | " MEAN: [0.50663717] + [0.00225310]\n",
591 | " FULL: [0.50663717]\n",
592 | "\n",
593 | "model 4: [XGBRegressor]\n",
594 | " fold 0: [0.48842847]\n",
595 | " fold 1: [0.48704542]\n",
596 | " fold 2: [0.48337367]\n",
597 | " fold 3: [0.48344787]\n",
598 | " fold 4: [0.48386448]\n",
599 | " ----\n",
600 | " MEAN: [0.48523198] + [0.00209822]\n",
601 | " FULL: [0.48523198]\n",
602 | "\n",
603 | "0.040654579526564295\n"
604 | ]
605 | }
606 | ],
607 | "source": [
608 | "from vecstack import stacking\n",
609 | "\n",
610 | "final_model = xgb.XGBRegressor(n_estimators= 2000, max_depth= 16,tree_method='gpu_hist', predictor='gpu_predictor')\n",
611 | "sgd = SGDRegressor(max_iter=1000)\n",
612 | "hgb = HistGradientBoostingRegressor( max_depth=3, min_samples_leaf=1)\n",
613 | "cat = CatBoostRegressor(task_type=\"GPU\", verbose=False)\n",
614 | "\n",
615 | "estimators = [\n",
616 | " lgbm, cat, sgd, hgb, xgbReg\n",
617 | "]\n",
618 | "\n",
619 | "S_train, S_test = stacking(estimators, X, y, X_test, regression=True, metric=mean_squared_error, n_folds=5, \n",
620 | " shuffle=False, random_state=0, verbose=2)\n",
621 | "final_model.fit(S_train, y)\n",
622 | "\n",
623 | "print(mean_squared_error(final_model.predict(S_train), y, squared=False))"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 10,
629 | "metadata": {
630 | "execution": {
631 | "iopub.execute_input": "2021-01-31T13:10:59.019687Z",
632 | "iopub.status.busy": "2021-01-31T13:10:59.018879Z",
633 | "iopub.status.idle": "2021-01-31T13:11:01.877705Z",
634 | "shell.execute_reply": "2021-01-31T13:11:01.878192Z"
635 | },
636 | "papermill": {
637 | "duration": 2.920554,
638 | "end_time": "2021-01-31T13:11:01.878331",
639 | "exception": false,
640 | "start_time": "2021-01-31T13:10:58.957777",
641 | "status": "completed"
642 | },
643 | "tags": []
644 | },
645 | "outputs": [
646 | {
647 | "data": {
648 | "text/plain": [
649 | "['/kaggle/working/skacking.pkl']"
650 | ]
651 | },
652 | "execution_count": 10,
653 | "metadata": {},
654 | "output_type": "execute_result"
655 | }
656 | ],
657 | "source": [
658 | "submission = pd.read_csv(\"/kaggle/input/tabular-playground-series-jan-2021/test.csv\", index_col=[\"id\"])\n",
659 | "y_hat = final_model.predict(S_test) \n",
660 | "\n",
661 | "submission[\"target\"] = y_hat \n",
662 | "submission[[\"target\"]].to_csv(\"/kaggle/working/submission_stacking.csv\")\n",
663 | "joblib.dump(final_model, '/kaggle/working/skacking.pkl')"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 11,
669 | "metadata": {
670 | "execution": {
671 | "iopub.execute_input": "2021-01-31T13:11:01.985522Z",
672 | "iopub.status.busy": "2021-01-31T13:11:01.984661Z",
673 | "iopub.status.idle": "2021-01-31T13:15:05.734317Z",
674 | "shell.execute_reply": "2021-01-31T13:15:05.734845Z"
675 | },
676 | "papermill": {
677 | "duration": 243.808788,
678 | "end_time": "2021-01-31T13:15:05.734995",
679 | "exception": false,
680 | "start_time": "2021-01-31T13:11:01.926207",
681 | "status": "completed"
682 | },
683 | "tags": []
684 | },
685 | "outputs": [
686 | {
687 | "name": "stdout",
688 | "output_type": "stream",
689 | "text": [
690 | "0.6632514386518857\n"
691 | ]
692 | },
693 | {
694 | "data": {
695 | "text/plain": [
696 | "['/kaggle/working/lgbm.pkl']"
697 | ]
698 | },
699 | "execution_count": 11,
700 | "metadata": {},
701 | "output_type": "execute_result"
702 | }
703 | ],
704 | "source": [
705 | "submission = pd.read_csv(\"/kaggle/input/tabular-playground-series-jan-2021/test.csv\", index_col=[\"id\"])\n",
706 | "lgbm = LGBMRegressor(**best_param2, device=\"gpu\",gpu_use_dp=True, objective='regression', learning_rate= 0.01, metric='root_mean_squared_error', boosting_type='gbdt')\n",
707 | "lgbm = lgbm.fit(X, y, verbose=False)\n",
708 | "y_hat = lgbm.predict(submission.to_numpy()) \n",
709 | "print(mean_squared_error(lgbm.predict(X), y, squared=False))\n",
710 | "submission[\"target\"] = y_hat \n",
711 | "submission[[\"target\"]].to_csv(\"/kaggle/working/submission_lgbm.csv\")\n",
712 | "joblib.dump(lgbm, '/kaggle/working/lgbm.pkl')"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 12,
718 | "metadata": {
719 | "execution": {
720 | "iopub.execute_input": "2021-01-31T13:15:05.846189Z",
721 | "iopub.status.busy": "2021-01-31T13:15:05.845293Z",
722 | "iopub.status.idle": "2021-01-31T13:15:28.840381Z",
723 | "shell.execute_reply": "2021-01-31T13:15:28.839338Z"
724 | },
725 | "papermill": {
726 | "duration": 23.054885,
727 | "end_time": "2021-01-31T13:15:28.840515",
728 | "exception": false,
729 | "start_time": "2021-01-31T13:15:05.785630",
730 | "status": "completed"
731 | },
732 | "tags": []
733 | },
734 | "outputs": [
735 | {
736 | "name": "stdout",
737 | "output_type": "stream",
738 | "text": [
739 | "0.6635022057160923\n",
740 | "0.6666673015230643\n"
741 | ]
742 | },
743 | {
744 | "data": {
745 | "text/plain": [
746 | "['/kaggle/working/xgb_reg.pkl']"
747 | ]
748 | },
749 | "execution_count": 12,
750 | "metadata": {},
751 | "output_type": "execute_result"
752 | }
753 | ],
754 | "source": [
755 | "submission = pd.read_csv(\"/kaggle/input/tabular-playground-series-jan-2021/test.csv\", index_col=[\"id\"])\n",
756 | "params = {'n_estimators': 3520, 'max_depth': 11, 'min_child_weight': 231, 'gamma': 2, 'colsample_bytree': 0.7, 'lambda': 0.014950936465569798, 'alpha': 0.28520156840812494, 'subsample': 0.6}\n",
757 | "xgbReg = train(xgb.XGBRegressor(**params, tree_method='gpu_hist', random_state=42, predictor='gpu_predictor', learning_rate=0.01, nthread=-1))\n",
758 | "\n",
759 | "y_hat = xgbReg.predict(submission.to_numpy()) \n",
760 | "print(mean_squared_error(xgbReg.predict(X), y, squared=False))\n",
761 | "\n",
762 | "submission[\"target\"] = y_hat \n",
763 | "submission[[\"target\"]].to_csv(\"/kaggle/working/submission_xgb.csv\")\n",
764 | "joblib.dump(xgbReg, '/kaggle/working/xgb_reg.pkl')"
765 | ]
766 | }
767 | ],
768 | "metadata": {
769 | "kernelspec": {
770 | "display_name": "Python 3",
771 | "language": "python",
772 | "name": "python3"
773 | },
774 | "language_info": {
775 | "codemirror_mode": {
776 | "name": "ipython",
777 | "version": 3
778 | },
779 | "file_extension": ".py",
780 | "mimetype": "text/x-python",
781 | "name": "python",
782 | "nbconvert_exporter": "python",
783 | "pygments_lexer": "ipython3",
784 | "version": "3.7.6"
785 | },
786 | "papermill": {
787 | "duration": 5814.134527,
788 | "end_time": "2021-01-31T13:15:29.301782",
789 | "environment_variables": {},
790 | "exception": null,
791 | "input_path": "__notebook__.ipynb",
792 | "output_path": "__notebook__.ipynb",
793 | "parameters": {},
794 | "start_time": "2021-01-31T11:38:35.167255",
795 | "version": "2.1.0"
796 | }
797 | },
798 | "nbformat": 4,
799 | "nbformat_minor": 4
800 | }
801 |
--------------------------------------------------------------------------------
/regex_nlp_kko/clustering.py:
--------------------------------------------------------------------------------
1 | from sklearn.cluster import DBSCAN
2 | from gensim.models import Word2Vec
3 |
4 | import pandas as pd
5 | import re
6 |
7 | import matplotlib.pyplot as plt
8 | import matplotlib.font_manager as fm
9 |
10 |
def cluster(eps, min_sample):
    """Run DBSCAN over the saved word vectors and export word/cluster pairs.

    Only words made of two or more Hangul syllables are kept, and DBSCAN
    noise points (cluster == -1) are dropped.  The result is written to
    ./result/cluster.xlsx.

    :param eps: DBSCAN neighbourhood radius
    :param min_sample: DBSCAN min_samples (core-point threshold)
    """
    # Load the trained word-vector model.
    model = Word2Vec.load("./result/embedding.model")

    word_vector = model.wv.vectors
    match_index = model.wv.index2word
    # NOTE(review): init_sims runs after word_vector was captured, so DBSCAN
    # may operate on the un-normalized vectors — confirm whether the
    # normalization was meant to apply to the clustering input.
    model.init_sims(replace=True)

    # Keep only words consisting of two or more Hangul syllables.
    han = re.compile(r"[가-힣]{2,}")

    # Apply the DBSCAN algorithm.
    dbscan = DBSCAN(eps=eps, min_samples=min_sample)
    clusters = dbscan.fit_predict(word_vector)

    df = pd.DataFrame(clusters, columns=["cluster"], index=match_index).reset_index()
    df.columns = ["word", "cluster"]
    print(df.head())

    # Hangul-only filtering.
    df = df[df["word"].apply(lambda x: len(han.findall(x)) > 0)]

    # Remove noise points.
    df = df[df["cluster"] != -1]

    print(df.groupby(["cluster"]).count())

    # Pass the path directly so pandas creates, saves and closes the workbook
    # itself; the previous code handed in an ExcelWriter that was never
    # saved, so the file could end up missing or empty.
    df.to_excel("./result/cluster.xlsx", index=False)
39 |
40 |
def plot():
    """Scatter-plot the clustered words, stacking each cluster's labels vertically."""
    frame = pd.read_excel(pd.ExcelFile("./result/cluster.xlsx"))
    lo = frame["cluster"].min()
    hi = frame["cluster"].max()

    print(lo, hi)
    # Give every word within a cluster its own y offset (0, 3, 6, ...) so the
    # text labels in one column do not sit on top of each other.
    for label in range(lo, hi + 1):
        members = frame[frame["cluster"] == label].index
        frame.loc[members, "value"] = list(range(0, len(members) * 3, 3))

    hangul_font = fm.FontProperties(fname="./font/NanumGothic.ttf", size=12)

    fig, axis = plt.subplots()
    frame.plot.scatter(x="cluster", y="value", ax=axis)
    frame[["cluster", "value", "word"]].apply(
        lambda row: axis.text(*row, fontproperties=hangul_font), axis=1)
    plt.show()
57 |
58 |
if __name__ == '__main__':
    # Cluster the saved word embedding with DBSCAN, then plot the groups.
    cluster(eps=0.75, min_sample=6)
    plot()
--------------------------------------------------------------------------------
/regex_nlp_kko/font/NanumBrush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/font/NanumBrush.ttf
--------------------------------------------------------------------------------
/regex_nlp_kko/font/NanumGothic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/font/NanumGothic.ttf
--------------------------------------------------------------------------------
/regex_nlp_kko/font/denne.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/font/denne.png
--------------------------------------------------------------------------------
/regex_nlp_kko/kkma_token.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 |
4 | from konlpy.tag import Kkma
5 |
6 |
def get_noun(msg_txt):
    """Extract common (NNG) and proper (NNP) nouns from one chat message.

    Stand-alone Hangul jamo runs (e.g. "ㅋㅋㅋ", "ㅠㅠ") are stripped before
    POS tagging.  Prints the message and its nouns as progress output.

    :param msg_txt: raw message text
    :return: list of noun keywords (possibly empty)
    """
    kkma = Kkma()
    nouns = list()
    # Remove runs of bare consonants/vowels that Kkma cannot tag usefully.
    pattern = re.compile("[ㄱ-ㅎㅏ-ㅣ]+")
    msg_txt = re.sub(pattern, "", msg_txt).strip()

    if len(msg_txt) > 0:
        # `tag` instead of `type` — avoid shadowing the builtin.
        for keyword, tag in kkma.pos(msg_txt):
            # NNG: common noun, NNP: proper noun
            if tag in ("NNG", "NNP"):
                nouns.append(keyword)
        print(msg_txt, "->", nouns)

    return nouns
22 |
23 |
def get_all_token(msg_txt):
    """Tokenize one chat message into every morpheme, regardless of POS tag.

    Stand-alone Hangul jamo runs (e.g. "ㅋㅋㅋ") are stripped before tagging.
    Prints the message and its tokens as progress output.

    :param msg_txt: raw message text
    :return: list of all morpheme tokens (possibly empty)
    """
    kkma = Kkma()
    # Remove runs of bare consonants/vowels that Kkma cannot tag usefully.
    pattern = re.compile("[ㄱ-ㅎㅏ-ㅣ]+")
    msg_txt = re.sub(pattern, "", msg_txt).strip()

    # Renamed from `nouns`: this function keeps *every* token, not just nouns.
    tokens = list()
    if len(msg_txt) > 0:
        tokens = [keyword for keyword, _tag in kkma.pos(msg_txt)]
        print(msg_txt, "->", tokens)

    return tokens
37 |
38 |
if __name__ == '__main__':
    raw_data = pd.read_csv("./result/kko_regex.csv")
    print(raw_data.head())
    # Drop rows with missing contents before tokenizing.
    raw_data = raw_data.dropna()

    # Noun-only tokens (consumed by the word-cloud script).
    raw_data["token"] = raw_data["contents"].apply(lambda x: get_noun(x))
    raw_data.to_csv("./result/noun_token.csv", index=False)

    # Every morpheme (consumed by the embedding training script).
    raw_data["token"] = raw_data["contents"].apply(lambda x: get_all_token(x))
    raw_data.to_csv("./result/all_token.csv", index=False)
49 |
50 |
--------------------------------------------------------------------------------
/regex_nlp_kko/mwordcloud.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import ast
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 |
6 | from PIL import Image
7 | from wordcloud import WordCloud
8 |
9 |
def get_except_keyword(filename):
    """Read a newline-separated stop-word file and return the keywords.

    :param filename: path to a UTF-8 text file, one keyword per line
    :return: list of stripped keywords, in file order
    """
    with open(filename, encoding='utf-8') as f:
        # Comprehension instead of a manual append loop; iterating the file
        # object directly avoids materializing readlines() first.
        keyword_list = [keyword.strip() for keyword in f]
    print(keyword_list)
    return keyword_list
17 |
18 |
def draw_wordcloud(kkma_result):
    """Render a word cloud from noun-token rows.

    Explodes the per-message token lists into one row per token, counts
    token frequencies, filters stop-words and single-character tokens, then
    draws the cloud inside the ./font/denne.png image mask.

    :param kkma_result: DataFrame with Date/Speaker/timetype/time/contents
                        columns and a "token" column holding stringified
                        Python lists (as written by kkma_token.py)
    """
    # Split the list-valued "token" column into one column per token.
    tokens = pd.DataFrame(kkma_result["token"].apply(lambda x: ast.literal_eval(x)).tolist())

    tokens["Date"] = kkma_result["Date"]
    tokens["Speaker"] = kkma_result["Speaker"]
    tokens["timetype"] = kkma_result["timetype"]
    tokens["time"] = kkma_result["time"]
    tokens["contents"] = kkma_result["contents"]

    # Transpose + unstack turns the token columns into rows (one row per
    # token occurrence); dropna removes the padding from shorter lists.
    tokens = tokens.set_index(["Date", "Speaker", "timetype", "time", "contents"])
    tokens = tokens.T.unstack().dropna().reset_index()

    tokens.columns = ["Date", "Person", "time_type", "time", "sntc", "index", "token"]
    print(tokens.head())

    # Frequency aggregation per token.
    summary = tokens.groupby(["token"])["index"].count().reset_index()
    summary = summary.sort_values(["index"], ascending=[False]).reset_index(drop=True)

    # Filter out stop-words and single-character tokens.
    except_keyword = get_except_keyword("./raw_data/except_word.txt")
    summary = summary[summary["token"].apply(lambda x: x not in except_keyword)]
    summary = summary[summary["token"].apply(lambda x: len(x) > 1)]

    # Build the image mask that shapes the cloud.
    denne_mask = np.array(Image.open("./font/denne.png"))

    # Generate and display the word cloud.
    wc = WordCloud(font_path='./font/NanumBrush.ttf', background_color='white', mask=denne_mask, width=800, height=600).generate(" ".join(summary["token"]))
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
52 |
53 |
if __name__ == '__main__':
    # Build the word cloud from the noun tokens produced by kkma_token.py.
    kkma_result = pd.read_csv("./result/noun_token.csv")
    draw_wordcloud(kkma_result)
57 |
58 |
--------------------------------------------------------------------------------
/regex_nlp_kko/raw_data/except_similar.txt:
--------------------------------------------------------------------------------
1 | 오늘
2 | 내일
3 | 모레
4 | 어제
5 | 내가
6 | 나도
7 | 옹이
8 | 자수
9 | 보수
--------------------------------------------------------------------------------
/regex_nlp_kko/raw_data/except_word.txt:
--------------------------------------------------------------------------------
1 | 이모티콘
2 | 오늘
3 | 내일
4 | 모레
5 | 어제
6 | 사진
7 | 동영상
8 | 내가
9 | 나도
10 | 옹이
--------------------------------------------------------------------------------
/regex_nlp_kko/raw_data/kko.txt:
--------------------------------------------------------------------------------
1 | --------------- 2018년 1월 1일 월요일 ---------------
2 | [손찬호] [오전 12:00] 이모티콘
3 | [손찬호] [오전 12:00] 새해복
4 | [손찬호] [오전 12:00] 마니받아랑
5 | [손찬호] [오전 12:00] 이모티콘
6 | [손찬호] [오전 12:01] 2018년에도 행복하장^~^
7 | [여보♥] [오전 12:01] 야야 ㅋㅋㅋ 사장님도
8 | [여보♥] [오전 12:01] 복마니받고
9 | [여보♥] [오전 12:01] 올한해도 많은 추억을
10 | [여보♥] [오전 12:01] 보내보자구엿
11 | [여보♥] [오전 12:01] 이모티콘
12 |
--------------------------------------------------------------------------------
/regex_nlp_kko/regex.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 | import datetime as dt
4 |
5 |
def read_kko_msg(filename):
    """Read the raw KakaoTalk chat export and return its lines.

    :param filename: path to the UTF-8 export file
    :return: list of lines, each keeping its trailing newline
    """
    with open(filename, encoding='utf-8') as export_file:
        return export_file.readlines()
11 |
12 |
def apply_kko_regex(msg_list):
    """Parse exported KakaoTalk lines into a structured DataFrame.

    Date-separator lines update the current date; message lines are split
    into (speaker, am/pm marker, time, contents) with emoji stripped from
    the contents.  The result is also written to ./result/kko_regex.csv.

    :param msg_list: raw lines from the chat export
    :return: DataFrame with columns Date, Speaker, timetype, time, contents
    """
    # Raw strings keep the regex escapes intact; the previous non-raw
    # literals relied on Python passing unknown escapes ("\[", "\S")
    # through, which is deprecated and fragile.
    kko_pattern = re.compile(r"\[([\S\s]+)\] \[(오전|오후) ([0-9:\s]+)\] ([^\n]+)")
    kko_date_pattern = re.compile(r"--------------- ([0-9]+년 [0-9]+월 [0-9]+일) ")

    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)

    kko_parse_result = list()
    cur_date = ""

    for msg in msg_list:
        # Date separator line: remember the date for following messages.
        date_match = kko_date_pattern.findall(msg)  # run the regex once, not twice
        if len(date_match) > 0:
            cur_date = dt.datetime.strptime(date_match[0], "%Y년 %m월 %d일")
            cur_date = cur_date.strftime("%Y-%m-%d")
        else:
            kko_pattern_result = kko_pattern.findall(msg)
            if len(kko_pattern_result) > 0:
                tokens = list(kko_pattern_result[0])
                # Strip emoji characters from the message contents.
                tokens[-1] = re.sub(emoji_pattern, "", tokens[-1])
                tokens.insert(0, cur_date)
                kko_parse_result.append(tokens)

    kko_parse_result = pd.DataFrame(kko_parse_result, columns=["Date", "Speaker", "timetype", "time", "contents"])
    kko_parse_result.to_csv("./result/kko_regex.csv", index=False)

    return kko_parse_result
44 |
45 |
if __name__ == '__main__':
    # Parse the raw export; writes ./result/kko_regex.csv as a side effect.
    msg_list = read_kko_msg("./raw_data/kko.txt")
    apply_kko_regex(msg_list)
49 |
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_3.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_4.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_4.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_5.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_5.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_6.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_6.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_7.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_7.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.7/cluster_eps_8.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_3.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_4.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_4.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_5.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_5.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_6.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_6.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_7.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_7.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/DBSCAN_eps0.75/cluster_eps_8.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/all_token.csv:
--------------------------------------------------------------------------------
1 | Date,Speaker,timetype,time,contents,token
2 | 2018-01-01,손찬호,오전,12:00,이모티콘,['이모티콘']
3 | 2018-01-01,손찬호,오전,12:00,새해복,"['새해', '복']"
4 | 2018-01-01,손찬호,오전,12:00,마니받아랑,"['마니', '받아랑']"
5 | 2018-01-01,손찬호,오전,12:00,이모티콘,['이모티콘']
6 | 2018-01-01,손찬호,오전,12:01,2018년에도 행복하장^~^,"['2018', '년', '에', '도', '행복', '하장', '^~^']"
7 | 2018-01-01,여보♥,오전,12:01,야야 ㅋㅋㅋ 사장님도,"['야', '야', '사장님', '도']"
8 | 2018-01-01,여보♥,오전,12:01,복마니받고,"['복', '마니', '받', '고']"
9 | 2018-01-01,여보♥,오전,12:01,올한해도 많은 추억을,"['오', 'ㄹ', '한해', '도', '많', '은', '추억', '을']"
10 | 2018-01-01,여보♥,오전,12:01,보내보자구엿,"['보', '내', '보', '자', '구', '엿']"
11 |
--------------------------------------------------------------------------------
/regex_nlp_kko/result/cluster.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/cluster.xlsx
--------------------------------------------------------------------------------
/regex_nlp_kko/result/embedding.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ssooni/data_mining_practice/cd21e062155f8c1cb4f22a4449467dc55882b96a/regex_nlp_kko/result/embedding.model
--------------------------------------------------------------------------------
/regex_nlp_kko/result/kko_regex.csv:
--------------------------------------------------------------------------------
1 | Date,Speaker,timetype,time,contents
2 | 2018-01-01,손찬호,오전,12:00,이모티콘
3 | 2018-01-01,손찬호,오전,12:00,새해복
4 | 2018-01-01,손찬호,오전,12:00,마니받아랑
5 | 2018-01-01,손찬호,오전,12:00,이모티콘
6 | 2018-01-01,손찬호,오전,12:01,2018년에도 행복하장^~^
7 | 2018-01-01,여보♥,오전,12:01,야야 ㅋㅋㅋ 사장님도
8 | 2018-01-01,여보♥,오전,12:01,복마니받고
9 | 2018-01-01,여보♥,오전,12:01,올한해도 많은 추억을
10 | 2018-01-01,여보♥,오전,12:01,보내보자구엿
11 |
--------------------------------------------------------------------------------
/regex_nlp_kko/result/noun_token.csv:
--------------------------------------------------------------------------------
1 | Date,Speaker,timetype,time,contents,token
2 | 2018-01-01,손찬호,오전,12:00,이모티콘,['이모티콘']
3 | 2018-01-01,손찬호,오전,12:00,새해복,"['새해', '복']"
4 | 2018-01-01,손찬호,오전,12:00,마니받아랑,['마니']
5 | 2018-01-01,손찬호,오전,12:00,이모티콘,['이모티콘']
6 | 2018-01-01,손찬호,오전,12:01,2018년에도 행복하장^~^,"['행복', '하장']"
7 | 2018-01-01,여보♥,오전,12:01,야야 ㅋㅋㅋ 사장님도,['사장님']
8 | 2018-01-01,여보♥,오전,12:01,복마니받고,"['복', '마니']"
9 | 2018-01-01,여보♥,오전,12:01,올한해도 많은 추억을,"['한해', '추억']"
10 | 2018-01-01,여보♥,오전,12:01,보내보자구엿,"['구', '엿']"
11 |
--------------------------------------------------------------------------------
/regex_nlp_kko/similar_day.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import ast
3 | import re
4 |
5 | from sklearn.metrics.pairwise import linear_kernel
6 | from sklearn.feature_extraction.text import TfidfVectorizer
7 |
8 |
def preprocess(x, except_path="./raw_data/except_similar.txt"):
    """Filter one message's token list for the daily bag-of-words.

    Keeps only tokens containing two or more consecutive Hangul syllables,
    drops media-only messages ("사진"/"이모티콘"/"동영상" as the sole token)
    and drops tokens found in the exclusion file.

    Bug fix: the exclusion words were compared without stripping their
    trailing newlines from readlines(), so the exclusion list never matched
    anything.  The file is now read with `with` (closed deterministically)
    and each keyword is stripped.

    :param x: list of tokens for one message
    :param except_path: path to the newline-separated exclusion list
                        (parameterized, defaulting to the original path)
    :return: filtered list of tokens
    """
    result = list()
    with open(except_path, mode="r", encoding="utf-8") as f:
        # Set for O(1) membership tests; strip removes the trailing "\n".
        except_word = {line.strip() for line in f}
    han = re.compile(r"[가-힣]{2,}")

    for i in x:
        # Keep only tokens with two or more consecutive Hangul syllables.
        if len(han.findall(i)) == 0:
            continue
        # Drop chats that only contain an emoticon / photo / video marker.
        if len(x) == 1 and i in ("사진", "이모티콘", "동영상"):
            continue
        # Drop tokens on the exclusion list.
        if i in except_word:
            continue

        result.append(i)
    return result
27 |
28 |
def generate_daily_bow(tokens):
    """Build one bag-of-words document per day, TF-IDF vectorize them and
    report the most similar day pairs.

    Note: mutates the "token" column of the passed DataFrame in place
    (applies `preprocess` to every row).

    :param tokens: DataFrame with a "Date" column and a "token" column of
                   token lists
    :return: DataFrame [Date, Smlr_Date, Similarity] sorted by similarity
             descending (previously computed but not returned)
    """
    tokens["token"] = tokens["token"].apply(preprocess)  # no lambda wrapper needed

    # sorted() already returns a list; the extra sorted() calls on the loops
    # below were redundant and have been removed.
    date_list = sorted(set(tokens["Date"].tolist()))
    print("Date : ", len(date_list), " days")

    document_list = list()
    for date in date_list:
        daily_tokens = tokens[tokens["Date"] == date]
        document = ""
        for t in daily_tokens["token"].tolist():
            if len(" ".join(t)) > 0:
                document += " "
                document += " ".join(t)

        print(date, document[0:100])
        document_list.append(document)

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(document_list)

    # For every day, collect its most similar other days.
    similar_result = list()
    for index, date in enumerate(date_list):
        for i, score in find_similar(X, index):
            similar_result.append([date, date_list[i], score])

    df = pd.DataFrame(similar_result, columns=["Date", "Smlr_Date", "Similarity"])
    df = df.sort_values(["Similarity"], ascending=False)
    print(df)
    return df
58 |
59 |
def find_similar(tfidf_matrix, index, top_n=5):
    """Return the top_n documents most similar to document `index`.

    :param tfidf_matrix: TF-IDF document-term matrix
    :param index: row index of the query document
    :param top_n: number of neighbours to return
    :return: list of (doc_index, cosine_similarity) pairs, most similar first
    """
    cosine_similarities = linear_kernel(tfidf_matrix[index:index + 1], tfidf_matrix).flatten()
    # Exclude the query document itself; `doc` instead of re-using `index`,
    # which shadowed the parameter inside the original comprehension.
    related_docs_indices = [doc for doc in cosine_similarities.argsort()[::-1] if doc != index]
    # Slice before building the pairs instead of materializing the full list.
    return [(doc, cosine_similarities[doc]) for doc in related_docs_indices[0:top_n]]
64 |
65 |
if __name__ == '__main__':
    # NOTE(review): the repo ships ./result/noun_token.csv; the "_1" suffix
    # looks like a locally generated variant — confirm the intended input.
    tokens = pd.read_csv("./result/noun_token_1.csv")
    # The "token" column is stored as a stringified list; parse it back.
    tokens["token"] = tokens["token"].apply(lambda x: ast.literal_eval(x))
    print(tokens.head())

    generate_daily_bow(tokens)
72 |
--------------------------------------------------------------------------------
/regex_nlp_kko/word2vector.py:
--------------------------------------------------------------------------------
1 | from gensim.models.word2vec import Word2Vec
2 | import ast
3 | import pandas as pd
4 |
5 | import logging
6 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
7 |
8 |
def create_model(filename, skip_gram=False):
    """Train a Word2Vec embedding from a token CSV and save it.

    Saves the model to ./result/embedding.model.

    :param filename: CSV with "contents" and "token" (stringified list) columns
    :param skip_gram: True for skip-gram (sg=1), False for CBOW (sg=0)
    """
    tokens = pd.read_csv(filename)
    # Drop messages that contain URLs.
    tokens = tokens[tokens["contents"].apply(lambda x: 'http' not in x)]

    sentence = tokens["token"].apply(lambda x: ast.literal_eval(x)).tolist()

    # The two branches of the original if/else differed only in `sg`;
    # collapse the duplication into a single call.
    model = Word2Vec(sentence, min_count=10, iter=20, size=300,
                     sg=1 if skip_gram else 0)

    model.init_sims(replace=True)
    model.save("./result/embedding.model")
22 |
23 |
def most_similar():
    """Load the saved embedding and print the nearest neighbours of a few probe words."""
    model = Word2Vec.load("./result/embedding.model")
    probes = [
        ("용돈과 관련된 키워드 : ", "용돈"),
        ("졍이와 관련된 키워드 : ", "졍이"),
        ("쭈니와 관련된 키워드 : ", "쭈니"),
    ]
    for label, word in probes:
        print(label, model.most_similar(word))
29 |
30 |
if __name__ == '__main__':
    # Training is commented out; by default only query the saved model.
    # create_model("./result/all_token_1.csv")
    most_similar()
34 |
--------------------------------------------------------------------------------