├── README.md
├── mkb_benchmark.ipynb
├── dj_recomgenerator.ipynb
├── dj_sas_benchmark.ipynb
├── dj_benchmark_GMSC_01.ipynb
├── Benchmark_mmp_digital_reputation_challenge_1.ipynb
├── dj_invest_GMSC.ipynb
└── dj_Benchmark_12trip.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # notebooks
2 | Некоторые фрагменты кода
3 |
4 |
5 | [Бенчмарк для студентов 2 курса ВМК МГУ](dj_benchmark_GMSC_01.ipynb) для задачи https://www.kaggle.com/c/msu-iml-2018
6 |
7 | [Чуть подробнее для 4 курса ВМК МГУ](dj_invest_GMSC.ipynb) для задачи https://www.kaggle.com/c/msu-iml-2018
8 |
9 | [Бенчмарк для студентов 5 курса ВМК МГУ](Benchmark_mmp_digital_reputation_challenge_1.ipynb) для задачи https://boosters.pro/championship/digital_reputation_challenge/overview
10 |
11 | [Бенчмарк для студентов 5 курса ВМК МГУ](dj_sas_benchmark.ipynb) для задачи https://sascompetitions.ru
12 |
13 | [Бенчмарк для студентов 2 курса ВМК МГУ](dj_Benchmark_12trip.ipynb) для задачи 1 соревнования https://boosters.pro/championship/onetwotrip_challenge/overview
14 |
15 | [Бенчмарк для студентов ПЗАД](mkb_benchmark.ipynb) для задачи соревнования [Хакатон МКБ 2021](https://dsbattle.com/hackathons/mkb/)
16 |
--------------------------------------------------------------------------------
/mkb_benchmark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3e7947ff-db80-4afb-a61d-b708c8db3ddc",
6 | "metadata": {},
7 | "source": [
8 | "# примитивный вариант решения задачи MKB\n",
9 | "\n",
10 | "автор: Александр Дьяконов (https://dyakonov.org/ag/)\n",
11 | "\n",
12 | "цель: для оценки студентов своего курса (нужно за неделю побить этот бенчмарк)\n",
13 | "\n",
14 | "* решение записано за 20 минут\n",
15 | "* практически нет генерации признаков\n",
16 | "* все категории кодируются по мощности\n",
17 | "* пропуски -> -1\n",
18 | "* одна модель - lgb\n",
19 | "\n",
20 | "результат в лидерборде 0.8889 (на момент посылки ~15 место из 100)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 1,
26 | "id": "bc3393d8-99c9-44b2-8fd6-657cc085e38b",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import numpy as np\n",
31 | "import pandas as pd"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "id": "8f537e13-4136-4120-969f-1fde6a500f7e",
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "(17891, 124) (7330, 123)\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "data_train = pd.read_csv('train_dataset_hackathon_mkb.csv', encoding='cp1251', delimiter=';')\n",
50 | "data_test = pd.read_csv('test_dataset_hackathon_mkb.csv', encoding='cp1251', delimiter=';')\n",
51 | "print (data_train.shape, data_test.shape)"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "id": "378ad637-dac9-4b87-a38a-9d4dc11e1424",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "def makeX(data):\n",
62 | " # предобработка данных\n",
63 | " data['CITIZENSHIP_NAME'] = data['CITIZENSHIP_NAME'].fillna(-1).map({-1: -1, 'Российская Федерация': 4, 'Таджикистан': 3, 'Казахстан': 2, 'Армения': 1})\n",
64 | " data['SEX_NAME'] = data['SEX_NAME'].fillna(0).map({0: 0, 'мужской': 1, 'женский': -1})\n",
65 | " group_names = ['OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP'] + ['WORKERSRANGE', 'OKVED_CODE']\n",
66 | " date_names = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']\n",
67 | " for name in group_names + date_names + ['id_client']:\n",
68 | " data[name] = data[name].fillna(-1)\n",
69 | " tmp = data[name].value_counts()\n",
70 | " tmp = tmp + 0.1 * np.random.randn(len(tmp))\n",
71 | " data[name] = data[name].map(tmp)\n",
72 | " data.fillna(-1, inplace=True)\n",
73 | " return data"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "id": "d0fecd59-bce3-4315-8c58-60db5a032a8e",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "data_train = makeX(data_train) # обрабатываем обучение\n",
84 | "data_test = makeX(data_test) # обрабатываем тест"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 5,
90 | "id": "c769d199-a3fa-4feb-b9c0-6bdf1570f242",
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "y = data_train.pop('TARGET').values # целевые значения\n",
95 | "data_test = data_test[data_train.columns] # на всякий случай - вдруг, перемешаны столбцы"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 6,
101 | "id": "7eb90f9b-36f6-49df-8063-2b55c8447e58",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "import lightgbm as lgb\n",
106 | "\n",
107 | "model = lgb.LGBMClassifier(num_leaves=31,\n",
108 | " learning_rate=0.05,\n",
109 | " n_estimators=200)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 7,
115 | "id": "07a1b578-fd9c-4c51-a1cd-38127f065d5e",
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "model.fit(data_train, y)\n",
120 | "\n",
121 | "a = model.predict_proba(data_test)[:, 1] # получаем ответ"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 8,
127 | "id": "4f926992-193a-4f8d-a1c0-2b35a42c38b1",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "df = pd.DataFrame({'id_contract': data_test.id_contract.values, 'TARGET': a})\n",
132 | "df.to_csv('ans1.csv', sep=';', index=False) # сохраняем ответ"
133 | ]
134 | }
135 | ],
136 | "metadata": {
137 | "kernelspec": {
138 | "display_name": "Python 3",
139 | "language": "python",
140 | "name": "python3"
141 | },
142 | "language_info": {
143 | "codemirror_mode": {
144 | "name": "ipython",
145 | "version": 3
146 | },
147 | "file_extension": ".py",
148 | "mimetype": "text/x-python",
149 | "name": "python",
150 | "nbconvert_exporter": "python",
151 | "pygments_lexer": "ipython3",
152 | "version": "3.7.10"
153 | }
154 | },
155 | "nbformat": 4,
156 | "nbformat_minor": 5
157 | }
158 |
--------------------------------------------------------------------------------
/dj_recomgenerator.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 24,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "name = 'George'\n",
12 | "surname = 'Kudryavtsev'\n",
13 | "when = \"September, 2014\"\n",
14 | "sex = \"m\"\n",
15 | "research = \"recommender systems\"\n",
16 | "to = \"the master's program at the Skolkovo Institute of Science and Technology\"\n",
17 | "\n",
18 | "\n",
19 | "fullname = name + ' ' + surname\n",
20 | "if (sex==\"m\"):\n",
21 | " his = \"his\"\n",
22 | " he = \"he\"\n",
23 | " He = \"He\"\n",
24 | " His = \"His\"\n",
25 | "else:\n",
26 | " his = \"her\"\n",
27 | " he = \"she\"\n",
28 | " He = \"She\"\n",
29 | " His = \"Her\"\n",
30 | "\n",
31 | "comment = \"\"\n",
32 | "# comment = \"Since \" + his +\" research topic is not widespread in Russia, \" + name + \"has to study a large amount of relevant literature in English.\""
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 25,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [
42 | {
43 | "name": "stdout",
44 | "output_type": "stream",
45 | "text": [
46 | "Dear Sir or Madam,\n",
47 | "\n",
48 | "\n",
49 | "I am pleased to provide you with this online letter of recommendation in support of George Kudryavtsev. I am the supervisor of his research work at the department of Computational Mathematics and Cybernetics at Lomonosov Moscow State University.\n",
50 | "\n",
51 | "I first met George in September, 2014 when he came to learn about my scientific interests. He also spoke to my colleagues and demonstrated his deep interest in our scientific field. George had excellent academic progress and I was glad to become his research advisor. George attended lectures and seminars I gave at the department. I could positively say that George was among the top students of the group. I would like to make special mention of George’s talent to learn and structure new information. I also supervised George’s scientific research. I would like to mention his high motivation for research work and organizational abilities.\n",
52 | "\n",
53 | "His research was devoted to recommender systems. In addition George employed his good programming skills to implement many algorithms.\n",
54 | "I consider George a very promising researcher and confidently recommend him for «the master's program at the Skolkovo Institute of Science and Technology».\n",
55 | "\n",
56 | "If you require any further information, please do not hesitate to contact me: djakonov@mail.ru.\n",
57 | "\n",
58 | "D’yakonov A. G.\n",
59 | "Doctor of Sciences,\n",
60 | "Professor of Department of Mathematical Methods of Forecasting at Lomonosov Moscow State University\n"
61 | ]
62 | }
63 | ],
64 | "source": [
65 | "print (\"Dear Sir or Madam,\\n\\n\")\n",
66 | "print (\"I am pleased to provide you with this online letter of recommendation in support of \" + fullname + \\\n",
67 | " \". I am the supervisor of \" + his + \" research work at the department of Computational Mathematics and Cybernetics at Lomonosov Moscow State University.\\n\")\n",
68 | "print (\"I first met \" + name + \" in \" + when + \" when \" + he + \" came to learn about my scientific interests. \" + He + \" also spoke to my colleagues and demonstrated \" + his + \" deep interest in our scientific field. \" + \\\n",
69 | " name + \" had excellent academic progress and I was glad to become \" + his + \" research advisor. \" + name + \\\n",
70 | " \" attended lectures and seminars I gave at the department. I could positively say that \" + name +\n",
71 | " \" was among the top students of the group. I would like to make special mention of \" + name + \\\n",
72 | " \"’s talent to learn and structure new information. I also supervised \" + name + \"’s scientific research. I would like to mention \" + his + \" high motivation for research work and organizational abilities.\\n\")\n",
73 | "print (His + \" research was devoted to \" + research + '. ' + comment + ' In addition ' + name + ' employed ' + his + ' good programming skills to implement many algorithms.')\n",
74 | "print (\"I consider \" + name + \" a very promising researcher and confidently recommend \" + (\"him\" if sex == \"m\" else \"her\") + \" for «\" + to + \"».\\n\")\n",
75 | "print (\"If you require any further information, please do not hesitate to contact me: djakonov@mail.ru.\\n\")\n",
76 | "print (\"D’yakonov A. G.\")\n",
77 | "print (\"Doctor of Sciences,\")\n",
78 | "print (\"Professor of Department of Mathematical Methods of Forecasting at Lomonosov Moscow State University\")"
79 | ]
80 | }
81 | ],
82 | "metadata": {
83 | "kernelspec": {
84 | "display_name": "Python 3",
85 | "language": "python",
86 | "name": "python3"
87 | },
88 | "language_info": {
89 | "codemirror_mode": {
90 | "name": "ipython",
91 | "version": 3
92 | },
93 | "file_extension": ".py",
94 | "mimetype": "text/x-python",
95 | "name": "python",
96 | "nbconvert_exporter": "python",
97 | "pygments_lexer": "ipython3",
98 | "version": "3.5.0"
99 | }
100 | },
101 | "nbformat": 4,
102 | "nbformat_minor": 0
103 | }
104 |
--------------------------------------------------------------------------------
/dj_sas_benchmark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Тупой бенчмарк"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# подгружаем все нужные пакеты\n",
17 | "import pandas as pd\n",
18 | "import numpy as np\n",
19 | "# для встроенных картинок\n",
20 | "%pylab inline\n",
21 | "import matplotlib.pyplot as plt"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "train = pd.read_csv(\"train.csv\", encoding='cp1251')"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 9,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "test = pd.read_csv(\"test.csv\", encoding='cp1251')"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 16,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "def prepare_data(d, cf=True):\n",
49 | " print ('Time')\n",
50 | " tmp = d['Interval'].str.split('-')\n",
51 | " d['int0']=tmp.apply(lambda x: int(x[0]))\n",
52 | " d['int1']=tmp.apply(lambda x: int(x[1][:-1]))\n",
53 | " del d['Interval']\n",
54 | " \n",
55 | " print ('Date')\n",
56 | " d.OrderDate = pd.to_datetime(d.OrderDate)\n",
57 | " d.Date = pd.to_datetime(d.Date)\n",
58 | " \n",
59 | " deltaT = (d.OrderDate - d.Date).dt.days.astype(int).values\n",
60 | " d['deltaT'] = deltaT\n",
61 | " \n",
62 | " print ('Day')\n",
63 | " d['Date_day'] = d.Date.dt.day\n",
64 | " d['Date_month'] = d.Date.dt.month\n",
65 | " d['Date_weekday'] = d.Date.dt.weekday\n",
66 | " # train['Date_year'] = train.Date.dt.year\n",
67 | " \n",
68 | " d['OrderDate_day'] = d.OrderDate.dt.day\n",
69 | " d['OrderDate_month'] = d.OrderDate.dt.month\n",
70 | " d['OrderDate_weekday'] = d.OrderDate.dt.weekday\n",
71 | " # train['OrderDate_year'] = train.OrderDate.dt.year\n",
72 | " \n",
73 | " del d['Date']\n",
74 | " del d['OrderDate']\n",
75 | " \n",
76 | " print ('Type')\n",
77 | " d['DeliveryType'] = d['DeliveryType'].map({'Обычная доставка': 0, 'Доставка День в День': 1})\n",
78 | " \n",
79 | " print ('Groupby')\n",
80 | " cols = ['ChannelID', 'ClientID', 'DeliveryType', 'prepay', 'count_edit', 'int0', 'int1', 'deltaT', 'Date_day', 'Date_month', 'Date_weekday',\n",
81 | " 'OrderDate_day', 'OrderDate_month', 'OrderDate_weekday']\n",
82 | " if cf:\n",
83 | " cols = cols + ['CancelFlag']\n",
84 | " \n",
85 | " data = d.groupby('OrderID')[cols].first()\n",
86 | " \n",
87 | " print ('Num')\n",
88 | " data['num'] = d.groupby('OrderID')['GroupID'].count()\n",
89 | " data['sum'] = d.groupby('OrderID')['OrderCnt'].sum()\n",
90 | " data['num/sum'] = data['num'] / data['sum']\n",
91 | " \n",
92 | " if cf:\n",
93 | " y = data.pop('CancelFlag')\n",
94 | " return (data, y)\n",
95 | " return (data)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 23,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "Time\n",
108 | "Date\n",
109 | "Day\n",
110 | "Type\n",
111 | "Groupby\n",
112 | "Num\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "data2 = prepare_data(test, cf=False)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 24,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "Time\n",
130 | "Date\n",
131 | "Day\n",
132 | "Type\n",
133 | "Groupby\n",
134 | "Num\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "data, y = prepare_data(train, cf=True)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 29,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "[100]\tcv_agg's auc: 0.697344 + 0.00654333\n",
152 | "[200]\tcv_agg's auc: 0.702906 + 0.00615931\n",
153 | "[300]\tcv_agg's auc: 0.706133 + 0.0055678\n",
154 | "[400]\tcv_agg's auc: 0.707971 + 0.00560712\n",
155 | "[500]\tcv_agg's auc: 0.709498 + 0.00520707\n",
156 | "[600]\tcv_agg's auc: 0.710534 + 0.00546179\n",
157 | "[700]\tcv_agg's auc: 0.71134 + 0.00563445\n",
158 | "[800]\tcv_agg's auc: 0.712172 + 0.00550675\n",
159 | "[900]\tcv_agg's auc: 0.712418 + 0.00562642\n",
160 | "[1000]\tcv_agg's auc: 0.712741 + 0.00559418\n"
161 | ]
162 | },
163 | {
164 | "data": {
165 | "text/plain": [
166 | "0.7127633282147994"
167 | ]
168 | },
169 | "execution_count": 29,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "import lightgbm as lgb\n",
176 | "\n",
177 | "param = {'n_estimators':1000, 'num_leaves':6, 'objective':'binary',\n",
178 | " 'learning_rate': 0.1, 'colsample_bytree': 0.75, 'subsample': 0.75,\n",
179 | " 'metric': 'auc'}\n",
180 | " \n",
181 | "w = lgb.cv(param, lgb.Dataset(data, label=y),\n",
182 | " stratified=False,\n",
183 | " num_boost_round=1000, nfold=4, verbose_eval=100)\n",
184 | "max(w['auc-mean'])"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 33,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "978"
196 | ]
197 | },
198 | "execution_count": 33,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "np.argmax(w['auc-mean'])"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 32,
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stdout",
214 | "output_type": "stream",
215 | "text": [
216 | "[100]\tcv_agg's auc: 0.700747 + 0.0060651\n",
217 | "[200]\tcv_agg's auc: 0.705531 + 0.00576664\n",
218 | "[300]\tcv_agg's auc: 0.708824 + 0.0056635\n",
219 | "[400]\tcv_agg's auc: 0.710562 + 0.00555759\n",
220 | "[500]\tcv_agg's auc: 0.711568 + 0.0050531\n",
221 | "[600]\tcv_agg's auc: 0.712369 + 0.00498233\n",
222 | "[700]\tcv_agg's auc: 0.71297 + 0.0050313\n",
223 | "[800]\tcv_agg's auc: 0.713529 + 0.00490358\n",
224 | "[900]\tcv_agg's auc: 0.713833 + 0.0047361\n",
225 | "[1000]\tcv_agg's auc: 0.71383 + 0.00477599\n"
226 | ]
227 | },
228 | {
229 | "data": {
230 | "text/plain": [
231 | "0.7139605983784897"
232 | ]
233 | },
234 | "execution_count": 32,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "import lightgbm as lgb\n",
241 | "\n",
242 | "param = {'n_estimators':1000, 'num_leaves':8, 'objective':'binary',\n",
243 | " 'learning_rate': 0.1, 'colsample_bytree': 0.75, 'subsample': 0.75,\n",
244 | " 'metric': 'auc'}\n",
245 | " \n",
246 | "w = lgb.cv(param, lgb.Dataset(data, label=y),\n",
247 | " stratified=False,\n",
248 | " num_boost_round=1000, nfold=4, verbose_eval=100)\n",
249 | "max(w['auc-mean'])"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 36,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "model = lgb.LGBMClassifier(learning_rate=0.1, num_leaves=8,\n",
259 | " n_estimators=1000,\n",
260 | " colsample_bytree=0.75, subsample=0.75, random_state=1)\n",
261 | "model.fit(data, y)\n",
262 | "a = model.predict_proba(data2)[:,1]"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 47,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "pd.DataFrame({'ID': [str(x) + ' ' for x in data2.index], ' Score': a}).to_csv('constant.csv', index=False)"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": []
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.6.8"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 2
303 | }
304 |
--------------------------------------------------------------------------------
/dj_benchmark_GMSC_01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Бенчмарк\n",
8 | "\n",
9 | "задачи для студентов 2го курса ВМК МГУ\n",
10 | "\n",
11 | "https://www.kaggle.com/c/msu-iml-2018/\n",
12 | "\n",
13 | "2018, Александр Дьяконов https://dyakonov.org/ag/"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 21,
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "name": "stdout",
23 | "output_type": "stream",
24 | "text": [
25 | "Populating the interactive namespace from numpy and matplotlib\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import pandas as pd\n",
31 | "import numpy as np\n",
32 | "%pylab inline\n",
33 | "plt.style.use('seaborn-dark')\n",
34 | "import warnings\n",
35 | "warnings.filterwarnings(\"ignore\") # отключение варнингов\n",
36 | "pd.set_option('display.max_columns', None) # pd.options.display.max_columns = None \n",
37 | "# pd.set_option('display.max_rows', None) # не прятать столбцы при выводе дата-фреймов\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import matplotlib as mpl\n",
40 | "plt.rc('font', size=14)"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "# загружаем данные\n",
48 | "\n",
49 | "не забудьте поменять каталоги"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 23,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "train = pd.read_csv('./data_GMSC/train.csv')\n",
59 | "test = pd.read_csv('./data_GMSC/test.csv')"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 24,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "(112500, 11) (37500, 10)\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "# размеры данных\n",
77 | "print(train.shape, test.shape)"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "# смотрим на данные"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 25,
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/html": [
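95 | "<div>\n",
96 | "<style scoped>\n",
97 | "    .dataframe tbody tr th:only-of-type {\n",
98 | "        vertical-align: middle;\n",
99 | "    }\n",
100 | "\n",
101 | "    .dataframe tbody tr th {\n",
102 | "        vertical-align: top;\n",
103 | "    }\n",
104 | "\n",
105 | "    .dataframe thead th {\n",
106 | "        text-align: right;\n",
107 | "    }\n",
108 | "</style>\n",
109 | "<table border=\"1\" class=\"dataframe\">\n",
110 | "  <thead>\n",
111 | "    <tr style=\"text-align: right;\">\n",
112 | "      <th></th>\n",
113 | "      <th>плохой_клиент</th>\n",
114 | "      <th>линии</th>\n",
115 | "      <th>возраст</th>\n",
116 | "      <th>поведение_30-59_дней</th>\n",
117 | "      <th>Debt_Ratio</th>\n",
118 | "      <th>доход</th>\n",
119 | "      <th>число_кредитов</th>\n",
120 | "      <th>поведение_90_дней</th>\n",
121 | "      <th>недвижимость</th>\n",
122 | "      <th>поведение_60-89_дней</th>\n",
123 | "      <th>семья</th>\n",
124 | "    </tr>\n",
125 | "  </thead>\n",
126 | "  <tbody>\n",
127 | "    <tr>\n",
128 | "      <th>5522</th>\n",
129 | "      <td>0</td>\n",
130 | "      <td>0.111673</td>\n",
131 | "      <td>46</td>\n",
132 | "      <td>0</td>\n",
133 | "      <td>1.329588</td>\n",
134 | "      <td>800.0</td>\n",
135 | "      <td>8</td>\n",
136 | "      <td>0</td>\n",
137 | "      <td>1</td>\n",
138 | "      <td>0</td>\n",
139 | "      <td>2.0</td>\n",
140 | "    </tr>\n",
141 | "    <tr>\n",
142 | "      <th>89385</th>\n",
143 | "      <td>0</td>\n",
144 | "      <td>0.044097</td>\n",
145 | "      <td>69</td>\n",
146 | "      <td>0</td>\n",
147 | "      <td>0.535122</td>\n",
148 | "      <td>3800.0</td>\n",
149 | "      <td>10</td>\n",
150 | "      <td>0</td>\n",
151 | "      <td>1</td>\n",
152 | "      <td>0</td>\n",
153 | "      <td>0.0</td>\n",
154 | "    </tr>\n",
155 | "    <tr>\n",
156 | "      <th>81586</th>\n",
157 | "      <td>0</td>\n",
158 | "      <td>0.047598</td>\n",
159 | "      <td>77</td>\n",
160 | "      <td>0</td>\n",
161 | "      <td>0.169610</td>\n",
162 | "      <td>3000.0</td>\n",
163 | "      <td>7</td>\n",
164 | "      <td>0</td>\n",
165 | "      <td>1</td>\n",
166 | "      <td>0</td>\n",
167 | "      <td>0.0</td>\n",
168 | "    </tr>\n",
169 | "    <tr>\n",
170 | "      <th>105108</th>\n",
171 | "      <td>0</td>\n",
172 | "      <td>0.761149</td>\n",
173 | "      <td>58</td>\n",
174 | "      <td>1</td>\n",
175 | "      <td>2217.000000</td>\n",
176 | "      <td>NaN</td>\n",
177 | "      <td>4</td>\n",
178 | "      <td>0</td>\n",
179 | "      <td>1</td>\n",
180 | "      <td>0</td>\n",
181 | "      <td>0.0</td>\n",
182 | "    </tr>\n",
183 | "    <tr>\n",
184 | "      <th>3543</th>\n",
185 | "      <td>0</td>\n",
186 | "      <td>0.690684</td>\n",
187 | "      <td>55</td>\n",
188 | "      <td>0</td>\n",
189 | "      <td>0.432552</td>\n",
190 | "      <td>12416.0</td>\n",
191 | "      <td>7</td>\n",
192 | "      <td>0</td>\n",
193 | "      <td>2</td>\n",
194 | "      <td>0</td>\n",
195 | "      <td>2.0</td>\n",
196 | "    </tr>\n",
197 | "  </tbody>\n",
198 | "</table>\n",
199 | "</div>"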
200 | ],
201 | "text/plain": [
202 | " плохой_клиент линии возраст поведение_30-59_дней Debt_Ratio \\\n",
203 | "5522 0 0.111673 46 0 1.329588 \n",
204 | "89385 0 0.044097 69 0 0.535122 \n",
205 | "81586 0 0.047598 77 0 0.169610 \n",
206 | "105108 0 0.761149 58 1 2217.000000 \n",
207 | "3543 0 0.690684 55 0 0.432552 \n",
208 | "\n",
209 | " доход число_кредитов поведение_90_дней недвижимость \\\n",
210 | "5522 800.0 8 0 1 \n",
211 | "89385 3800.0 10 0 1 \n",
212 | "81586 3000.0 7 0 1 \n",
213 | "105108 NaN 4 0 1 \n",
214 | "3543 12416.0 7 0 2 \n",
215 | "\n",
216 | " поведение_60-89_дней семья \n",
217 | "5522 0 2.0 \n",
218 | "89385 0 0.0 \n",
219 | "81586 0 0.0 \n",
220 | "105108 0 0.0 \n",
221 | "3543 0 2.0 "
222 | ]
223 | },
224 | "execution_count": 25,
225 | "metadata": {},
226 | "output_type": "execute_result"
227 | }
228 | ],
229 | "source": [
230 | "train.sample(5)"
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "# если хотите работать с numpy-массивом"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 27,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "data": {
247 | "text/plain": [
248 | "array([[0.00000000e+00, 3.12582480e-02, 5.70000000e+01, 0.00000000e+00,\n",
249 | " 3.97520496e-01, 5.00000000e+03, 1.50000000e+01, 0.00000000e+00,\n",
250 | " 2.00000000e+00, 0.00000000e+00, 0.00000000e+00],\n",
251 | " [0.00000000e+00, 5.23315890e-02, 6.40000000e+01, 0.00000000e+00,\n",
252 | " 5.70000000e+01, nan, 2.00000000e+00, 0.00000000e+00,\n",
253 | " 0.00000000e+00, 0.00000000e+00, nan]])"
254 | ]
255 | },
256 | "execution_count": 27,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "X = train.values\n",
263 | "X[:2,:]"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "# готовим данные"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 28,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/plain": [
281 | "((112500, 10), (112500,))"
282 | ]
283 | },
284 | "execution_count": 28,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "y = train.pop('плохой_клиент') # целевой вектор\n",
291 | "train.shape, y.shape"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 29,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "# заменить пропуски на -11\n",
301 | "train.fillna(-11, inplace=True)\n",
302 | "test.fillna(-11, inplace=True)"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "# Обучаем модель"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 14,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "data": {
319 | "text/plain": [
320 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
321 | " max_depth=2, max_features='auto', max_leaf_nodes=None,\n",
322 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
323 | " min_samples_leaf=1, min_samples_split=2,\n",
324 | " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,\n",
325 | " oob_score=False, random_state=0, verbose=0, warm_start=False)"
326 | ]
327 | },
328 | "execution_count": 14,
329 | "metadata": {},
330 | "output_type": "execute_result"
331 | }
332 | ],
333 | "source": [
334 | "from sklearn.ensemble import RandomForestClassifier\n",
335 | "model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)\n",
336 | "model.fit(train, y)"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 19,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "a = model.predict_proba(test)[:,1] # вероятности"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 20,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "# сохранить решение\n",
355 | "pd.DataFrame({'id': np.arange(37500), 'a':a}).to_csv('./data_GMSC/solution.csv', index=False)"
356 | ]
357 | }
358 | ],
359 | "metadata": {
360 | "kernelspec": {
361 | "display_name": "Python 3",
362 | "language": "python",
363 | "name": "python3"
364 | },
365 | "language_info": {
366 | "codemirror_mode": {
367 | "name": "ipython",
368 | "version": 3
369 | },
370 | "file_extension": ".py",
371 | "mimetype": "text/x-python",
372 | "name": "python",
373 | "nbconvert_exporter": "python",
374 | "pygments_lexer": "ipython3",
375 | "version": "3.6.6"
376 | }
377 | },
378 | "nbformat": 4,
379 | "nbformat_minor": 2
380 | }
381 |
--------------------------------------------------------------------------------
/Benchmark_mmp_digital_reputation_challenge_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Бенчмарк\n",
8 | "\n",
9 | "Для задачи **Digital Reputation Challenge**\n",
10 | "\n",
11 | "https://boosters.pro\n",
12 | "\n",
13 | "автор: Александр Дьяконов https://dyakonov.org"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "import matplotlib\n",
25 | "import matplotlib.pyplot as plt\n",
26 | "%matplotlib inline"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## загрузка данных"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 4,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "(4000, 26) (462888, 2) (4000, 453) (4000, 6)\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "TRAIN_PATH = 'train/'\n",
51 | "X1 = pd.read_csv(TRAIN_PATH + 'X1.csv')\n",
52 | "X2 = pd.read_csv(TRAIN_PATH + 'X2.csv')\n",
53 | "X3 = pd.read_csv(TRAIN_PATH + 'X3.csv')\n",
54 | "Y = pd.read_csv(TRAIN_PATH + 'Y.csv')\n",
55 | "print (X1.shape, X2.shape, X3.shape, Y.shape)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 5,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "Y.columns = ['Y' + s if s != 'id' else 'id' for s in Y.columns]"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 6,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "(4058, 26) (470083, 2) (4058, 453)\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "TEST_PATH = 'test/'\n",
82 | "X1_test = pd.read_csv(TEST_PATH + 'X1.csv')\n",
83 | "X2_test = pd.read_csv(TEST_PATH + 'X2.csv')\n",
84 | "X3_test = pd.read_csv(TEST_PATH + 'X3.csv')\n",
85 | "print (X1_test.shape, X2_test.shape, X3_test.shape)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "# Подготовка обучения и теста\n",
93 | "\n",
94 | "используем только матрицу 1"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "X = X1.copy()\n",
104 | "X = X.merge(Y)\n",
105 | " \n",
106 | "id_ = X.pop('id')\n",
107 | "y1 = X.pop('Y1')\n",
108 | "y2 = X.pop('Y2')\n",
109 | "y3 = X.pop('Y3')\n",
110 | "y4 = X.pop('Y4')\n",
111 | "y5 = X.pop('Y5')\n",
112 | "\n",
113 | "X_test = X1_test.copy()\n",
114 | "id__ = X_test.pop('id')"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "### Эксперименты\n",
122 | "\n",
123 | "делаются так..."
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 9,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "name": "stderr",
133 | "output_type": "stream",
134 | "text": [
135 | "/home/alexander/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:390: UserWarning: Found `n_estimators` in params. Will use it instead of argument\n",
136 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
137 | ]
138 | },
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "[100]\tcv_agg's auc: 0.600372 + 0.0184634\n",
144 | "[200]\tcv_agg's auc: 0.597381 + 0.0157782\n",
145 | "[300]\tcv_agg's auc: 0.5931 + 0.0141725\n",
146 | "[400]\tcv_agg's auc: 0.590727 + 0.0117742\n",
147 | "[500]\tcv_agg's auc: 0.587088 + 0.0124422\n",
148 | "[600]\tcv_agg's auc: 0.586458 + 0.0104644\n",
149 | "[700]\tcv_agg's auc: 0.587516 + 0.00971511\n",
150 | "[800]\tcv_agg's auc: 0.587225 + 0.0104176\n",
151 | "[900]\tcv_agg's auc: 0.585616 + 0.0111163\n",
152 | "[1000]\tcv_agg's auc: 0.584474 + 0.0119534\n"
153 | ]
154 | },
155 | {
156 | "data": {
157 | "text/plain": [
158 | "0.6026030903311532"
159 | ]
160 | },
161 | "execution_count": 9,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "import lightgbm as lgb\n",
168 | "# 4-fold CV of a LightGBM binary classifier for target y1; AUC is reported every 100 rounds\n",
169 | "# no 'n_estimators' in param: it duplicates num_boost_round below and only triggers a warning\n",
170 | "param = {'num_leaves':6, 'objective':'binary', 'metric': 'auc',\n",
171 | " 'learning_rate': 0.05, 'colsample_bytree': 0.75, 'subsample': 0.75}\n",
172 | " \n",
173 | "w = lgb.cv(param, lgb.Dataset(X, label=y1),\n",
174 | " stratified=False,\n",
175 | " num_boost_round=1000, nfold=4, verbose_eval=100)\n",
176 | "max(w['auc-mean'])"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 10,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/plain": [
187 | "[]"
188 | ]
189 | },
190 | "execution_count": 10,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | },
194 | {
195 | "data": {
196 | "image/png": "\n",
197 | "text/plain": [
198 | ""
199 | ]
200 | },
201 | "metadata": {
202 | "needs_background": "light"
203 | },
204 | "output_type": "display_data"
205 | }
206 | ],
207 | "source": [
208 | "plt.plot(w['auc-mean'])"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "# Подготовка ответа"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 11,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "seed_preds = []\n",
225 | "for seed in range(10):\n",
226 | "    # NOTE(review): LightGBM seems to apply subsample only when subsample_freq > 0 - confirm\n",
227 | "    model1 = lgb.LGBMClassifier(n_estimators=290, learning_rate=0.01, num_leaves=6,\n",
228 | "                                colsample_bytree=0.75, subsample=0.75, random_state=seed)\n",
229 | "    model1.fit(X, y1)\n",
230 | "    # second column of predict_proba, presumably P(label = 1)\n",
231 | "    seed_preds.append(model1.predict_proba(X_test)[:, 1])\n",
232 | "# average the predictions of the 10 differently seeded models\n",
233 | "a1 = sum(seed_preds) / 10"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 12,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "seed_preds = []\n",
243 | "for seed in range(10):\n",
244 | "    model2 = lgb.LGBMClassifier(n_estimators=378, learning_rate=0.03, num_leaves=2,\n",
245 | "                                colsample_bytree=0.75, subsample=0.75, random_state=seed)\n",
246 | "    model2.fit(X, y2)\n",
247 | "    # second column of predict_proba, presumably P(label = 1)\n",
248 | "    seed_preds.append(model2.predict_proba(X_test)[:, 1])\n",
249 | "# average the predictions of the 10 differently seeded models\n",
250 | "a2 = sum(seed_preds) / 10"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 13,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "seed_preds = []\n",
260 | "for seed in range(10):\n",
261 | "    model3 = lgb.LGBMClassifier(n_estimators=543, learning_rate=0.01, num_leaves=4,\n",
262 | "                                colsample_bytree=0.75, subsample=0.75, random_state=seed)\n",
263 | "    model3.fit(X, y3)\n",
264 | "    # second column of predict_proba, presumably P(label = 1)\n",
265 | "    seed_preds.append(model3.predict_proba(X_test)[:, 1])\n",
266 | "# average the predictions of the 10 differently seeded models\n",
267 | "a3 = sum(seed_preds) / 10"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 14,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "seed_preds = []\n",
277 | "for seed in range(10):\n",
278 | "    model4 = lgb.LGBMClassifier(n_estimators=618, learning_rate=0.003, num_leaves=6,\n",
279 | "                                colsample_bytree=0.75, subsample=0.75, random_state=seed)\n",
280 | "    model4.fit(X, y4)\n",
281 | "    # second column of predict_proba, presumably P(label = 1)\n",
282 | "    seed_preds.append(model4.predict_proba(X_test)[:, 1])\n",
283 | "# average the predictions of the 10 differently seeded models\n",
284 | "a4 = sum(seed_preds) / 10"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 15,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "seed_preds = []\n",
294 | "for seed in range(10):\n",
295 | "    # NOTE(review): LightGBM seems to apply subsample only when subsample_freq > 0 - confirm\n",
296 | "    model5 = lgb.LGBMClassifier(n_estimators=516, learning_rate=0.002, num_leaves=3,\n",
297 | "                                colsample_bytree=0.75, subsample=0.75, random_state=seed)\n",
298 | "    model5.fit(X, y5)\n",
299 | "    # second column of predict_proba, presumably P(label = 1)\n",
300 | "    seed_preds.append(model5.predict_proba(X_test)[:, 1])\n",
301 | "# average the predictions of the 10 differently seeded models\n",
302 | "a5 = sum(seed_preds) / 10"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 16,
308 | "metadata": {},
309 | "outputs": [
310 | {
311 | "data": {
312 | "text/html": [
313 | "\n",
314 | "\n",
327 | "
\n",
328 | " \n",
329 | " \n",
330 | " | \n",
331 | " id | \n",
332 | " 1 | \n",
333 | " 2 | \n",
334 | " 3 | \n",
335 | " 4 | \n",
336 | " 5 | \n",
337 | "
\n",
338 | " \n",
339 | " \n",
340 | " \n",
341 | " | 0 | \n",
342 | " 0 | \n",
343 | " 0.341597 | \n",
344 | " 0.336250 | \n",
345 | " 0.253492 | \n",
346 | " 0.300679 | \n",
347 | " 0.428683 | \n",
348 | "
\n",
349 | " \n",
350 | " | 1 | \n",
351 | " 1 | \n",
352 | " 0.345812 | \n",
353 | " 0.227194 | \n",
354 | " 0.300929 | \n",
355 | " 0.282622 | \n",
356 | " 0.466563 | \n",
357 | "
\n",
358 | " \n",
359 | " | 2 | \n",
360 | " 2 | \n",
361 | " 0.437719 | \n",
362 | " 0.484228 | \n",
363 | " 0.226082 | \n",
364 | " 0.307257 | \n",
365 | " 0.444477 | \n",
366 | "
\n",
367 | " \n",
368 | " | 3 | \n",
369 | " 4 | \n",
370 | " 0.322215 | \n",
371 | " 0.274412 | \n",
372 | " 0.244045 | \n",
373 | " 0.301236 | \n",
374 | " 0.394336 | \n",
375 | "
\n",
376 | " \n",
377 | " | 4 | \n",
378 | " 7 | \n",
379 | " 0.366907 | \n",
380 | " 0.253626 | \n",
381 | " 0.435525 | \n",
382 | " 0.380312 | \n",
383 | " 0.456793 | \n",
384 | "
\n",
385 | " \n",
386 | "
\n",
387 | "
"
388 | ],
389 | "text/plain": [
390 | " id 1 2 3 4 5\n",
391 | "0 0 0.341597 0.336250 0.253492 0.300679 0.428683\n",
392 | "1 1 0.345812 0.227194 0.300929 0.282622 0.466563\n",
393 | "2 2 0.437719 0.484228 0.226082 0.307257 0.444477\n",
394 | "3 4 0.322215 0.274412 0.244045 0.301236 0.394336\n",
395 | "4 7 0.366907 0.253626 0.435525 0.380312 0.456793"
396 | ]
397 | },
398 | "execution_count": 16,
399 | "metadata": {},
400 | "output_type": "execute_result"
401 | }
402 | ],
403 | "source": [
404 | "# submission table: test ids followed by the averaged predictions\n",
405 | "# for the five targets (columns '1'..'5')\n",
406 | "submission_columns = {'id': X1_test['id'].values,\n",
407 | "                      '1': a1, '2': a2, '3': a3, '4': a4, '5': a5}\n",
408 | "df = pd.DataFrame(submission_columns)\n",
409 | "# index=False keeps the pandas row index out of the file\n",
410 | "df.to_csv('mmp_baseline_.csv', index=False)\n",
411 | "df.head()"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": null,
417 | "metadata": {},
418 | "outputs": [],
419 | "source": []
420 | }
421 | ],
422 | "metadata": {
423 | "kernelspec": {
424 | "display_name": "Python 3",
425 | "language": "python",
426 | "name": "python3"
427 | },
428 | "language_info": {
429 | "codemirror_mode": {
430 | "name": "ipython",
431 | "version": 3
432 | },
433 | "file_extension": ".py",
434 | "mimetype": "text/x-python",
435 | "name": "python",
436 | "nbconvert_exporter": "python",
437 | "pygments_lexer": "ipython3",
438 | "version": "3.7.3"
439 | }
440 | },
441 | "nbformat": 4,
442 | "nbformat_minor": 2
443 | }
444 |
--------------------------------------------------------------------------------
/dj_invest_GMSC.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# этапы решения задачи на реальном примере\n",
8 | "\n",
9 | "для курса \"Машинное обучение и анализ данных\" https://github.com/Dyakonov/MLDM/\n",
10 | " \n",
11 | "2019, Александр Дьяконов https://dyakonov.org/ag/"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "name": "stdout",
21 | "output_type": "stream",
22 | "text": [
23 | "Populating the interactive namespace from numpy and matplotlib\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "%pylab inline\n",
31 | "plt.style.use('seaborn-dark')\n",
32 | "import warnings\n",
33 | "warnings.filterwarnings(\"ignore\") # отключение варнингов\n",
34 | "pd.set_option('display.max_columns', None) # pd.options.display.max_columns = None \n",
35 | "# pd.set_option('display.max_rows', None) # не прятать столбцы при выводе дата-фреймов\n",
36 | "import matplotlib.pyplot as plt\n",
37 | "import matplotlib as mpl\n",
38 | "plt.rc('font', size=14)"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "# загрузили данные"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "train = pd.read_csv('./data_GMSC/train.csv')\n",
55 | "test = pd.read_csv('./data_GMSC/test.csv')"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "(112500, 11) (37500, 10)\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "# размеры данных\n",
73 | "print(train.shape, test.shape)"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "# посмотрели"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 5,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/html": [
91 | "\n",
92 | "\n",
105 | "
\n",
106 | " \n",
107 | " \n",
108 | " | \n",
109 | " плохой_клиент | \n",
110 | " линии | \n",
111 | " возраст | \n",
112 | " поведение_30-59_дней | \n",
113 | " Debt_Ratio | \n",
114 | " доход | \n",
115 | " число_кредитов | \n",
116 | " поведение_90_дней | \n",
117 | " недвижимость | \n",
118 | " поведение_60-89_дней | \n",
119 | " семья | \n",
120 | "
\n",
121 | " \n",
122 | " \n",
123 | " \n",
124 | " | 22126 | \n",
125 | " 0 | \n",
126 | " 0.357596 | \n",
127 | " 52 | \n",
128 | " 2 | \n",
129 | " 0.532058 | \n",
130 | " 4600.0 | \n",
131 | " 14 | \n",
132 | " 0 | \n",
133 | " 1 | \n",
134 | " 0 | \n",
135 | " 4.0 | \n",
136 | "
\n",
137 | " \n",
138 | " | 54387 | \n",
139 | " 0 | \n",
140 | " 0.056145 | \n",
141 | " 60 | \n",
142 | " 0 | \n",
143 | " 56.000000 | \n",
144 | " NaN | \n",
145 | " 4 | \n",
146 | " 0 | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 0.0 | \n",
150 | "
\n",
151 | " \n",
152 | " | 815 | \n",
153 | " 0 | \n",
154 | " 0.447224 | \n",
155 | " 45 | \n",
156 | " 0 | \n",
157 | " 0.653607 | \n",
158 | " 9009.0 | \n",
159 | " 14 | \n",
160 | " 0 | \n",
161 | " 3 | \n",
162 | " 0 | \n",
163 | " 3.0 | \n",
164 | "
\n",
165 | " \n",
166 | " | 13043 | \n",
167 | " 0 | \n",
168 | " 0.098810 | \n",
169 | " 54 | \n",
170 | " 0 | \n",
171 | " 0.203736 | \n",
172 | " 19166.0 | \n",
173 | " 15 | \n",
174 | " 0 | \n",
175 | " 2 | \n",
176 | " 0 | \n",
177 | " 4.0 | \n",
178 | "
\n",
179 | " \n",
180 | " | 75469 | \n",
181 | " 0 | \n",
182 | " 0.683554 | \n",
183 | " 34 | \n",
184 | " 0 | \n",
185 | " 0.264168 | \n",
186 | " 5416.0 | \n",
187 | " 9 | \n",
188 | " 0 | \n",
189 | " 1 | \n",
190 | " 0 | \n",
191 | " 2.0 | \n",
192 | "
\n",
193 | " \n",
194 | "
\n",
195 | "
"
196 | ],
197 | "text/plain": [
198 | " плохой_клиент линии возраст поведение_30-59_дней Debt_Ratio \\\n",
199 | "22126 0 0.357596 52 2 0.532058 \n",
200 | "54387 0 0.056145 60 0 56.000000 \n",
201 | "815 0 0.447224 45 0 0.653607 \n",
202 | "13043 0 0.098810 54 0 0.203736 \n",
203 | "75469 0 0.683554 34 0 0.264168 \n",
204 | "\n",
205 | " доход число_кредитов поведение_90_дней недвижимость \\\n",
206 | "22126 4600.0 14 0 1 \n",
207 | "54387 NaN 4 0 0 \n",
208 | "815 9009.0 14 0 3 \n",
209 | "13043 19166.0 15 0 2 \n",
210 | "75469 5416.0 9 0 1 \n",
211 | "\n",
212 | " поведение_60-89_дней семья \n",
213 | "22126 0 4.0 \n",
214 | "54387 0 0.0 \n",
215 | "815 0 3.0 \n",
216 | "13043 0 4.0 \n",
217 | "75469 0 2.0 "
218 | ]
219 | },
220 | "execution_count": 5,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "train.sample(5)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "# особенности"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "Нам повезло: нет категориальных признаков - не надо думать о кодировках\n",
241 | " \n",
242 | "Но есть пропуски: пока не будем думать о них (попробуйте придумать что-то умнее) - заменим на (-1)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 6,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/plain": [
253 | "((112500, 10), (112500,))"
254 | ]
255 | },
256 | "execution_count": 6,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "y = train.pop('плохой_клиент') # целевой вектор\n",
263 | "train.shape, y.shape"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 7,
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "# replace missing values with -1 (the same sentinel in train and test)\n",
273 | "train = train.fillna(-1)\n",
274 | "test = test.fillna(-1)"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "у нас задача бинарной классификации:"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 8,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/plain": [
292 | "array([0, 1])"
293 | ]
294 | },
295 | "execution_count": 8,
296 | "metadata": {},
297 | "output_type": "execute_result"
298 | }
299 | ],
300 | "source": [
301 | "np.unique(y)"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "перечислим подходящие алгоритмы для бинарной классификации (тут, кстати, не все алгоритмы):"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 16,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "from sklearn.linear_model import LogisticRegression\n",
318 | "from sklearn.svm import LinearSVC\n",
319 | "from sklearn.linear_model import SGDClassifier\n",
320 | "from sklearn.neighbors import KNeighborsClassifier\n",
321 | "from sklearn.ensemble import RandomForestClassifier\n",
322 | "from sklearn.ensemble import ExtraTreesClassifier\n",
323 | "from sklearn.ensemble import GradientBoostingClassifier\n",
324 | "\n",
325 | "models = {'лог_регрессия': LogisticRegression(),\n",
326 | " 'лин_svm': LinearSVC(),\n",
327 | " 'SGD': SGDClassifier(),\n",
328 | " 'knn': KNeighborsClassifier(),\n",
329 | " 'RF': RandomForestClassifier(),\n",
330 | " 'ETC': ExtraTreesClassifier(),\n",
331 | " 'GBM': GradientBoostingClassifier()} "
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "поэкспериментируем со всеми алгоритмами (параметры по умолчанию)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 22,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "name": "stdout",
348 | "output_type": "stream",
349 | "text": [
350 | "лог_регрессия auc=0.697 std=0.011\n",
351 | "лин_svm auc=0.565 std=0.029\n",
352 | "SGD auc=0.537 std=0.036\n",
353 | "knn auc=0.568 std=0.008\n",
354 | "RF auc=0.777 std=0.007\n",
355 | "ETC auc=0.778 std=0.01\n",
356 | "GBM auc=0.866 std=0.002\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "from sklearn.model_selection import ShuffleSplit, cross_val_score\n",
362 | "\n",
363 | "# five random splits, 10% of the rows held out in each; the seed fixes the splits\n",
364 | "cv = ShuffleSplit(n_splits=5, test_size=0.1, train_size=None, random_state=1)\n",
365 | "\n",
366 | "# score every candidate with ROC AUC on the same splits and print mean and std\n",
367 | "for model_name, model in models.items():\n",
368 | "    cvs = cross_val_score(model, train, y, cv=cv, scoring='roc_auc')\n",
369 | "    print (model_name, f\"auc={np.round(np.mean(cvs), 3)}\", f\"std={np.round(np.std(cvs), 3)}\")"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "некоторые алгоритмы долго обучаются\n",
377 | "\n",
378 | "совет: поймите какие и от чего это зависит!\n",
379 | "\n",
380 | "пока самый лучший алгоритм - **градиентный бустинг**\n",
381 | "\n",
382 | "здесь метрика качества - AUC ROC\n",
383 | "https://dyakonov.org/2017/07/28/auc-roc-%D0%BF%D0%BB%D0%BE%D1%89%D0%B0%D0%B4%D1%8C-%D0%BF%D0%BE%D0%B4-%D0%BA%D1%80%D0%B8%D0%B2%D0%BE%D0%B9-%D0%BE%D1%88%D0%B8%D0%B1%D0%BE%D0%BA/\n",
384 | "\n",
385 | "Метрик качества очень много! Вот некоторые из них:"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 14,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "data": {
395 | "text/plain": [
396 | "dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])"
397 | ]
398 | },
399 | "execution_count": 14,
400 | "metadata": {},
401 | "output_type": "execute_result"
402 | }
403 | ],
404 | "source": [
405 | "from sklearn.metrics import SCORERS\n",
406 | "SCORERS.keys()"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "настроим параметры бустинга"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 28,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/plain": [
424 | "RandomizedSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=0.1, train_size=None),\n",
425 | " error_score='raise-deprecating',\n",
426 | " estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
427 | " learning_rate=0.1, loss='deviance', max_depth=3,\n",
428 | " max_features=None, max_leaf_nodes=None,\n",
429 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
430 | " min_samples_leaf=1, min_sampl... subsample=1.0, tol=0.0001, validation_fraction=0.1,\n",
431 | " verbose=0, warm_start=False),\n",
432 | " fit_params=None, iid='warn', n_iter=10, n_jobs=-1,\n",
433 | " param_distributions={'learning_rate': [0.05, 0.1, 0.2], 'subsample': [0.5, 1.0], 'max_depth': [1, 2, 3, 4, 5], 'max_features': [0.5, 0.75, 1.0]},\n",
434 | " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
435 | " return_train_score='warn', scoring='roc_auc', verbose=0)"
436 | ]
437 | },
438 | "execution_count": 28,
439 | "metadata": {},
440 | "output_type": "execute_result"
441 | }
442 | ],
443 | "source": [
444 | "from sklearn.model_selection import RandomizedSearchCV\n",
445 | "# random search: 10 parameter combinations sampled from the grid below, scored by ROC AUC\n",
446 | "params = {'learning_rate': [0.05, 0.1, 0.2], 'subsample': [0.5, 1.0], 'max_depth': [1, 2, 3, 4, 5], 'max_features': [0.5, 0.75, 1.0]}\n",
447 | "\n",
448 | "model = GradientBoostingClassifier(random_state=1)\n",
449 | "# fixed seeds make the sampled candidates (and hence the results table) reproducible\n",
450 | "rs = RandomizedSearchCV(model, params, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)\n",
451 | "\n",
452 | "rs.fit(train, y)"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 30,
458 | "metadata": {},
459 | "outputs": [
460 | {
461 | "data": {
462 | "text/html": [
463 | "\n",
464 | "\n",
477 | "
\n",
478 | " \n",
479 | " \n",
480 | " | \n",
481 | " mean_fit_time | \n",
482 | " std_fit_time | \n",
483 | " mean_score_time | \n",
484 | " std_score_time | \n",
485 | " param_subsample | \n",
486 | " param_max_features | \n",
487 | " param_max_depth | \n",
488 | " param_learning_rate | \n",
489 | " params | \n",
490 | " split0_test_score | \n",
491 | " split1_test_score | \n",
492 | " split2_test_score | \n",
493 | " split3_test_score | \n",
494 | " split4_test_score | \n",
495 | " mean_test_score | \n",
496 | " std_test_score | \n",
497 | " rank_test_score | \n",
498 | " split0_train_score | \n",
499 | " split1_train_score | \n",
500 | " split2_train_score | \n",
501 | " split3_train_score | \n",
502 | " split4_train_score | \n",
503 | " mean_train_score | \n",
504 | " std_train_score | \n",
505 | "
\n",
506 | " \n",
507 | " \n",
508 | " \n",
509 | " | 0 | \n",
510 | " 6.836809 | \n",
511 | " 0.289080 | \n",
512 | " 0.018913 | \n",
513 | " 0.000148 | \n",
514 | " 1 | \n",
515 | " 0.75 | \n",
516 | " 4 | \n",
517 | " 0.05 | \n",
518 | " {'subsample': 1.0, 'max_features': 0.75, 'max_... | \n",
519 | " 0.864399 | \n",
520 | " 0.868879 | \n",
521 | " 0.862285 | \n",
522 | " 0.851924 | \n",
523 | " 0.846587 | \n",
524 | " 0.858815 | \n",
525 | " 0.008265 | \n",
526 | " 6 | \n",
527 | " 0.868508 | \n",
528 | " 0.867889 | \n",
529 | " 0.868400 | \n",
530 | " 0.869092 | \n",
531 | " 0.869901 | \n",
532 | " 0.868758 | \n",
533 | " 0.000688 | \n",
534 | "
\n",
535 | " \n",
536 | " | 1 | \n",
537 | " 6.139652 | \n",
538 | " 1.158666 | \n",
539 | " 0.020198 | \n",
540 | " 0.007867 | \n",
541 | " 1 | \n",
542 | " 0.75 | \n",
543 | " 3 | \n",
544 | " 0.05 | \n",
545 | " {'subsample': 1.0, 'max_features': 0.75, 'max_... | \n",
546 | " 0.863162 | \n",
547 | " 0.867577 | \n",
548 | " 0.860194 | \n",
549 | " 0.850598 | \n",
550 | " 0.845386 | \n",
551 | " 0.857383 | \n",
552 | " 0.008187 | \n",
553 | " 9 | \n",
554 | " 0.863949 | \n",
555 | " 0.863699 | \n",
556 | " 0.864472 | \n",
557 | " 0.865349 | \n",
558 | " 0.865811 | \n",
559 | " 0.864656 | \n",
560 | " 0.000808 | \n",
561 | "
\n",
562 | " \n",
563 | " | 2 | \n",
564 | " 7.081066 | \n",
565 | " 0.455116 | \n",
566 | " 0.019497 | \n",
567 | " 0.000924 | \n",
568 | " 0.5 | \n",
569 | " 0.5 | \n",
570 | " 4 | \n",
571 | " 0.05 | \n",
572 | " {'subsample': 0.5, 'max_features': 0.5, 'max_d... | \n",
573 | " 0.864676 | \n",
574 | " 0.869023 | \n",
575 | " 0.862186 | \n",
576 | " 0.852309 | \n",
577 | " 0.846378 | \n",
578 | " 0.858914 | \n",
579 | " 0.008329 | \n",
580 | " 5 | \n",
581 | " 0.867170 | \n",
582 | " 0.866971 | \n",
583 | " 0.867352 | \n",
584 | " 0.869077 | \n",
585 | " 0.869399 | \n",
586 | " 0.867994 | \n",
587 | " 0.001028 | \n",
588 | "
\n",
589 | " \n",
590 | " | 3 | \n",
591 | " 4.471708 | \n",
592 | " 0.441073 | \n",
593 | " 0.013420 | \n",
594 | " 0.004068 | \n",
595 | " 0.5 | \n",
596 | " 0.75 | \n",
597 | " 2 | \n",
598 | " 0.2 | \n",
599 | " {'subsample': 0.5, 'max_features': 0.75, 'max_... | \n",
600 | " 0.864668 | \n",
601 | " 0.867907 | \n",
602 | " 0.861987 | \n",
603 | " 0.850504 | \n",
604 | " 0.847244 | \n",
605 | " 0.858462 | \n",
606 | " 0.008116 | \n",
607 | " 7 | \n",
608 | " 0.864127 | \n",
609 | " 0.865011 | \n",
610 | " 0.865198 | \n",
611 | " 0.866355 | \n",
612 | " 0.866536 | \n",
613 | " 0.865445 | \n",
614 | " 0.000895 | \n",
615 | "
\n",
616 | " \n",
617 | " | 4 | \n",
618 | " 2.114495 | \n",
619 | " 0.046776 | \n",
620 | " 0.009247 | \n",
621 | " 0.000988 | \n",
622 | " 1 | \n",
623 | " 0.75 | \n",
624 | " 1 | \n",
625 | " 0.05 | \n",
626 | " {'subsample': 1.0, 'max_features': 0.75, 'max_... | \n",
627 | " 0.852736 | \n",
628 | " 0.855334 | \n",
629 | " 0.847909 | \n",
630 | " 0.840206 | \n",
631 | " 0.836336 | \n",
632 | " 0.846504 | \n",
633 | " 0.007236 | \n",
634 | " 10 | \n",
635 | " 0.851555 | \n",
636 | " 0.850605 | \n",
637 | " 0.850773 | \n",
638 | " 0.852862 | \n",
639 | " 0.853601 | \n",
640 | " 0.851879 | \n",
641 | " 0.001173 | \n",
642 | "
\n",
643 | " \n",
644 | " | 5 | \n",
645 | " 5.227625 | \n",
646 | " 0.426865 | \n",
647 | " 0.015371 | \n",
648 | " 0.000238 | \n",
649 | " 0.5 | \n",
650 | " 0.75 | \n",
651 | " 3 | \n",
652 | " 0.05 | \n",
653 | " {'subsample': 0.5, 'max_features': 0.75, 'max_... | \n",
654 | " 0.862699 | \n",
655 | " 0.868307 | \n",
656 | " 0.860850 | \n",
657 | " 0.849881 | \n",
658 | " 0.846355 | \n",
659 | " 0.857619 | \n",
660 | " 0.008213 | \n",
661 | " 8 | \n",
662 | " 0.863511 | \n",
663 | " 0.863461 | \n",
664 | " 0.864245 | \n",
665 | " 0.864770 | \n",
666 | " 0.866266 | \n",
667 | " 0.864451 | \n",
668 | " 0.001030 | \n",
669 | "
\n",
670 | " \n",
671 | " | 6 | \n",
672 | " 8.550311 | \n",
673 | " 0.636935 | \n",
674 | " 0.023514 | \n",
675 | " 0.000620 | \n",
676 | " 1 | \n",
677 | " 0.5 | \n",
678 | " 5 | \n",
679 | " 0.05 | \n",
680 | " {'subsample': 1.0, 'max_features': 0.5, 'max_d... | \n",
681 | " 0.865147 | \n",
682 | " 0.869298 | \n",
683 | " 0.862580 | \n",
684 | " 0.854252 | \n",
685 | " 0.848193 | \n",
686 | " 0.859894 | \n",
687 | " 0.007642 | \n",
688 | " 4 | \n",
689 | " 0.872683 | \n",
690 | " 0.873040 | \n",
691 | " 0.873156 | \n",
692 | " 0.874111 | \n",
693 | " 0.874701 | \n",
694 | " 0.873538 | \n",
695 | " 0.000749 | \n",
696 | "
\n",
697 | " \n",
698 | " | 7 | \n",
699 | " 5.745884 | \n",
700 | " 0.232485 | \n",
701 | " 0.017884 | \n",
702 | " 0.000216 | \n",
703 | " 0.5 | \n",
704 | " 0.5 | \n",
705 | " 4 | \n",
706 | " 0.1 | \n",
707 | " {'subsample': 0.5, 'max_features': 0.5, 'max_d... | \n",
708 | " 0.863934 | \n",
709 | " 0.869459 | \n",
710 | " 0.863171 | \n",
711 | " 0.854791 | \n",
712 | " 0.849844 | \n",
713 | " 0.860240 | \n",
714 | " 0.007000 | \n",
715 | " 3 | \n",
716 | " 0.870433 | \n",
717 | " 0.870615 | \n",
718 | " 0.872036 | \n",
719 | " 0.872633 | \n",
720 | " 0.872147 | \n",
721 | " 0.871573 | \n",
722 | " 0.000882 | \n",
723 | "
\n",
724 | " \n",
725 | " | 8 | \n",
726 | " 9.247265 | \n",
727 | " 0.295792 | \n",
728 | " 0.021539 | \n",
729 | " 0.000234 | \n",
730 | " 1 | \n",
731 | " 0.75 | \n",
732 | " 5 | \n",
733 | " 0.1 | \n",
734 | " {'subsample': 1.0, 'max_features': 0.75, 'max_... | \n",
735 | " 0.866498 | \n",
736 | " 0.869321 | \n",
737 | " 0.863294 | \n",
738 | " 0.854622 | \n",
739 | " 0.849150 | \n",
740 | " 0.860577 | \n",
741 | " 0.007549 | \n",
742 | " 2 | \n",
743 | " 0.879376 | \n",
744 | " 0.878538 | \n",
745 | " 0.879975 | \n",
746 | " 0.880051 | \n",
747 | " 0.880607 | \n",
748 | " 0.879709 | \n",
749 | " 0.000704 | \n",
750 | "
\n",
751 | " \n",
752 | " | 9 | \n",
753 | " 5.208328 | \n",
754 | " 0.149721 | \n",
755 | " 0.017666 | \n",
756 | " 0.000754 | \n",
757 | " 1 | \n",
758 | " 0.5 | \n",
759 | " 4 | \n",
760 | " 0.2 | \n",
761 | " {'subsample': 1.0, 'max_features': 0.5, 'max_d... | \n",
762 | " 0.867451 | \n",
763 | " 0.870034 | \n",
764 | " 0.864594 | \n",
765 | " 0.852394 | \n",
766 | " 0.849555 | \n",
767 | " 0.860806 | \n",
768 | " 0.008258 | \n",
769 | " 1 | \n",
770 | " 0.876857 | \n",
771 | " 0.877002 | \n",
772 | " 0.877972 | \n",
773 | " 0.877981 | \n",
774 | " 0.879370 | \n",
775 | " 0.877836 | \n",
776 | " 0.000900 | \n",
777 | "
\n",
778 | " \n",
779 | "
\n",
780 | "
"
781 | ],
782 | "text/plain": [
783 | " mean_fit_time std_fit_time mean_score_time std_score_time \\\n",
784 | "0 6.836809 0.289080 0.018913 0.000148 \n",
785 | "1 6.139652 1.158666 0.020198 0.007867 \n",
786 | "2 7.081066 0.455116 0.019497 0.000924 \n",
787 | "3 4.471708 0.441073 0.013420 0.004068 \n",
788 | "4 2.114495 0.046776 0.009247 0.000988 \n",
789 | "5 5.227625 0.426865 0.015371 0.000238 \n",
790 | "6 8.550311 0.636935 0.023514 0.000620 \n",
791 | "7 5.745884 0.232485 0.017884 0.000216 \n",
792 | "8 9.247265 0.295792 0.021539 0.000234 \n",
793 | "9 5.208328 0.149721 0.017666 0.000754 \n",
794 | "\n",
795 | " param_subsample param_max_features param_max_depth param_learning_rate \\\n",
796 | "0 1 0.75 4 0.05 \n",
797 | "1 1 0.75 3 0.05 \n",
798 | "2 0.5 0.5 4 0.05 \n",
799 | "3 0.5 0.75 2 0.2 \n",
800 | "4 1 0.75 1 0.05 \n",
801 | "5 0.5 0.75 3 0.05 \n",
802 | "6 1 0.5 5 0.05 \n",
803 | "7 0.5 0.5 4 0.1 \n",
804 | "8 1 0.75 5 0.1 \n",
805 | "9 1 0.5 4 0.2 \n",
806 | "\n",
807 | " params split0_test_score \\\n",
808 | "0 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.864399 \n",
809 | "1 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.863162 \n",
810 | "2 {'subsample': 0.5, 'max_features': 0.5, 'max_d... 0.864676 \n",
811 | "3 {'subsample': 0.5, 'max_features': 0.75, 'max_... 0.864668 \n",
812 | "4 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.852736 \n",
813 | "5 {'subsample': 0.5, 'max_features': 0.75, 'max_... 0.862699 \n",
814 | "6 {'subsample': 1.0, 'max_features': 0.5, 'max_d... 0.865147 \n",
815 | "7 {'subsample': 0.5, 'max_features': 0.5, 'max_d... 0.863934 \n",
816 | "8 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.866498 \n",
817 | "9 {'subsample': 1.0, 'max_features': 0.5, 'max_d... 0.867451 \n",
818 | "\n",
819 | " split1_test_score split2_test_score split3_test_score split4_test_score \\\n",
820 | "0 0.868879 0.862285 0.851924 0.846587 \n",
821 | "1 0.867577 0.860194 0.850598 0.845386 \n",
822 | "2 0.869023 0.862186 0.852309 0.846378 \n",
823 | "3 0.867907 0.861987 0.850504 0.847244 \n",
824 | "4 0.855334 0.847909 0.840206 0.836336 \n",
825 | "5 0.868307 0.860850 0.849881 0.846355 \n",
826 | "6 0.869298 0.862580 0.854252 0.848193 \n",
827 | "7 0.869459 0.863171 0.854791 0.849844 \n",
828 | "8 0.869321 0.863294 0.854622 0.849150 \n",
829 | "9 0.870034 0.864594 0.852394 0.849555 \n",
830 | "\n",
831 | " mean_test_score std_test_score rank_test_score split0_train_score \\\n",
832 | "0 0.858815 0.008265 6 0.868508 \n",
833 | "1 0.857383 0.008187 9 0.863949 \n",
834 | "2 0.858914 0.008329 5 0.867170 \n",
835 | "3 0.858462 0.008116 7 0.864127 \n",
836 | "4 0.846504 0.007236 10 0.851555 \n",
837 | "5 0.857619 0.008213 8 0.863511 \n",
838 | "6 0.859894 0.007642 4 0.872683 \n",
839 | "7 0.860240 0.007000 3 0.870433 \n",
840 | "8 0.860577 0.007549 2 0.879376 \n",
841 | "9 0.860806 0.008258 1 0.876857 \n",
842 | "\n",
843 | " split1_train_score split2_train_score split3_train_score \\\n",
844 | "0 0.867889 0.868400 0.869092 \n",
845 | "1 0.863699 0.864472 0.865349 \n",
846 | "2 0.866971 0.867352 0.869077 \n",
847 | "3 0.865011 0.865198 0.866355 \n",
848 | "4 0.850605 0.850773 0.852862 \n",
849 | "5 0.863461 0.864245 0.864770 \n",
850 | "6 0.873040 0.873156 0.874111 \n",
851 | "7 0.870615 0.872036 0.872633 \n",
852 | "8 0.878538 0.879975 0.880051 \n",
853 | "9 0.877002 0.877972 0.877981 \n",
854 | "\n",
855 | " split4_train_score mean_train_score std_train_score \n",
856 | "0 0.869901 0.868758 0.000688 \n",
857 | "1 0.865811 0.864656 0.000808 \n",
858 | "2 0.869399 0.867994 0.001028 \n",
859 | "3 0.866536 0.865445 0.000895 \n",
860 | "4 0.853601 0.851879 0.001173 \n",
861 | "5 0.866266 0.864451 0.001030 \n",
862 | "6 0.874701 0.873538 0.000749 \n",
863 | "7 0.872147 0.871573 0.000882 \n",
864 | "8 0.880607 0.879709 0.000704 \n",
865 | "9 0.879370 0.877836 0.000900 "
866 | ]
867 | },
868 | "execution_count": 30,
869 | "metadata": {},
870 | "output_type": "execute_result"
871 | }
872 | ],
873 | "source": [
874 | "pd.DataFrame(rs.cv_results_)"
875 | ]
876 | },
877 | {
878 | "cell_type": "markdown",
879 | "metadata": {},
880 | "source": [
881 | "лучшие параметры"
882 | ]
883 | },
884 | {
885 | "cell_type": "code",
886 | "execution_count": 33,
887 | "metadata": {},
888 | "outputs": [
889 | {
890 | "data": {
891 | "text/plain": [
892 | "{'subsample': 1.0, 'max_features': 0.5, 'max_depth': 4, 'learning_rate': 0.2}"
893 | ]
894 | },
895 | "execution_count": 33,
896 | "metadata": {},
897 | "output_type": "execute_result"
898 | }
899 | ],
900 | "source": [
901 | "rs.best_params_  # parameters of the top-ranked candidate, wherever it sits in cv_results_"
902 | ]
903 | },
904 | {
905 | "cell_type": "markdown",
906 | "metadata": {},
907 | "source": [
908 | "ожидаемое качество"
909 | ]
910 | },
911 | {
912 | "cell_type": "code",
913 | "execution_count": 36,
914 | "metadata": {},
915 | "outputs": [
916 | {
917 | "data": {
918 | "text/plain": [
919 | "0.8608056678859954"
920 | ]
921 | },
922 | "execution_count": 36,
923 | "metadata": {},
924 | "output_type": "execute_result"
925 | }
926 | ],
927 | "source": [
928 | "rs.best_score_  # mean cross-validated ROC AUC of the top-ranked candidate"
929 | ]
930 | },
931 | {
932 | "cell_type": "markdown",
933 | "metadata": {},
934 | "source": [
935 | "Кстати, хуже параметров по умолчанию;)\n",
936 | "\n",
937 | "Мало экспериментов..."
938 | ]
939 | },
940 | {
941 | "cell_type": "markdown",
942 | "metadata": {},
943 | "source": [
944 | "# советы по улучшению\n",
945 | "\n",
946 | "раз лучшим оказался градиентный бустинг => смотрим его лучшие реализации\n",
947 | "\n",
948 | "* xgboost https://en.wikipedia.org/wiki/XGBoost\n",
949 | "* lightgbm https://github.com/Microsoft/LightGBM\n",
950 | "* catboost https://tech.yandex.ru/catboost/"
951 | ]
952 | }
953 | ],
954 | "metadata": {
955 | "kernelspec": {
956 | "display_name": "Python 3",
957 | "language": "python",
958 | "name": "python3"
959 | },
960 | "language_info": {
961 | "codemirror_mode": {
962 | "name": "ipython",
963 | "version": 3
964 | },
965 | "file_extension": ".py",
966 | "mimetype": "text/x-python",
967 | "name": "python",
968 | "nbconvert_exporter": "python",
969 | "pygments_lexer": "ipython3",
970 | "version": "3.6.8"
971 | }
972 | },
973 | "nbformat": 4,
974 | "nbformat_minor": 2
975 | }
976 |
--------------------------------------------------------------------------------
/dj_Benchmark_12trip.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Бенчмарк для задачи OneTwoTrip Contest\n",
8 | "\n",
9 | "https://boosters.pro/championship/onetwotrip_challenge/overview\n",
10 | " \n",
11 | " \n",
12 | "для студентов ВМК МГУ\n",
13 | "\n",
14 | "2019, Александр Дьяконов www.dyakonov.org/ag/"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "name": "stdout",
24 | "output_type": "stream",
25 | "text": [
26 | "Populating the interactive namespace from numpy and matplotlib\n"
27 | ]
28 | }
29 | ],
30 | "source": [
31 | "# подгружаем все нужные пакеты\n",
32 | "import pandas as pd\n",
33 | "import numpy as np\n",
34 | "# для встроенных картинок\n",
35 | "%pylab inline\n",
36 | "# отключить предупреждения\n",
37 | "import warnings\n",
38 | "warnings.filterwarnings('ignore')\n",
39 | "# прогресс-бар\n",
40 | "from tqdm import tqdm, tqdm_notebook"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## загрузка данных"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "размеры: (196056, 43) (455011, 37)\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "data_train = pd.read_csv('onetwotrip_challenge_train.csv')\n",
65 | "data_test = pd.read_csv('onetwotrip_challenge_test.csv')\n",
66 | "print ('размеры:', data_train.shape, data_test.shape)"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## посмотрим на данные\n",
74 | "\n",
75 | "обратите внимание, как выводятся дата-фреймы"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 4,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " orderid | \n",
105 | " userid | \n",
106 | " field0 | \n",
107 | " field1 | \n",
108 | " field2 | \n",
109 | " field3 | \n",
110 | " field4 | \n",
111 | " field5 | \n",
112 | " field6 | \n",
113 | " field7 | \n",
114 | " ... | \n",
115 | " indicator_goal22 | \n",
116 | " indicator_goal23 | \n",
117 | " indicator_goal24 | \n",
118 | " indicator_goal25 | \n",
119 | " goal21 | \n",
120 | " goal22 | \n",
121 | " goal23 | \n",
122 | " goal24 | \n",
123 | " goal25 | \n",
124 | " goal1 | \n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " | 0 | \n",
130 | " 0 | \n",
131 | " 10d654494cbe97bbb25d51ead2600679aff9e097924add... | \n",
132 | " 0 | \n",
133 | " -0.626508 | \n",
134 | " 11 | \n",
135 | " 12 | \n",
136 | " 1 | \n",
137 | " 1 | \n",
138 | " 0 | \n",
139 | " 1 | \n",
140 | " ... | \n",
141 | " 1 | \n",
142 | " 0 | \n",
143 | " 1 | \n",
144 | " 1 | \n",
145 | " 0 | \n",
146 | " 1 | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 0 | \n",
150 | " 0 | \n",
151 | "
\n",
152 | " \n",
153 | " | 1 | \n",
154 | " 1 | \n",
155 | " 4aafc0391f72bbcf60537aece62923baf9ce644b64ac36... | \n",
156 | " 144 | \n",
157 | " -0.393794 | \n",
158 | " 5 | \n",
159 | " 7 | \n",
160 | " 2 | \n",
161 | " 0 | \n",
162 | " 0 | \n",
163 | " 2 | \n",
164 | " ... | \n",
165 | " 1 | \n",
166 | " 0 | \n",
167 | " 1 | \n",
168 | " 0 | \n",
169 | " 0 | \n",
170 | " 0 | \n",
171 | " 0 | \n",
172 | " 0 | \n",
173 | " 0 | \n",
174 | " 0 | \n",
175 | "
\n",
176 | " \n",
177 | " | 2 | \n",
178 | " 2 | \n",
179 | " bac8ffef46348f587c8d17137ab01fb24aef21547c647d... | \n",
180 | " 134 | \n",
181 | " -0.548937 | \n",
182 | " 2 | \n",
183 | " 3 | \n",
184 | " 2 | \n",
185 | " 0 | \n",
186 | " 0 | \n",
187 | " 1 | \n",
188 | " ... | \n",
189 | " 1 | \n",
190 | " 0 | \n",
191 | " 1 | \n",
192 | " 1 | \n",
193 | " 0 | \n",
194 | " 0 | \n",
195 | " 0 | \n",
196 | " 0 | \n",
197 | " 0 | \n",
198 | " 0 | \n",
199 | "
\n",
200 | " \n",
201 | " | 3 | \n",
202 | " 3 | \n",
203 | " 0392247b4b87674aba2c32bf2292b105771a6a376871be... | \n",
204 | " 0 | \n",
205 | " -0.238651 | \n",
206 | " 10 | \n",
207 | " 11 | \n",
208 | " 1 | \n",
209 | " 1 | \n",
210 | " 3 | \n",
211 | " 2 | \n",
212 | " ... | \n",
213 | " 1 | \n",
214 | " 0 | \n",
215 | " 1 | \n",
216 | " 1 | \n",
217 | " 0 | \n",
218 | " 0 | \n",
219 | " 0 | \n",
220 | " 0 | \n",
221 | " 0 | \n",
222 | " 0 | \n",
223 | "
\n",
224 | " \n",
225 | " | 4 | \n",
226 | " 4 | \n",
227 | " d1aeefef311bbeb4bd84876c8d49421f276674527d5578... | \n",
228 | " 0 | \n",
229 | " -0.704079 | \n",
230 | " 8 | \n",
231 | " 11 | \n",
232 | " 1 | \n",
233 | " 1 | \n",
234 | " 0 | \n",
235 | " 1 | \n",
236 | " ... | \n",
237 | " 1 | \n",
238 | " 0 | \n",
239 | " 0 | \n",
240 | " 1 | \n",
241 | " 0 | \n",
242 | " 0 | \n",
243 | " 0 | \n",
244 | " 0 | \n",
245 | " 0 | \n",
246 | " 0 | \n",
247 | "
\n",
248 | " \n",
249 | "
\n",
250 | "
5 rows × 43 columns
\n",
251 | "
"
252 | ],
253 | "text/plain": [
254 | " orderid userid field0 \\\n",
255 | "0 0 10d654494cbe97bbb25d51ead2600679aff9e097924add... 0 \n",
256 | "1 1 4aafc0391f72bbcf60537aece62923baf9ce644b64ac36... 144 \n",
257 | "2 2 bac8ffef46348f587c8d17137ab01fb24aef21547c647d... 134 \n",
258 | "3 3 0392247b4b87674aba2c32bf2292b105771a6a376871be... 0 \n",
259 | "4 4 d1aeefef311bbeb4bd84876c8d49421f276674527d5578... 0 \n",
260 | "\n",
261 | " field1 field2 field3 field4 field5 field6 field7 ... \\\n",
262 | "0 -0.626508 11 12 1 1 0 1 ... \n",
263 | "1 -0.393794 5 7 2 0 0 2 ... \n",
264 | "2 -0.548937 2 3 2 0 0 1 ... \n",
265 | "3 -0.238651 10 11 1 1 3 2 ... \n",
266 | "4 -0.704079 8 11 1 1 0 1 ... \n",
267 | "\n",
268 | " indicator_goal22 indicator_goal23 indicator_goal24 indicator_goal25 \\\n",
269 | "0 1 0 1 1 \n",
270 | "1 1 0 1 0 \n",
271 | "2 1 0 1 1 \n",
272 | "3 1 0 1 1 \n",
273 | "4 1 0 0 1 \n",
274 | "\n",
275 | " goal21 goal22 goal23 goal24 goal25 goal1 \n",
276 | "0 0 1 0 0 0 0 \n",
277 | "1 0 0 0 0 0 0 \n",
278 | "2 0 0 0 0 0 0 \n",
279 | "3 0 0 0 0 0 0 \n",
280 | "4 0 0 0 0 0 0 \n",
281 | "\n",
282 | "[5 rows x 43 columns]"
283 | ]
284 | },
285 | "execution_count": 4,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "data_train.head()"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 5,
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "data": {
301 | "text/html": [
302 | "\n",
303 | "\n",
316 | "
\n",
317 | " \n",
318 | " \n",
319 | " | \n",
320 | " orderid | \n",
321 | " userid | \n",
322 | " field0 | \n",
323 | " field1 | \n",
324 | " field2 | \n",
325 | " field3 | \n",
326 | " field4 | \n",
327 | " field5 | \n",
328 | " field6 | \n",
329 | " field7 | \n",
330 | " ... | \n",
331 | " field25 | \n",
332 | " field26 | \n",
333 | " field27 | \n",
334 | " field28 | \n",
335 | " field29 | \n",
336 | " indicator_goal21 | \n",
337 | " indicator_goal22 | \n",
338 | " indicator_goal23 | \n",
339 | " indicator_goal24 | \n",
340 | " indicator_goal25 | \n",
341 | "
\n",
342 | " \n",
343 | " \n",
344 | " \n",
345 | " | 0 | \n",
346 | " 0 | \n",
347 | " 3a6a6af55e097d3f92705936a7ea3ca8aef651f5966832... | \n",
348 | " 0 | \n",
349 | " -0.548937 | \n",
350 | " 10 | \n",
351 | " 10 | \n",
352 | " 1 | \n",
353 | " 1 | \n",
354 | " 0 | \n",
355 | " 1 | \n",
356 | " ... | \n",
357 | " 1 | \n",
358 | " 1 | \n",
359 | " 1 | \n",
360 | " 0 | \n",
361 | " 4 | \n",
362 | " 1 | \n",
363 | " 1 | \n",
364 | " 0 | \n",
365 | " 0 | \n",
366 | " 1 | \n",
367 | "
\n",
368 | " \n",
369 | " | 1 | \n",
370 | " 1 | \n",
371 | " 2df10f61960163da94a4294773ed9c865296e37c330304... | \n",
372 | " 82 | \n",
373 | " -0.626508 | \n",
374 | " 3 | \n",
375 | " 4 | \n",
376 | " 3 | \n",
377 | " 0 | \n",
378 | " 0 | \n",
379 | " 1 | \n",
380 | " ... | \n",
381 | " 1 | \n",
382 | " 1 | \n",
383 | " 1 | \n",
384 | " 0 | \n",
385 | " 1 | \n",
386 | " 1 | \n",
387 | " 1 | \n",
388 | " 0 | \n",
389 | " 1 | \n",
390 | " 1 | \n",
391 | "
\n",
392 | " \n",
393 | " | 2 | \n",
394 | " 2 | \n",
395 | " 20dc3fec5b5eb42fbfe08119063c3a0010a73c7ec94abb... | \n",
396 | " 0 | \n",
397 | " -0.548937 | \n",
398 | " 6 | \n",
399 | " 8 | \n",
400 | " 1 | \n",
401 | " 1 | \n",
402 | " 0 | \n",
403 | " 2 | \n",
404 | " ... | \n",
405 | " 35 | \n",
406 | " 1 | \n",
407 | " 1 | \n",
408 | " 0 | \n",
409 | " 2 | \n",
410 | " 1 | \n",
411 | " 0 | \n",
412 | " 0 | \n",
413 | " 1 | \n",
414 | " 1 | \n",
415 | "
\n",
416 | " \n",
417 | " | 3 | \n",
418 | " 3 | \n",
419 | " ed75b3496977bac207eccb59dc91fe9a8d6a27777a6422... | \n",
420 | " 6 | \n",
421 | " 0.304348 | \n",
422 | " 7 | \n",
423 | " 7 | \n",
424 | " 2 | \n",
425 | " 0 | \n",
426 | " 10 | \n",
427 | " 1 | \n",
428 | " ... | \n",
429 | " 1 | \n",
430 | " 3 | \n",
431 | " 1 | \n",
432 | " 0 | \n",
433 | " 3 | \n",
434 | " 1 | \n",
435 | " 1 | \n",
436 | " 0 | \n",
437 | " 1 | \n",
438 | " 1 | \n",
439 | "
\n",
440 | " \n",
441 | " | 4 | \n",
442 | " 4 | \n",
443 | " a346d08351c5fd0bda82984ed7c8b12b6395829da5b857... | \n",
444 | " 115 | \n",
445 | " -0.471365 | \n",
446 | " 3 | \n",
447 | " 3 | \n",
448 | " 2 | \n",
449 | " 0 | \n",
450 | " 0 | \n",
451 | " 1 | \n",
452 | " ... | \n",
453 | " 1 | \n",
454 | " 1 | \n",
455 | " 1 | \n",
456 | " 0 | \n",
457 | " 1 | \n",
458 | " 1 | \n",
459 | " 1 | \n",
460 | " 0 | \n",
461 | " 1 | \n",
462 | " 1 | \n",
463 | "
\n",
464 | " \n",
465 | "
\n",
466 | "
5 rows × 37 columns
\n",
467 | "
"
468 | ],
469 | "text/plain": [
470 | " orderid userid field0 \\\n",
471 | "0 0 3a6a6af55e097d3f92705936a7ea3ca8aef651f5966832... 0 \n",
472 | "1 1 2df10f61960163da94a4294773ed9c865296e37c330304... 82 \n",
473 | "2 2 20dc3fec5b5eb42fbfe08119063c3a0010a73c7ec94abb... 0 \n",
474 | "3 3 ed75b3496977bac207eccb59dc91fe9a8d6a27777a6422... 6 \n",
475 | "4 4 a346d08351c5fd0bda82984ed7c8b12b6395829da5b857... 115 \n",
476 | "\n",
477 | " field1 field2 field3 field4 field5 field6 field7 ... field25 \\\n",
478 | "0 -0.548937 10 10 1 1 0 1 ... 1 \n",
479 | "1 -0.626508 3 4 3 0 0 1 ... 1 \n",
480 | "2 -0.548937 6 8 1 1 0 2 ... 35 \n",
481 | "3 0.304348 7 7 2 0 10 1 ... 1 \n",
482 | "4 -0.471365 3 3 2 0 0 1 ... 1 \n",
483 | "\n",
484 | " field26 field27 field28 field29 indicator_goal21 indicator_goal22 \\\n",
485 | "0 1 1 0 4 1 1 \n",
486 | "1 1 1 0 1 1 1 \n",
487 | "2 1 1 0 2 1 0 \n",
488 | "3 3 1 0 3 1 1 \n",
489 | "4 1 1 0 1 1 1 \n",
490 | "\n",
491 | " indicator_goal23 indicator_goal24 indicator_goal25 \n",
492 | "0 0 0 1 \n",
493 | "1 0 1 1 \n",
494 | "2 0 1 1 \n",
495 | "3 0 1 1 \n",
496 | "4 0 1 1 \n",
497 | "\n",
498 | "[5 rows x 37 columns]"
499 | ]
500 | },
501 | "execution_count": 5,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "data_test.head()"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "## получаем таблички для обучения"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 6,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "# удаляем ненужные признаки\n",
524 | "ids = data_test.pop('orderid') # сохраняем id для теста\n",
525 | "data_test.drop(['userid'], inplace=True, axis=1)"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": 7,
531 | "metadata": {},
532 | "outputs": [
533 | {
534 | "data": {
535 | "text/plain": [
536 | "Index(['field0', 'field1', 'field2', 'field3', 'field4', 'field5', 'field6',\n",
537 | " 'field7', 'field8', 'field9', 'field10', 'field11', 'field12',\n",
538 | " 'field13', 'field14', 'field15', 'field16', 'field17', 'field18',\n",
539 | " 'field19', 'field20', 'field21', 'field22', 'field23', 'field24',\n",
540 | " 'field25', 'field26', 'field27', 'field28', 'field29',\n",
541 | " 'indicator_goal21', 'indicator_goal22', 'indicator_goal23',\n",
542 | " 'indicator_goal24', 'indicator_goal25'],\n",
543 | " dtype='object')"
544 | ]
545 | },
546 | "execution_count": 7,
547 | "metadata": {},
548 | "output_type": "execute_result"
549 | }
550 | ],
551 | "source": [
552 | "cols = data_test.columns # значимые колонки\n",
553 | "cols"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 8,
559 | "metadata": {},
560 | "outputs": [],
561 | "source": [
562 | "y = data_train.pop('goal1') # целевая переменная для первой задачи"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": 9,
568 | "metadata": {},
569 | "outputs": [],
570 | "source": [
571 | "y = y.values # мне так спокойней - в numpy.array"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 10,
577 | "metadata": {},
578 | "outputs": [],
579 | "source": [
580 | "data_train = data_train[cols] # оставить только нужные колонки"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 11,
586 | "metadata": {},
587 | "outputs": [
588 | {
589 | "data": {
590 | "text/html": [
591 | "\n",
592 | "\n",
605 | "
\n",
606 | " \n",
607 | " \n",
608 | " | \n",
609 | " field0 | \n",
610 | " field1 | \n",
611 | " field2 | \n",
612 | " field3 | \n",
613 | " field4 | \n",
614 | " field5 | \n",
615 | " field6 | \n",
616 | " field7 | \n",
617 | " field8 | \n",
618 | " field9 | \n",
619 | " ... | \n",
620 | " field25 | \n",
621 | " field26 | \n",
622 | " field27 | \n",
623 | " field28 | \n",
624 | " field29 | \n",
625 | " indicator_goal21 | \n",
626 | " indicator_goal22 | \n",
627 | " indicator_goal23 | \n",
628 | " indicator_goal24 | \n",
629 | " indicator_goal25 | \n",
630 | "
\n",
631 | " \n",
632 | " \n",
633 | " \n",
634 | " | 0 | \n",
635 | " 0 | \n",
636 | " -0.626508 | \n",
637 | " 11 | \n",
638 | " 12 | \n",
639 | " 1 | \n",
640 | " 1 | \n",
641 | " 0 | \n",
642 | " 1 | \n",
643 | " 1 | \n",
644 | " 0 | \n",
645 | " ... | \n",
646 | " 1 | \n",
647 | " 2 | \n",
648 | " 1 | \n",
649 | " 0 | \n",
650 | " 4 | \n",
651 | " 1 | \n",
652 | " 1 | \n",
653 | " 0 | \n",
654 | " 1 | \n",
655 | " 1 | \n",
656 | "
\n",
657 | " \n",
658 | " | 1 | \n",
659 | " 144 | \n",
660 | " -0.393794 | \n",
661 | " 5 | \n",
662 | " 7 | \n",
663 | " 2 | \n",
664 | " 0 | \n",
665 | " 0 | \n",
666 | " 2 | \n",
667 | " 1 | \n",
668 | " 0 | \n",
669 | " ... | \n",
670 | " 41 | \n",
671 | " 3 | \n",
672 | " 1 | \n",
673 | " 0 | \n",
674 | " 2 | \n",
675 | " 1 | \n",
676 | " 1 | \n",
677 | " 0 | \n",
678 | " 1 | \n",
679 | " 0 | \n",
680 | "
\n",
681 | " \n",
682 | " | 2 | \n",
683 | " 134 | \n",
684 | " -0.548937 | \n",
685 | " 2 | \n",
686 | " 3 | \n",
687 | " 2 | \n",
688 | " 0 | \n",
689 | " 0 | \n",
690 | " 1 | \n",
691 | " 1 | \n",
692 | " 0 | \n",
693 | " ... | \n",
694 | " 1 | \n",
695 | " 11 | \n",
696 | " 7 | \n",
697 | " 0 | \n",
698 | " 1 | \n",
699 | " 1 | \n",
700 | " 1 | \n",
701 | " 0 | \n",
702 | " 1 | \n",
703 | " 1 | \n",
704 | "
\n",
705 | " \n",
706 | " | 3 | \n",
707 | " 0 | \n",
708 | " -0.238651 | \n",
709 | " 10 | \n",
710 | " 11 | \n",
711 | " 1 | \n",
712 | " 1 | \n",
713 | " 3 | \n",
714 | " 2 | \n",
715 | " 1 | \n",
716 | " 0 | \n",
717 | " ... | \n",
718 | " 18 | \n",
719 | " 1 | \n",
720 | " 1 | \n",
721 | " 0 | \n",
722 | " 4 | \n",
723 | " 1 | \n",
724 | " 1 | \n",
725 | " 0 | \n",
726 | " 1 | \n",
727 | " 1 | \n",
728 | "
\n",
729 | " \n",
730 | " | 4 | \n",
731 | " 0 | \n",
732 | " -0.704079 | \n",
733 | " 8 | \n",
734 | " 11 | \n",
735 | " 1 | \n",
736 | " 1 | \n",
737 | " 0 | \n",
738 | " 1 | \n",
739 | " 1 | \n",
740 | " 0 | \n",
741 | " ... | \n",
742 | " 1 | \n",
743 | " 1 | \n",
744 | " 1 | \n",
745 | " 0 | \n",
746 | " 3 | \n",
747 | " 1 | \n",
748 | " 1 | \n",
749 | " 0 | \n",
750 | " 0 | \n",
751 | " 1 | \n",
752 | "
\n",
753 | " \n",
754 | "
\n",
755 | "
5 rows × 35 columns
\n",
756 | "
"
757 | ],
758 | "text/plain": [
759 | " field0 field1 field2 field3 field4 field5 field6 field7 field8 \\\n",
760 | "0 0 -0.626508 11 12 1 1 0 1 1 \n",
761 | "1 144 -0.393794 5 7 2 0 0 2 1 \n",
762 | "2 134 -0.548937 2 3 2 0 0 1 1 \n",
763 | "3 0 -0.238651 10 11 1 1 3 2 1 \n",
764 | "4 0 -0.704079 8 11 1 1 0 1 1 \n",
765 | "\n",
766 | " field9 ... field25 field26 field27 field28 field29 indicator_goal21 \\\n",
767 | "0 0 ... 1 2 1 0 4 1 \n",
768 | "1 0 ... 41 3 1 0 2 1 \n",
769 | "2 0 ... 1 11 7 0 1 1 \n",
770 | "3 0 ... 18 1 1 0 4 1 \n",
771 | "4 0 ... 1 1 1 0 3 1 \n",
772 | "\n",
773 | " indicator_goal22 indicator_goal23 indicator_goal24 indicator_goal25 \n",
774 | "0 1 0 1 1 \n",
775 | "1 1 0 1 0 \n",
776 | "2 1 0 1 1 \n",
777 | "3 1 0 1 1 \n",
778 | "4 1 0 0 1 \n",
779 | "\n",
780 | "[5 rows x 35 columns]"
781 | ]
782 | },
783 | "execution_count": 11,
784 | "metadata": {},
785 | "output_type": "execute_result"
786 | }
787 | ],
788 | "source": [
789 | "# что получилось\n",
790 | "data_train.head()"
791 | ]
792 | },
793 | {
794 | "cell_type": "markdown",
795 | "metadata": {},
796 | "source": [
797 | "### Эксперименты\n",
798 | "\n",
799 | "сначала делим выборку на обучение и тест\n",
800 | "\n",
801 | "не самое лучшее решение, но для быстроты экспериментов сгодится"
802 | ]
803 | },
804 | {
805 | "cell_type": "code",
806 | "execution_count": 12,
807 | "metadata": {},
808 | "outputs": [],
809 | "source": [
810 | "from sklearn.model_selection import train_test_split\n",
811 | "X_train, X_test, y_train, y_test = train_test_split(data_train, y, test_size=0.3, random_state=1)"
812 | ]
813 | },
814 | {
815 | "cell_type": "markdown",
816 | "metadata": {},
817 | "source": [
818 | "### Случайный лес\n",
819 | "\n",
820 | "\n",
821 | "добавляем в лес по одному дереву и после каждого шага вычисляем метрику качества (ROC AUC) на отложенной выборке\n",
822 | "\n",
823 | "обратите внимание на прогресс-бар"
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "execution_count": 14,
829 | "metadata": {},
830 | "outputs": [
831 | {
832 | "data": {
833 | "application/vnd.jupyter.widget-view+json": {
834 | "model_id": "f2be4ed53aa84261896b78ba8c029131",
835 | "version_major": 2,
836 | "version_minor": 0
837 | },
838 | "text/plain": [
839 | "HBox(children=(IntProgress(value=0), HTML(value='')))"
840 | ]
841 | },
842 | "metadata": {},
843 | "output_type": "display_data"
844 | },
845 | {
846 | "name": "stdout",
847 | "output_type": "stream",
848 | "text": [
849 | "\n"
850 | ]
851 | },
852 | {
853 | "data": {
854 | "text/plain": [
855 | "[]"
856 | ]
857 | },
858 | "execution_count": 14,
859 | "metadata": {},
860 | "output_type": "execute_result"
861 | },
862 | {
863 | "data": {
864 | "image/png": "\n",
865 | "text/plain": [
866 | ""
867 | ]
868 | },
869 | "metadata": {
870 | "needs_background": "light"
871 | },
872 | "output_type": "display_data"
873 | }
874 | ],
875 | "source": [
876 | "from sklearn.ensemble import RandomForestClassifier\n",
877 | "from sklearn.metrics import roc_auc_score\n",
878 | "\n",
879 | "model = RandomForestClassifier(max_features=1, n_estimators=1, oob_score=False, warm_start=True, random_state=1)\n",
880 | "\n",
881 | "aucs = []\n",
882 | "for t in tqdm_notebook(list(range(1, 101))):\n",
883 | " model.set_params(n_estimators=t)\n",
884 | " model.fit(X_train, y_train)\n",
885 | " a = model.predict_proba(X_test)[:, 1]\n",
886 | " q = roc_auc_score(y_test, a)\n",
887 | " aucs.append(q)\n",
888 | " \n",
889 | "plt.plot(range(1, 101), aucs)"
890 | ]
891 | },
892 | {
893 | "cell_type": "code",
894 | "execution_count": 46,
895 | "metadata": {},
896 | "outputs": [
897 | {
898 | "data": {
899 | "application/vnd.jupyter.widget-view+json": {
900 | "model_id": "08b2382018ab4201b3b2c5904899732f",
901 | "version_major": 2,
902 | "version_minor": 0
903 | },
904 | "text/plain": [
905 | "HBox(children=(IntProgress(value=0), HTML(value='')))"
906 | ]
907 | },
908 | "metadata": {},
909 | "output_type": "display_data"
910 | },
911 | {
912 | "data": {
913 | "text/plain": [
914 | "[]"
915 | ]
916 | },
917 | "execution_count": 46,
918 | "metadata": {},
919 | "output_type": "execute_result"
920 | },
921 | {
922 | "data": {
923 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl8VfWd//HXh6yQhSUbS4CwhF0EjYCiYB0XKhVr7aLWqdRa2mltp50uo62/Lo7t1LbTxdZxapWpVquOuOEublRckKCAbIGwhwAJhEAIZL2f3x+5tDHGcoEkJ/fe9/PxyIN7zv3eez+Hk7xz8jmbuTsiIhIfegRdgIiIdB2FvohIHFHoi4jEEYW+iEgcUeiLiMQRhb6ISBxR6IuIxBGFvohIHFHoi4jEkcSgC2grOzvbCwoKgi5DRCSqLF++fK+75xxrXLcL/YKCAoqLi4MuQ0QkqpjZtkjGqb0jIhJHFPoiInFEoS8iEkcU+iIicUShLyISRxT6IiJxRKEvIhJHut1x+iIisayusZnKmnpqG5o4VNdE9eFG9tXWs6+2gT49k7lq6pBO/XyFvohIJyvbf5gX1uzhlZIKlm6uoqE51O6404b0UeiLiESz10v3ct09xRxpbGZEThqfO3Moo/IySE9NJC0lkd49k8hKSyYrPZleyZ0fyQp9EZFO8mpJBV/683IKstK44+rTGJ6THnRJCn0RkY6wqqyanz27nrSURE7N701GahI/eXodI3PTue+6qfRLSw66REChLyIxpKq2gTc37SOzZyJnDs8iMeHkD1AMhZwd+w+zs/oIo/IyyE5P+cDzdy3ZzC+eL6Fvr2TSUxJZtHYPAKfm9+bea6fSu1fSSdfRURT6IhIVGppCbK86zNtbqnh9016WbakiKaEHg/r0pH/vVLbsrWV1+QHcW8ZnpSVz8SkDOHVwHxJ6QA8z+memctrQviR9yC+DxRsqeWLFTuoam6lrDFF9uIGS3TXUNjT/bcyQfr04ZVBveiYnYMCWvbUUb9vPRePzuPXyifTplcyBI42UVhxi/MBMUpMSuuB/J3LmR/+HuomioiLXpZVFxN1ZvKGS/319K6UVh9h14AihcFzlZqRw5ogsEszYWX2E8gNHGJDZk3MKs5lemE3FwXqeXFXOS+v2UNf4/iNlMlMTmTk6lxmF2Zw+tC/DstPYc7Ce/3hqLU+/t4t+aclkpSWTktSDjJQkRuWlM3ZAJgP69GT9roO8u72adbsP0tgUwoHEBONLM0bw2alDMLOu/48KM7Pl7l50rHHa0heRbuVQfRNvb9nH714u5d3t1Qzq05Mpw/oxuF8vBvftyeQhfRiRk37MgJ01oT+HG5qorKkn5NAcckorDvHy+j28vL6SJ1eWA9AvLZn6xmaaQs63LhjFvJnDSUlsf+t85qhj3qOk24so9M1sFvBbIAG4y91/1s6YTwM/AhxY6e5Xmdkk4A4gE2gGfuLuD3VQ7SIS5eoam3l3ezVvb6li+fb9bNxTw64DdQAM6tOTn152Cp88PZ/kxBPrzfdKTmRo1t9jbmRuOrMm9CcUckorD7F8237e2baf5pDzjfNHMSSrV4csV3d2zPaOmSUAG4ALgDJgGXClu69tNaYQ+D/gPHffb2a57l5hZqMAd/eNZjYQWA6MdffqD/s8tXdEYl9FTR2/e6mUh4p30NAUwgzG9M9kbP8MRuSmMzovg3NGZX/oFrd8UEe2d6YApe6+OfzGDwKXAmtbjfkicLu77wdw94rwvxuODnD3cjOrAHKADw19EYlNDU0hNu89xDOrdnHXki3UN4X41On5XDg+j9OH9qN3z+5zhEssiyT0BwE7Wk2XAVPbjBkFYGav09IC+pG7P9d6gJlNAZKBTW0/wMzmAfMAhgzp3FOQRaTrbNxTw4J3ylhcUsmmykM0Nrd0FmZPHMC3LxzNsOy0gCuMP5GEfnt7S9r2hBKB
QuBcIB94zcwmHG3jmNkA4M/ANe7+gYtOuPudwJ3Q0t6JuHoR6XZCIefJVeXMf30rK3dUk9DDOGtEFh8Zk8uY/hmcmt+HAoV9YCIJ/TJgcKvpfKC8nTFvuXsjsMXMSmj5JbDMzDKBp4Gb3P2tDqhZRLqp1zZW8rNn17Om/CCFuencNHssl04aRE5GyrFfLF0iktBfBhSa2TBgJ3AFcFWbMY8DVwJ/MrNsWto9m80sGXgMuNfdH+64skUkKO7OxopDvLlpH29u2se2qsPU1jdRU9fI/sON5PftyW+vmMQlEwfSo0dwx61L+44Z+u7eZGbXA8/T0q+f7+5rzOxmoNjdF4afu9DM1tJyaOZ33H2fmV0NzACyzGxu+C3nuvuKzlgYEekcNXWN/HXDXl4tqWDxhkoqauqBlsMqxw7IICM1ifSUREb1z+DTRfk66qYb0xm5IvIPVdU2cMnvlrCz+giZqYmcMyqHGYXZnDUim8H9Yv+49mihM3JF5KS5O99+eCWVNfXMn1vEjMKcDrmImQRHa09EPtTdS7bw8voKvj97LOeNyVPgxwCtQRFp14odLdeHv2h8Hp87c2jQ5UgHUXtHRN6npq6RB97ezh8WbyYvM5WfX35qoFePlI6l0BeJY3WNzfxq0QYqa+pJSexBc8h5bvVuauqbOHN4Fj+cM65b3QBETp5CXySO/WrRBu7862by+/akoSlEY3OIGaNz+NKM4UzM7xN0edIJFPoicertLVX88bXNXDV1CD+97JSgy5Euoh25InHoUH0T33p4BYP79uL7F48NuhzpQtrSF4kzjc0hbnlqLWX7j/B/XzqTtBTFQDzR2haJUXWNzbyxaS/l1XVU1NSzq/oI63YfZMPuQzQ0h/jSjOGcUdAv6DKliyn0RWJMRU0d9725jfuWbqeqtgEAM8hOT2F0XgZzpxcwMb83s8b3D7hSCYJCXyRG7DtUz+9eLuUvS7fTGApx/tiWk6oKczPISk8mSWfTCgp9kagVCjn7ahvYc7COl9dX8IfFm6gL34LwSzNH6K5U0i6FvkiUcXd++UIJf1i8mabQ36+SO2t8f7590WhG5qYHWJ10dwp9kSjz25c2cvsrm5h9ygCmDe9HTkYqI3PTGJmbEXRpEgUU+iJR5K7XNvObFzfyqdPzufXyibozlRw3hb5IN7NiRzXrdh2kf+9U+memUtfYzOryg7y7fT+PvrOT2acM4GcKfDlBCn2RbuTBt7fz/cdX0xz64B3tevdM4lOn5/OTy04hQYEvJ0ihL9INuDu/XrSB214uZeaoHH40ZzxVtfXsPlBPYoIxfmAmg/r01CWO5aQp9EUCUNfYzL8/soqNew6RnNiDhqYQa3cd5NNFLVvySQk9dMildAqFvkgXc3e+u2AVC1eWc+7oHJpDTn1TiBs/OoZ5M4Zra146lUJfpIv9etEGFq4s599njeFfzh0RdDkSZxT6Ip3kwJFGVu88wNrygyQmGP3SkimvruO2l0v5TNFgvjxzeNAlShxS6It0sBU7qrnhkVWs313T7vPTR2Zxy2UT1MaRQCj0RTrQguVlfO+x98jNSOE7F41mYn5vJgzsjQNVtfUcrGtiwsDeuviZBEahL9IBDhxp5NeLNvCnN7Zy1ogsbr/qNPqmJb9vTL820yJBiCj0zWwW8FsgAbjL3X/WzphPAz8CHFjp7leF518D3BQedou739MBdYsEJhRyDjc2c7i+ie1Vh3lo2Q6eXFVOXWOIa6cP43sXjyFRW/LSTR0z9M0sAbgduAAoA5aZ2UJ3X9tqTCFwIzDd3febWW54fj/gh0ARLb8Mlodfu7/jF0Wk8z3w9nZ+uHANDU2hv83rlZzAZZMHcdWUoZyS3zvA6kSOLZIt/SlAqbtvBjCzB4FLgbWtxnwRuP1omLt7RXj+RcAid68Kv3YRMAt4oGPKF+k6C1eW873H3uPM4VmcOzqHXsmJ9OmVxMxROWSkJgVdnkhEIgn9QcCOVtNlwNQ2Y0YBmNnr
tLSAfuTuz33Iawe1/QAzmwfMAxgyZEiktYt0mVfWV/BvD63gjIJ+zJ97BqlJCUGXJHJCImk8tndcWdurQSUChcC5wJXAXWbWJ8LX4u53unuRuxfl5OREUJJI13ln+37+5f7ljBmQwd3XFCnwJapFEvplwOBW0/lAeTtjnnD3RnffApTQ8ksgkteKdFvVhxv42l/eJTcjlXs+P0VtHIl6kYT+MqDQzIaZWTJwBbCwzZjHgY8AmFk2Le2ezcDzwIVm1tfM+gIXhueJdHvuzr8/soqKmjp+f9VkstJTgi5J5KQds6fv7k1mdj0tYZ0AzHf3NWZ2M1Ds7gv5e7ivBZqB77j7PgAz+w9afnEA3Hx0p65Id3ffW9t4fs0ebpo9lon5fYIuR6RDmPsHb9YQpKKiIi8uLg66DIlzq3ce4BN3vMH0EVncfc0ZukuVdHtmttzdi441TmeQiLSxckc1V9+9lKy0ZH75qVMV+BJTFPoirSzdvI/P3rWUjNREHpp3pvr4EnN07R2JW6UVNfzi+RLWlB8kNyOFnIwUFm+oZFCfntx/3TT6904NukSRDqfQl7hTUVPHrxdt5KFl20lLTmTm6Bz2H25gU2UtZxT04zefmaQtfIlZCn2JK5sqD3HVH9+iqraBa84q4GvnFerqlxJXFPoSN9bvPsjVdy0F4Imvns24gZkBVyTS9RT6EpPeKzvAfW9to66pmeHZ6eRmpnDrc+tJTUzg/i9OZUROetAligRCoS8xY9+hepaU7uX+t7bz9tYq0pIT6NMrmSdWtFz5Y1CfnjzwxWkMyeoVcKUiwVHoS1TbureWP7+1jdc2VrJhzyEA8vv25KbZY/n0GYPJTE3iSEMz26pqGdy3F2kp+paX+KafAIk6zSFnxY5q5i/ZwrOrd5HYowfTRmTx8cmDmDosi0mD+5DQ6oSqnskJjOmv/r0IKPSlG3N3lm/bT0VNPQePNLL3UD3vbK9m2dYqauqayEhN5MszRzD3rAJyM3VMvUgkFPrSLdU3NfPdBav+1o8/akROGh+bOIAzCvpx4fj+pKtdI3Jc9BMj3c7+2ga+9OflvL21im+eP4qLJuSRmZpE755J6smLnCT9BEngGptDLNtaRVnVEcqqj/DkynJ2Vh/htisnM+fUgUGXJxJTFPoSqHe27+d7j77H+t01AJjBsKw0/nLdVIoK+gVcnUjsUehLIA4caeRXL5Rw71vbyMtI5bdXTGLy4L70751KcqIu/irSWRT60mmamkPMf30L63fXcMHYPD4yJpfmkPOnN7Zy5183c7CukWvOLOBbF47SvWdFuohCXzrF5spDfOvhlby7vZr0lEQefWcnvZITSEnswf7DjfzTmFy+ecEoJgzqHXSpInFFoS8dan9tA/cv3cbvXyklJTGB266czMUT+rN0SxVPrSrnwJFGrjtnOKcN6Rt0qSJxSaEvHWLr3lr+Z/EmHnt3J/VNIc4fm8tPLjuFvPBJU9NHZjN9ZHbAVYqIQl9OWvXhBj71hzepqWvkE6flM/esAkb3zwi6LBFph0JfTtqPFq5hf20Dj391unr0It2cjo2Tk/LCmt08vqKc688bqcAXiQIKfTlh+2sb+N5jqxk3IJOvfmRk0OWISATU3pETcqShme8sWEX14QbuvXYKSQnafhCJBgp9OW4b99Tw1b+8w4Y9h/jBx8bpXrMiUSSizTMzm2VmJWZWamY3tPP8XDOrNLMV4a/rWj33czNbY2brzOw2M7O2r5fo8cjyMi75/RL2HWrgnmuncO3Zw4IuSUSOwzG39M0sAbgduAAoA5aZ2UJ3X9tm6EPufn2b154FTAcmhmctAWYCr55k3RKA+Uu2cPNTa5k2vB+3XTFZNy4RiUKRtHemAKXuvhnAzB4ELgXahn57HEgFkgEDkoA9J1aqBOmu1zZzy9PrmDW+P7+7arJ6+CJRKpKf3EHAjlbTZeF5bV1uZqvMbIGZDQZw9zeBV4Bd4a/n3X1d2xea2TwzKzaz4srK
yuNeCOk8oZBzx6ubuOXpdXx0ggJfJNpF8tPbXg/e20w/CRS4+0TgReAeADMbCYwF8mn5RXGemc34wJu53+nuRe5elJOTczz1Sydxd14tqeBjv1vCrc+tZ/bEAdx2pQJfJNpF0t4pAwa3ms4H3nfjUnff12ryj8Ct4ceXAW+5+yEAM3sWmAb89UQLls5XU9fIV+5/h9c27mVwv5789opJXDJxID16aB+8SLSLZLNtGVBoZsPMLBm4AljYeoCZDWg1OQc42sLZDsw0s0QzS6JlJ+4H2jvSfTQ1h/jaA+/yxqZ9/OBj43jp387l0kmDFPgiMeKYW/ru3mRm1wPPAwnAfHdfY2Y3A8XuvhD4upnNAZqAKmBu+OULgPOA92hpCT3n7k92/GJIR7nl6XW8WlLJTy87haumDgm6HBHpYObetj0frKKiIi8uLg66jLhQU9fIvW9uo2dSAsNy0ijZXcPPnl3PdWcP46aPjQu6PBE5Dma23N2LjjVOZ+TGqeaQ840HV/DS+or3zT9/bB43Xjw2oKpEpLMp9OPUf71QwkvrK7j50vHMPmUAW/bWsq+2gZmjckhQ/14kZin049DCleX896ubuHLKYP552lDMjKz0lKDLEpEuoNCPI6GQs+CdMn7wxGrOKOjLj+dMQJdCEokvCv04sXxbFT9+ci2ryg5w2pA+3HH16SQn6kQrkXij0I9xa8oPcNtLG3l+zR7yMlP4zWcmcemkgdrCF4lTCv0oV1XbwIvr9rBiRzUrtldTUVPP0KxeDMtOo/pwIy+u20NGaiLfOL+QL54znLQUrXKReKYEiFLNIecvS7fxi+dLOFjXREZqIpMG92H8wEy2Vx3mtY2V1DeF+Mb5hXx++jB690wKumQR6QYU+lGoZHcN33p4Bat3HmT6yCxu/OhYxg3I1KUSROSYFPpR6LuPrGJXdR2/v2oys08ZoP68iERMh29Emc2Vh1i5o5ovzxzBxyZqh6yIHB+FfpR5/N2d9DCYM2lg0KWISBRS6EcRd+exFTuZPjKbPN2fVkROgEI/iryzfT87qo7w8Unt3a1SROTYFPpR5NF3dtIzKYFZE/oHXYqIRCmFfpRoaArx1KpdXDg+TydYicgJU+hHiVdLKjhwpJGPT1ZrR0ROnEI/CoRCzj1vbiU7PZlzRmYHXY6IRDGFfhS49fn1vF66j3/9p0ISE7TKROTEKUG6uYeWbecPizfzz9OGcvW0oUGXIyJRTnsEuyl359WSSr7/2GrOKczmh5eM09m3InLSFPrdzNtbqnhixU5eXl/BrgN1jMxN5/bPnqa2joh0CIV+N1Hf1Mytz5Yw//Ut9EpO4JzCbL55/igumtCfzFRdFllEOoZCvxvYXHmIrz3wLmvKDzL3rAJu+OgYUpMSgi5LRGKQQj9g+2sbuPyONwC463NFnD8uL+CKRCSWKfQDdvsrpRw40sjTXz+HsQMygy5HRGJcRHsHzWyWmZWYWamZ3dDO83PNrNLMVoS/rmv13BAze8HM1pnZWjMr6Ljyo9uOqsPc++Y2Pnl6vgJfRLrEMbf0zSwBuB24ACgDlpnZQndf22boQ+5+fTtvcS/wE3dfZGbpQOhki44Vv3yhhB494JsXjAq6FBGJE5Fs6U8BSt19s7s3AA8Cl0by5mY2Dkh090UA7n7I3Q+fcLUxZPXOAzyxopxrpw9jQO+eQZcjInEiktAfBOxoNV0WntfW5Wa2yswWmNng8LxRQLWZPWpm75rZL8J/ObyPmc0zs2IzK66srDzuhYhG//nsOvr2SuLL544IuhQRiSORhH57p4F6m+kngQJ3nwi8CNwTnp8InAN8GzgDGA7M/cCbud/p7kXuXpSTkxNh6dFrTfkBXi/dx1fOHalj8EWkS0US+mXA4FbT+UB56wHuvs/d68OTfwROb/Xad8OtoSbgceC0kys5+j2yfCfJCT345On5QZciInEmktBfBhSa2TAzSwauABa2HmBmA1pNzgHWtXptXzM7uvl+HtB2
B3BcaWgK8fiKnZw/Lpe+aclBlyMiceaYR++4e5OZXQ88DyQA8919jZndDBS7+0Lg62Y2B2gCqgi3cNy92cy+DbxkLVcLW07LXwJx69WSCqpqG7SVLyKBiOjkLHd/BnimzbwftHp8I3Djh7x2ETDxJGqMKQuWl5GdnsKMwtjfdyEi3Y8u3diF9h2q5+X1FXzitEG6aqaIBELJ04WeWFFOU8i5/DS1dkQkGAr9LrRgeRkT83szun9G0KWISJxS6HeRtzbvY+2ug9qBKyKBUuh3gVDI+ekz6xjYO5VPFw0+9gtERDqJQr8LPLmqnFVlB/jWhaN1cxQRCZRCvxO4//0qFXWNzfz8uRLGDcjkssntXbJIRKTr6CYqHeyV9RV8+b7lnDs6h89PH8aqsmp2Vh/h55+cSI8e7V3GSESk6yj0O9DeQ/V8Z8FKstNTWLqliufX7MEMzh2dw/SR2UGXJyKi0O8o7s4Nj7zHwSNNLPzaVIb2S+OJFTtZtHYP35s9NujyREQAhX6HeWjZDl5ct4ebZo9lTP+WWx9eMWUIV0wZEnBlIiJ/px25HWDbvlpufmotZ43I4trpw4IuR0TkQyn0T9LRtk6CGb/81KnaWSsi3ZpC/yQ9XFzGm5v3cePFYxnYR/e6FZHuTaF/Eipq6rjl6bVMGdaPK87QmbYi0v1pR+5x2H2gjmdX72J0XgYT8nvz44VrqWsK8Z+fOEVtHRGJCgr943D7K6X8+a1t75v3nYtGMyInPaCKRESOj0L/OCwp3cv0kVnMmzGCVTuqOVTfxLwZw4MuS0QkYgr9CJXtP8yWvbVcPW0oM0flMHOUbncoItFHO3Ij9HrpXgDOKdTlFEQkein0I/Taxr3kZaZQmKv+vYhEL4V+BEIh541N+5g+MhszHaUjItFLoR+BtbsOUlXboNaOiEQ9hX4EXtvY0s+fPkKhLyLRTaEfgSWllYzOyyA3MzXoUkRETopC/xjqGptZtnU/Z6u1IyIxIKLQN7NZZlZiZqVmdkM7z881s0ozWxH+uq7N85lmttPMft9RhXeVZVuraGgKKfRFJCYc8+QsM0sAbgcuAMqAZWa20N3Xthn6kLtf/yFv8x/A4pOqNCCLSypJSjCmDusXdCkiIictki39KUCpu2929wbgQeDSSD/AzE4H8oAXTqzE4IRCztPv7WJGYQ69knXysohEv0hCfxCwo9V0WXheW5eb2SozW2BmgwHMrAfwX8B3/tEHmNk8Mys2s+LKysoIS+98xdv2s+tAHXMmDQy6FBGRDhFJ6Ld3NpK3mX4SKHD3icCLwD3h+V8BnnH3HfwD7n6nuxe5e1FOTve5ps0TK3bSMymB88fmBV2KiEiHiKRnUQa0vkNIPlDeeoC772s1+Ufg1vDjM4FzzOwrQDqQbGaH3P0DO4O7m8bmEM+8t4vzx+WRlqLWjojEhkjSbBlQaGbDgJ3AFcBVrQeY2QB33xWenAOsA3D3z7YaMxcoiobAh5bLKO8/3Milp6q1IyKx45ih7+5NZnY98DyQAMx39zVmdjNQ7O4Lga+b2RygCagC5nZizV1i4YpyevdMYoYuoSwiMSSivoW7PwM802beD1o9vhG48Rjv8SfgT8ddYQCONDTzwprdXHLqQJITdf6aiMQOJVo7Xlq/h9qGZh21IyIxR6HfjoeLy8jLTGHqsKygSxER6VAK/TY27qlh8YZKrp46lIQeuna+iMQWhX4b81/fQkpiDz47bWjQpYiIdDiFfiv7DtXz6Ds7+cRp+fRLSw66HBGRDqfQb+X+pdupbwrxhbMLgi5FRKRTKPTD6hqbuffNrXxkdA4jczOCLkdEpFMo9MMWrixn76EGrjtneNCliIh0GoU+UFXbwK9e2MDYAZmcNUKHaYpI7Ir70Hd3vv3wSqpqG/jFJydipsM0RSR2xX3o371kCy+vr+D7s8cyYVDvoMsREelUcR36K3ZUc+tz67lofB6fO1PH5YtI7Ivb0K9rbOabD60gNyOVn19+
qto6IhIX4vbuIP+zeBNb9tZy3xem0rtXUtDliIh0ibjc0t+6t5b/fnUTl5w6kLMLs4MuR0Sky8Rd6Ls7/++J1aQk9OD/zR4bdDkiIl0q7kL/6fd28drGvXz7otHkZqYGXY6ISJeKq9BvaArxk6fXMWFQJlfrKpoiEofiKvQfe7eMXQfq+O5FY3StfBGJS3ET+s0h538Wb2bCoEzO0c5bEYlTcRP6z67exZa9tXz13JE6Jl9E4lZchL6789+vbGJ4ThoXje8fdDkiIoGJi9B/dUMla3cd5F9mjqCHevkiEsfiIvTveHUTg/r05OOTBwVdiohIoGI+9HdWH+HtLVVcPW0oSQkxv7giIv9QzKfgojW7AZg1Qb18EZGIQt/MZplZiZmVmtkN7Tw/18wqzWxF+Ou68PxJZvamma0xs1Vm9pmOXoBjWbRuDyNz0xmWndbVHy0i0u0c8yqbZpYA3A5cAJQBy8xsobuvbTP0IXe/vs28w8Dn3H2jmQ0ElpvZ8+5e3RHFH8uBw428tbmKeTN031sREYhsS38KUOrum929AXgQuDSSN3f3De6+Mfy4HKgAck602OP1SkkFzSHnwnF5XfWRIiLdWiShPwjY0Wq6LDyvrcvDLZwFZja47ZNmNgVIBja189w8Mys2s+LKysoISz+2F9buJjcjhVPz+3TYe4qIRLNIQr+9A9u9zfSTQIG7TwReBO553xuYDQD+DHze3UMfeDP3O929yN2LcnI65g+BusZmFpdUcv64PB2bLyISFknolwGtt9zzgfLWA9x9n7vXhyf/CJx+9DkzywSeBm5y97dOrtzIvblpH7UNzVyg1o6IyN9EEvrLgEIzG2ZmycAVwMLWA8Jb8kfNAdaF5ycDjwH3uvvDHVNyZF5Yu4e05ATOGpHVlR8rItKtHfPoHXdvMrPrgeeBBGC+u68xs5uBYndfCHzdzOYATUAVMDf88k8DM4AsMzs6b667r+jYxXi/puYQL67bw7mjc0lJTOjMjxIRiSoR3Rjd3Z8Bnmkz7wetHt8I3NjO6+4D7jvJGo/borV7qKyp55JTB3b1R4uIdGsxeUbu3Uu2MKRfL/XzRUTaiLnQX7mjmuJt+5l7VoHujiUi0kbMhf7dS7aQnpLIp4rygy4eQfAlAAAEmUlEQVRFRKTbianQ33XgCM+8t4vPnDGYjNSkoMsREel2Yir073ljGyF35p5VEHQpIiLdUsyE/uGGJh54ezsXje/P4H69gi5HRKRbiuiQzWhQU9fE2SOzufbsgqBLERHptmIm9PMyU7n9s6cFXYaISLcWM+0dERE5NoW+iEgcUeiLiMQRhb6ISBxR6IuIxBGFvohIHFHoi4jEEYW+iEgcMfe29zgPlplVAtuO82XZwN5OKKc7i8dlhvhc7nhcZojP5T6ZZR7q7jnHGtTtQv9EmFmxuxcFXUdXisdlhvhc7nhcZojP5e6KZVZ7R0Qkjij0RUTiSKyE/p1BFxCAeFxmiM/ljsdlhvhc7k5f5pjo6YuISGRiZUtfREQiENWhb2azzKzEzErN7Iag6+ksZjbYzF4xs3VmtsbM/jU8v5+ZLTKzjeF/+wZda0czswQze9fMngpPDzOzpeFlfsjMkoOusSOZWR8zW2Bm68Pr+8w4Wc/fDH9vrzazB8wsNRbXtZnNN7MKM1vdal6769da3BbOt1Vm1iE3DIna0DezBOB24KPAOOBKMxsXbFWdpgn4lruPBaYBXw0v6w3AS+5eCLwUno41/wqsazV9K/Dr8DLvB74QSFWd57fAc+4+BjiVlmWP6fVsZoOArwNF7j4BSACuIDbX9Z+AWW3mfdj6/ShQGP6aB9zREQVEbegDU4BSd9/s7g3Ag8ClAdfUKdx9l7u/E35cQ0sQDKJlee8JD7sH+HgwFXYOM8sHZgN3hacNOA9YEB4SU8tsZpnADOBuAHdvcPdqYnw9hyUCPc0sEegF7CIG17W7/xWoajP7w9bvpcC93uItoI+ZDTjZGqI59AcB
O1pNl4XnxTQzKwAmA0uBPHffBS2/GIDc4CrrFL8BvguEwtNZQLW7N4WnY22dDwcqgf8Nt7TuMrM0Ynw9u/tO4JfAdlrC/gCwnNhe16192PrtlIyL5tC3dubF9KFIZpYOPAJ8w90PBl1PZzKzjwEV7r689ex2hsbSOk8ETgPucPfJQC0x1sppT7iHfSkwDBgIpNHS2mgrltZ1JDrl+z2aQ78MGNxqOh8oD6iWTmdmSbQE/v3u/mh49p6jf+6F/60Iqr5OMB2YY2ZbaWndnUfLln+fcAsAYm+dlwFl7r40PL2All8CsbyeAc4Htrh7pbs3Ao8CZxHb67q1D1u/nZJx0Rz6y4DC8B7+ZFp2/CwMuKZOEe5l3w2sc/dftXpqIXBN+PE1wBNdXVtncfcb3T3f3QtoWbcvu/tngVeAT4aHxdoy7wZ2mNno8Kx/AtYSw+s5bDswzcx6hb/Xjy53zK7rNj5s/S4EPhc+imcacOBoG+ikuHvUfgEXAxuATcD3g66nE5fzbFr+rFsFrAh/XUxLj/slYGP4335B19pJy38u8FT48XDgbaAUeBhICbq+Dl7WSUBxeF0/DvSNh/UM/BhYD6wG/gykxOK6Bh6gZb9FIy1b8l/4sPVLS3vn9nC+vUfL0U0nXYPOyBURiSPR3N4REZHjpNAXEYkjCn0RkTii0BcRiSMKfRGROKLQFxGJIwp9EZE4otAXEYkj/x8pJ+TfosO9XgAAAABJRU5ErkJggg==\n",
924 | "text/plain": [
925 | ""
926 | ]
927 | },
928 | "metadata": {
929 | "needs_background": "light"
930 | },
931 | "output_type": "display_data"
932 | }
933 | ],
934 | "source": [
935 | "# same experiment as before, this time with max_features=2\n",
936 | "\n",
937 | "model = RandomForestClassifier(max_features=2, n_estimators=1, oob_score=False, warm_start=True, random_state=1)\n",
938 | "\n",
939 | "aucs = []\n",
940 | "for n_trees in tqdm_notebook(list(range(1, 101))):\n",
941 | "    # warm_start=True: each refit keeps the existing trees and grows the forest to n_trees\n",
942 | "    model.set_params(n_estimators=n_trees)\n",
943 | "    model.fit(X_train, y_train)\n",
944 | "    test_proba = model.predict_proba(X_test)[:, 1]\n",
945 | "    aucs.append(roc_auc_score(y_test, test_proba))\n",
946 | "\n",
947 | "plt.plot(range(1, 101), aucs) "
948 | ]
949 | },
950 | {
951 | "cell_type": "markdown",
952 | "metadata": {},
953 | "source": [
954 | "# Козырь\n",
955 | "\n",
956 | "более сильная модель из другой библиотеки (LightGBM)"
957 | ]
958 | },
959 | {
960 | "cell_type": "code",
961 | "execution_count": 15,
962 | "metadata": {},
963 | "outputs": [],
964 | "source": [
965 | "import lightgbm as lgb\n",
966 | "model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.1, num_leaves=6)  # 300 boosted trees, at most 6 leaves each"
967 | ]
968 | },
969 | {
970 | "cell_type": "code",
971 | "execution_count": 16,
972 | "metadata": {},
973 | "outputs": [
974 | {
975 | "name": "stdout",
976 | "output_type": "stream",
977 | "text": [
978 | "[50]\tvalid_0's auc: 0.68674\tvalid_1's auc: 0.665981\n",
979 | "[100]\tvalid_0's auc: 0.71142\tvalid_1's auc: 0.675922\n",
980 | "[150]\tvalid_0's auc: 0.727589\tvalid_1's auc: 0.676426\n",
981 | "[200]\tvalid_0's auc: 0.739755\tvalid_1's auc: 0.678309\n",
982 | "[250]\tvalid_0's auc: 0.748912\tvalid_1's auc: 0.678582\n",
983 | "[300]\tvalid_0's auc: 0.757664\tvalid_1's auc: 0.678602\n",
984 | "[350]\tvalid_0's auc: 0.764987\tvalid_1's auc: 0.677316\n",
985 | "[400]\tvalid_0's auc: 0.771289\tvalid_1's auc: 0.676125\n",
986 | "[450]\tvalid_0's auc: 0.77833\tvalid_1's auc: 0.675192\n",
987 | "[500]\tvalid_0's auc: 0.784456\tvalid_1's auc: 0.674098\n"
988 | ]
989 | }
990 | ],
991 | "source": [
992 | "# valid_0 is (X_train, y_train), valid_1 is (X_test, y_test); AUC is logged every 50 rounds\n",
993 | "param = {'num_leaves': 6, 'objective': 'binary', 'learning_rate': 0.1, 'metric': 'auc'}\n",
994 | "eval_sets = [lgb.Dataset(X_train, y_train), lgb.Dataset(X_test, y_test)]\n",
995 | "\n",
996 | "q = lgb.train(param, train_set=lgb.Dataset(X_train, y_train), num_boost_round=500,\n",
997 | "              valid_sets=eval_sets, verbose_eval=50)"
998 | ]
999 | },
1000 | {
1001 | "cell_type": "markdown",
1002 | "metadata": {},
1003 | "source": [
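1004 | "тут сразу качество лучше: AUC на X_test достигает ≈0.679 примерно к 300-й итерации, дальше растёт только качество на обучении (переобучение)"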
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "markdown",
1009 | "metadata": {},
1010 | "source": [
1011 | "### Обучение и формирование ответа"
1012 | ]
1013 | },
1014 | {
1015 | "cell_type": "code",
1016 | "execution_count": 17,
1017 | "metadata": {},
1018 | "outputs": [
1019 | {
1020 | "data": {
1021 | "text/plain": [
1022 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n",
1023 | " learning_rate=0.1, max_depth=-1, min_child_samples=20,\n",
1024 | " min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,\n",
1025 | " n_jobs=-1, num_leaves=6, objective=None, random_state=None,\n",
1026 | " reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,\n",
1027 | " subsample_for_bin=200000, subsample_freq=1)"
1028 | ]
1029 | },
1030 | "execution_count": 17,
1031 | "metadata": {},
1032 | "output_type": "execute_result"
1033 | }
1034 | ],
1035 | "source": [
1036 | "model.fit(data_train, y)  # fit the LGBMClassifier created above on data_train / y"
1037 | ]
1038 | },
1039 | {
1040 | "cell_type": "code",
1041 | "execution_count": 18,
1042 | "metadata": {},
1043 | "outputs": [],
1044 | "source": [
1045 | "a = model.predict_proba(data_test)[:, 1] # probabilities of class 1"
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "code",
1050 | "execution_count": 19,
1051 | "metadata": {},
1052 | "outputs": [],
1053 | "source": [
1054 | "pd.DataFrame(a, columns=['proba'], index=ids.values).to_csv('dj1_01_.csv')  # NOTE(review): index column is written without a header label - confirm the expected submission format"
1055 | ]
1056 | }
1057 | ],
1058 | "metadata": {
1059 | "kernelspec": {
1060 | "display_name": "Python 3",
1061 | "language": "python",
1062 | "name": "python3"
1063 | },
1064 | "language_info": {
1065 | "codemirror_mode": {
1066 | "name": "ipython",
1067 | "version": 3
1068 | },
1069 | "file_extension": ".py",
1070 | "mimetype": "text/x-python",
1071 | "name": "python",
1072 | "nbconvert_exporter": "python",
1073 | "pygments_lexer": "ipython3",
1074 | "version": "3.6.7"
1075 | }
1076 | },
1077 | "nbformat": 4,
1078 | "nbformat_minor": 2
1079 | }
1080 |
--------------------------------------------------------------------------------