├── README.md ├── mkb_benchmark.ipynb ├── dj_recomgenerator.ipynb ├── dj_sas_benchmark.ipynb ├── dj_benchmark_GMSC_01.ipynb ├── Benchmark_mmp_digital_reputation_challenge_1.ipynb ├── dj_invest_GMSC.ipynb └── dj_Benchmark_12trip.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # notebooks 2 | Некоторые фрагменты кода 3 | 4 | 5 | [Бенчмарк для студентов 2 курса ВМК МГУ](dj_benchmark_GMSC_01.ipynb) для задачи https://www.kaggle.com/c/msu-iml-2018 6 | 7 | [Чуть подробнее для 4 курса ВМК МГУ](dj_invest_GMSC.ipynb) для задачи https://www.kaggle.com/c/msu-iml-2018 8 | 9 | [Бенчмарк для студентов 5 курса ВМК МГУ](Benchmark_mmp_digital_reputation_challenge_1.ipynb) для задачи https://boosters.pro/championship/digital_reputation_challenge/overview 10 | 11 | [Бенчмарк для студентов 5 курса ВМК МГУ](dj_sas_benchmark.ipynb) для задачи https://sascompetitions.ru 12 | 13 | [Бенчмарк для студентов 2 курса ВМК МГУ](dj_Benchmark_12trip.ipynb) для задачи 1 соревнования https://boosters.pro/championship/onetwotrip_challenge/overview 14 | 15 | [Бенчмарк для студентов ПЗАД](mkb_benchmark.ipynb) для задачи соревнования [Хакатон МКБ 2021](https://dsbattle.com/hackathons/mkb/) 16 | -------------------------------------------------------------------------------- /mkb_benchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3e7947ff-db80-4afb-a61d-b708c8db3ddc", 6 | "metadata": {}, 7 | "source": [ 8 | "# примитивный вариант решения задачи MKB\n", 9 | "\n", 10 | "автор: Александр Дьяконов (https://dyakonov.org/ag/)\n", 11 | "\n", 12 | "цель: для оценки студентов своего курса (нужно за неделю побить этот бенчмарк)\n", 13 | "\n", 14 | "* решение записано за 20 минут\n", 15 | "* практически нет генерации признаков\n", 16 | "* все категории кодируются по мощности\n", 17 | "* пропуски -> -1\n", 18 | "* одна модель - lgb\n", 19 | "\n", 20 
| "результат в лидерборе 0.8889 (на момент посылки ~15 место из 100)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "bc3393d8-99c9-44b2-8fd6-657cc085e38b", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "8f537e13-4136-4120-969f-1fde6a500f7e", 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "(17891, 124) (7330, 123)\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "data_train = pd.read_csv('train_dataset_hackathon_mkb.csv', encoding='cp1251', delimiter=';')\n", 50 | "data_test = pd.read_csv('test_dataset_hackathon_mkb.csv', encoding='cp1251', delimiter=';')\n", 51 | "print (data_train.shape, data_test.shape)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "378ad637-dac9-4b87-a38a-9d4dc11e1424", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def makeX(data):\n", 62 | " # предобработка данных\n", 63 | " data['CITIZENSHIP_NAME'] = data['CITIZENSHIP_NAME'].fillna(-1).map({-1: -1, 'Российская Федерация': 4, 'Таджикистан': 3, 'Казахстан': 2, 'Армения': 1})\n", 64 | " data['SEX_NAME'] = data['SEX_NAME'].fillna(0).map({0: 0, 'мужской': 1, 'женский': -1})\n", 65 | " group_names = ['OKFS_GROUP', 'OKOPF_GROUP', 'OKOGU_GROUP'] + ['WORKERSRANGE', 'OKVED_CODE']\n", 66 | " date_names = ['SIGN_DATE', 'DATEFIRSTREG', 'TAXREG_REGDATE', 'TAXREGPAY_REGDATE', 'BIRTHDATE']\n", 67 | " for name in group_names + date_names + ['id_client']:\n", 68 | " data[name] = data[name].fillna(-1)\n", 69 | " tmp = data[name].value_counts()\n", 70 | " tmp = tmp + 0.1 * np.random.randn(len(tmp))\n", 71 | " data[name] = data[name].map(tmp)\n", 72 | " data.fillna(-1, inplace=True)\n", 73 | " return data" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "id": 
"d0fecd59-bce3-4315-8c58-60db5a032a8e", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "data_train = makeX(data_train) # обрабатываем обучение\n", 84 | "data_test = makeX(data_test) # обрабатываем тест" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "id": "c769d199-a3fa-4feb-b9c0-6bdf1570f242", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "y = data_train.pop('TARGET').values # целевые значения\n", 95 | "data_test = data_test[data_train.columns] # на всякий случай - вдруг, перемешаны столбцы" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "id": "7eb90f9b-36f6-49df-8063-2b55c8447e58", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "import lightgbm as lgb\n", 106 | "\n", 107 | "model = lgb.LGBMClassifier(num_leaves=31,\n", 108 | " learning_rate=0.05,\n", 109 | " n_estimators=200)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "id": "07a1b578-fd9c-4c51-a1cd-38127f065d5e", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "model.fit(data_train, y)\n", 120 | "\n", 121 | "a = model.predict_proba(data_test)[:, 1] # получаем ответ" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 8, 127 | "id": "4f926992-193a-4f8d-a1c0-2b35a42c38b1", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "df = pd.DataFrame({'id_contract': data_test.id_contract.values, 'TARGET': a})\n", 132 | "df.to_csv('ans1.csv', sep=';', index=False) # сохраняем ответ" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": 
"ipython3", 152 | "version": "3.7.10" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } 158 | -------------------------------------------------------------------------------- /dj_recomgenerator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 24, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "name = 'George'\n", 12 | "surname = 'Kudryavtsev'\n", 13 | "when = \"September, 2014\"\n", 14 | "sex = \"m\"\n", 15 | "research = \"recommender systems\"\n", 16 | "to = \"the master's program at the Skolkovo Institute of Science and Technology\"\n", 17 | "\n", 18 | "\n", 19 | "fullname = name + ' ' + surname\n", 20 | "if (sex==\"m\"):\n", 21 | " his = \"his\"\n", 22 | " he = \"he\"\n", 23 | " He = \"He\"\n", 24 | " His = \"His\"\n", 25 | "else:\n", 26 | " his = \"her\"\n", 27 | " he = \"she\"\n", 28 | " He = \"She\"\n", 29 | " His = \"Her\"\n", 30 | "\n", 31 | "comment = \"\"\n", 32 | "# comment = \"Since \" + his +\" research topic is not widespread in Russia, \" + name + \"has to study a large amount of relevant literature in English.\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 25, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Dear Sir or Madame,\n", 47 | "\n", 48 | "\n", 49 | "I am pleased to provide you with this online letter of recommendation in support of George Kudryavtsev. I am supervisor of his research work at the department of Computational Mathematics and Cybernetics at Lomonosov Moscow State University.\n", 50 | "\n", 51 | "I first met George in September, 2014 when he came to learn about my scientific interests. He also spoke to my colleagues and demonstrated his deep interest to our scientific field. 
George had excellent academic progress and I was glad to become his research advisor. George attended lectures and seminars I gave at the department. I could positively say that George was among the top students of the group. I would like to make special mention of George talent to learn and structure new information. I also supervised George’s scientific research. I would like to mention his high motivation for research work and organizational abilities.\n", 52 | "\n", 53 | "His research was devoted to recommender systems. In addition George employed his good programming skills to implement many algorithms.\n", 54 | "I consider George as a very promising researcher and confidently recommend his for «the master's program at the Skolkovo Institute of Science and Technology».\n", 55 | "\n", 56 | "If you require any further information, please do not hesitate to contact me: djakonov@mail.ru.\n", 57 | "\n", 58 | "D’yakonov A. G.\n", 59 | "Doctor of Sciences,\n", 60 | "Professor of Department of Mathematical Methods of Forecasting at Lomonosov Moscow State University\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "print (\"Dear Sir or Madame,\\n\\n\")\n", 66 | "print (\"I am pleased to provide you with this online letter of recommendation in support of \" + fullname + \\\n", 67 | " \". I am supervisor of \" + his + \" research work at the department of Computational Mathematics and Cybernetics at Lomonosov Moscow State University.\\n\")\n", 68 | "print (\"I first met \" + name + \" in \" + when + \" when \" + he + \" came to learn about my scientific interests. \" + He + \" also spoke to my colleagues and demonstrated \" + his + \" deep interest to our scientific field. \" + \\\n", 69 | " name + \" had excellent academic progress and I was glad to become \" + his + \" research advisor. \" + name + \\\n", 70 | " \" attended lectures and seminars I gave at the department. I could positively say that \" + name +\n", 71 | " \" was among the top students of the group. 
I would like to make special mention of \" + name + \\\n", 72 | " \" talent to learn and structure new information. I also supervised \" + name + \"’s scientific research. I would like to mention \" + his + \" high motivation for research work and organizational abilities.\\n\")\n", 73 | "print (His + \" research was devoted to \" + research + '. ' + comment + ' In addition ' + name + ' employed his good programming skills to implement many algorithms.')\n", 74 | "print (\"I consider \" + name + \" as a very promising researcher and confidently recommend his for «\" + to + \"».\\n\")\n", 75 | "print (\"If you require any further information, please do not hesitate to contact me: djakonov@mail.ru.\\n\")\n", 76 | "print (\"D’yakonov A. G.\")\n", 77 | "print (\"Doctor of Sciences,\")\n", 78 | "print (\"Professor of Department of Mathematical Methods of Forecasting at Lomonosov Moscow State University\")" 79 | ] 80 | } 81 | ], 82 | "metadata": { 83 | "kernelspec": { 84 | "display_name": "Python 3", 85 | "language": "python", 86 | "name": "python3" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.5.0" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 0 103 | } 104 | -------------------------------------------------------------------------------- /dj_sas_benchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Тупой бенчмарк" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# подгружаем все нужные пакеты\n", 17 | "import pandas as pd\n", 18 | "import numpy as np\n", 19 | "# для встроенных картинок\n", 
def prepare_data(d, cf=True):
    """Collapse order-line rows into one feature row per ``OrderID``.

    The function works on a copy of ``d``; the caller's frame keeps its raw
    ``Interval`` / ``Date`` / ``OrderDate`` / ``DeliveryType`` columns, so
    the call can be repeated on the same frame.

    Features built per row before grouping:
      * ``int0`` / ``int1`` - integers before and after the ``-`` in
        ``Interval`` (the last character of the second part is dropped
        before conversion);
      * ``deltaT`` - whole days between ``OrderDate`` and ``Date``
        (``OrderDate - Date``);
      * day / month / weekday of both ``Date`` and ``OrderDate``;
      * ``DeliveryType`` mapped to 0 / 1.

    Per order, the first value of each feature column is kept, plus
      * ``num`` - count of ``GroupID`` entries,
      * ``sum`` - sum of ``OrderCnt``,
      * ``num/sum`` - their ratio.

    Parameters
    ----------
    d : pandas.DataFrame
        Raw order lines.
    cf : bool, default True
        Whether ``d`` carries the ``CancelFlag`` target column.

    Returns
    -------
    (features, target) when ``cf`` is true, otherwise just ``features``;
    both are indexed by ``OrderID``.
    """
    d = d.copy()  # keep the caller's raw frame intact (makes re-runs safe)

    print('Time')
    bounds = d['Interval'].str.split('-')
    d['int0'] = bounds.apply(lambda parts: int(parts[0]))
    # the second part ends with one non-numeric character -> strip it
    d['int1'] = bounds.apply(lambda parts: int(parts[1][:-1]))

    print('Date')
    d['OrderDate'] = pd.to_datetime(d['OrderDate'])
    d['Date'] = pd.to_datetime(d['Date'])
    d['deltaT'] = (d['OrderDate'] - d['Date']).dt.days.astype(int).values

    print('Day')
    for prefix in ('Date', 'OrderDate'):
        stamp = d[prefix].dt
        d[prefix + '_day'] = stamp.day
        d[prefix + '_month'] = stamp.month
        d[prefix + '_weekday'] = stamp.weekday

    print('Type')
    d['DeliveryType'] = d['DeliveryType'].map({'Обычная доставка': 0, 'Доставка День в День': 1})

    print('Groupby')
    cols = ['ChannelID', 'ClientID', 'DeliveryType', 'prepay', 'count_edit', 'int0', 'int1', 'deltaT',
            'Date_day', 'Date_month', 'Date_weekday',
            'OrderDate_day', 'OrderDate_month', 'OrderDate_weekday']
    if cf:
        cols = cols + ['CancelFlag']

    per_order = d.groupby('OrderID')
    data = per_order[cols].first()

    print('Num')
    data['num'] = per_order['GroupID'].count()
    data['sum'] = per_order['OrderCnt'].sum()
    data['num/sum'] = data['num'] / data['sum']

    if cf:
        y = data.pop('CancelFlag')
        return (data, y)
    return data
] 162 | }, 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "0.7127633282147994" 167 | ] 168 | }, 169 | "execution_count": 29, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "import lightgbm as lgb\n", 176 | "\n", 177 | "param = {'n_estimators':1000, 'num_leaves':6, 'objective':'binary',\n", 178 | " 'learning_rate': 0.1, 'colsample_bytree': 0.75, 'subsample': 0.75,\n", 179 | " 'metric': 'auc'}\n", 180 | " \n", 181 | "w = lgb.cv(param, lgb.Dataset(data, label=y),\n", 182 | " stratified=False,\n", 183 | " num_boost_round=1000, nfold=4, verbose_eval=100)\n", 184 | "max(w['auc-mean'])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 33, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "978" 196 | ] 197 | }, 198 | "execution_count": 33, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "np.argmax(w['auc-mean'])" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 32, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "[100]\tcv_agg's auc: 0.700747 + 0.0060651\n", 217 | "[200]\tcv_agg's auc: 0.705531 + 0.00576664\n", 218 | "[300]\tcv_agg's auc: 0.708824 + 0.0056635\n", 219 | "[400]\tcv_agg's auc: 0.710562 + 0.00555759\n", 220 | "[500]\tcv_agg's auc: 0.711568 + 0.0050531\n", 221 | "[600]\tcv_agg's auc: 0.712369 + 0.00498233\n", 222 | "[700]\tcv_agg's auc: 0.71297 + 0.0050313\n", 223 | "[800]\tcv_agg's auc: 0.713529 + 0.00490358\n", 224 | "[900]\tcv_agg's auc: 0.713833 + 0.0047361\n", 225 | "[1000]\tcv_agg's auc: 0.71383 + 0.00477599\n" 226 | ] 227 | }, 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "0.7139605983784897" 232 | ] 233 | }, 234 | "execution_count": 32, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "import lightgbm as lgb\n", 
241 | "\n", 242 | "param = {'n_estimators':1000, 'num_leaves':8, 'objective':'binary',\n", 243 | " 'learning_rate': 0.1, 'colsample_bytree': 0.75, 'subsample': 0.75,\n", 244 | " 'metric': 'auc'}\n", 245 | " \n", 246 | "w = lgb.cv(param, lgb.Dataset(data, label=y),\n", 247 | " stratified=False,\n", 248 | " num_boost_round=1000, nfold=4, verbose_eval=100)\n", 249 | "max(w['auc-mean'])" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 36, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "model = lgb.LGBMClassifier(learning_rate=0.1, num_leaves=8,\n", 259 | " n_estimators=1000,\n", 260 | " colsample_bytree=0.75, subsample=0.75, random_state=1)\n", 261 | "model.fit(data, y)\n", 262 | "a = model.predict_proba(data2)[:,1]" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 47, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "pd.DataFrame({'ID': [str(x) + ' ' for x in data2.index], ' Score': a}).to_csv('constant.csv', index=False)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.6.8" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /dj_benchmark_GMSC_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 
Бенчмарк\n", 8 | "\n", 9 | "задачи для студентов 2го курса ВМК МГУ\n", 10 | "\n", 11 | "https://www.kaggle.com/c/msu-iml-2018/\n", 12 | "\n", 13 | "2018, Александр Дьяконов https://dyakonov.org/ag/" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 21, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Populating the interactive namespace from numpy and matplotlib\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "%pylab inline\n", 33 | "plt.style.use('seaborn-dark')\n", 34 | "import warnings\n", 35 | "warnings.filterwarnings(\"ignore\") # отключение варнингов\n", 36 | "pd.set_option('display.max_columns', None) # pd.options.display.max_columns = None \n", 37 | "# pd.set_option('display.max_rows', None) # не прятать столбцы при выводе дата-фреймов\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "import matplotlib as mpl\n", 40 | "plt.rc('font', size=14)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "# загружаем данные\n", 48 | "\n", 49 | "не забудьте поменять каталоги" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 23, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "train = pd.read_csv('./data_GMSC/train.csv')\n", 59 | "test = pd.read_csv('./data_GMSC/test.csv')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 24, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "(112500, 11) (37500, 10)\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "# размеры данных\n", 77 | "print(train.shape, test.shape)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# смотрим на данные" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 25, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | 
"data": { 94 | "text/html": [ 95 | "
\n", 96 | "\n", 109 | "\n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | "
плохой_клиентлиниивозрастповедение_30-59_днейDebt_Ratioдоходчисло_кредитовповедение_90_днейнедвижимостьповедение_60-89_днейсемья
552200.1116734601.329588800.080102.0
8938500.0440976900.5351223800.0100100.0
8158600.0475987700.1696103000.070100.0
10510800.7611495812217.000000NaN40100.0
354300.6906845500.43255212416.070202.0
\n", 199 | "
" 200 | ], 201 | "text/plain": [ 202 | " плохой_клиент линии возраст поведение_30-59_дней Debt_Ratio \\\n", 203 | "5522 0 0.111673 46 0 1.329588 \n", 204 | "89385 0 0.044097 69 0 0.535122 \n", 205 | "81586 0 0.047598 77 0 0.169610 \n", 206 | "105108 0 0.761149 58 1 2217.000000 \n", 207 | "3543 0 0.690684 55 0 0.432552 \n", 208 | "\n", 209 | " доход число_кредитов поведение_90_дней недвижимость \\\n", 210 | "5522 800.0 8 0 1 \n", 211 | "89385 3800.0 10 0 1 \n", 212 | "81586 3000.0 7 0 1 \n", 213 | "105108 NaN 4 0 1 \n", 214 | "3543 12416.0 7 0 2 \n", 215 | "\n", 216 | " поведение_60-89_дней семья \n", 217 | "5522 0 2.0 \n", 218 | "89385 0 0.0 \n", 219 | "81586 0 0.0 \n", 220 | "105108 0 0.0 \n", 221 | "3543 0 2.0 " 222 | ] 223 | }, 224 | "execution_count": 25, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "train.sample(5)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "# если хотите работать с numpy-массивом" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 27, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "array([[0.00000000e+00, 3.12582480e-02, 5.70000000e+01, 0.00000000e+00,\n", 249 | " 3.97520496e-01, 5.00000000e+03, 1.50000000e+01, 0.00000000e+00,\n", 250 | " 2.00000000e+00, 0.00000000e+00, 0.00000000e+00],\n", 251 | " [0.00000000e+00, 5.23315890e-02, 6.40000000e+01, 0.00000000e+00,\n", 252 | " 5.70000000e+01, nan, 2.00000000e+00, 0.00000000e+00,\n", 253 | " 0.00000000e+00, 0.00000000e+00, nan]])" 254 | ] 255 | }, 256 | "execution_count": 27, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "X = train.values\n", 263 | "X[:2,:]" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "# готовим данные" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 28, 
from sklearn.ensemble import RandomForestClassifier

# --- prepare the data -------------------------------------------------------
# `train` / `test` are the frames loaded earlier in the notebook.
y = train.pop('плохой_клиент')  # target vector (removed from the feature frame)
train.shape, y.shape

# replace missing values with the sentinel -11
train.fillna(-11, inplace=True)
test.fillna(-11, inplace=True)

# --- fit the model ----------------------------------------------------------
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model.fit(train, y)

a = model.predict_proba(test)[:, 1]  # probability of the positive class

# --- save the submission ----------------------------------------------------
# ids are the row positions of the test set; derive their number from the
# predictions instead of hard-coding the size of one particular test file
pd.DataFrame({'id': np.arange(len(a)), 'a': a}).to_csv('./data_GMSC/solution.csv', index=False)
'X3.csv')\n", 54 | "Y = pd.read_csv(TRAIN_PATH + 'Y.csv')\n", 55 | "print (X1.shape, X2.shape, X3.shape, Y.shape)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "Y.columns = ['Y' + s if s != 'id' else 'id' for s in Y.columns]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 6, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "(4058, 26) (470083, 2) (4058, 453)\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "TEST_PATH = 'test/'\n", 82 | "X1_test = pd.read_csv(TEST_PATH + 'X1.csv')\n", 83 | "X2_test = pd.read_csv(TEST_PATH + 'X2.csv')\n", 84 | "X3_test = pd.read_csv(TEST_PATH + 'X3.csv')\n", 85 | "print (X1_test.shape, X2_test.shape, X3_test.shape)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "# Подготовка обучения и теста\n", 93 | "\n", 94 | "используем только матрицу 1" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "X = X1.copy()\n", 104 | "X = X.merge(Y)\n", 105 | " \n", 106 | "id_ = X.pop('id')\n", 107 | "y1 = X.pop('Y1')\n", 108 | "y2 = X.pop('Y2')\n", 109 | "y3 = X.pop('Y3')\n", 110 | "y4 = X.pop('Y4')\n", 111 | "y5 = X.pop('Y5')\n", 112 | "\n", 113 | "X_test = X1_test.copy()\n", 114 | "id__ = X_test.pop('id')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Эксперименты\n", 122 | "\n", 123 | "делаются так..." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 9, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stderr", 133 | "output_type": "stream", 134 | "text": [ 135 | "/home/alexander/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:390: UserWarning: Found `n_estimators` in params. 
Will use it instead of argument\n", 136 | " warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n" 137 | ] 138 | }, 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "[100]\tcv_agg's auc: 0.600372 + 0.0184634\n", 144 | "[200]\tcv_agg's auc: 0.597381 + 0.0157782\n", 145 | "[300]\tcv_agg's auc: 0.5931 + 0.0141725\n", 146 | "[400]\tcv_agg's auc: 0.590727 + 0.0117742\n", 147 | "[500]\tcv_agg's auc: 0.587088 + 0.0124422\n", 148 | "[600]\tcv_agg's auc: 0.586458 + 0.0104644\n", 149 | "[700]\tcv_agg's auc: 0.587516 + 0.00971511\n", 150 | "[800]\tcv_agg's auc: 0.587225 + 0.0104176\n", 151 | "[900]\tcv_agg's auc: 0.585616 + 0.0111163\n", 152 | "[1000]\tcv_agg's auc: 0.584474 + 0.0119534\n" 153 | ] 154 | }, 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "0.6026030903311532" 159 | ] 160 | }, 161 | "execution_count": 9, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "import lightgbm as lgb\n", 168 | "\n", 169 | "param = {'n_estimators':1000, 'num_leaves':6, 'objective':'binary',\n", 170 | " 'learning_rate': 0.05, 'colsample_bytree': 0.75, 'subsample': 0.75,\n", 171 | " 'metric': 'auc'}\n", 172 | " \n", 173 | "w = lgb.cv(param, lgb.Dataset(X, label=y1),\n", 174 | " stratified=False,\n", 175 | " num_boost_round=1000, nfold=4, verbose_eval=100)\n", 176 | "max(w['auc-mean'])" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 10, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "[]" 188 | ] 189 | }, 190 | "execution_count": 10, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | }, 194 | { 195 | "data": { 196 | "image/png": "\n", 197 | "text/plain": [ 198 | "
" 199 | ] 200 | }, 201 | "metadata": { 202 | "needs_background": "light" 203 | }, 204 | "output_type": "display_data" 205 | } 206 | ], 207 | "source": [ 208 | "plt.plot(w['auc-mean'])" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "# Подготовка ответа" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "a1 = 0.0\n", 225 | "for t in range(10):\n", 226 | " model1 = lgb.LGBMClassifier(learning_rate=0.01, num_leaves=6,\n", 227 | " n_estimators=290,\n", 228 | " colsample_bytree=0.75, subsample=0.75, random_state=t)\n", 229 | " model1.fit(X, y1)\n", 230 | " a = model1.predict_proba(X_test)[:,1]\n", 231 | " # print (a)\n", 232 | " a1 += a\n", 233 | "a1 = a1 / 10" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 12, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "a2 = 0.0\n", 243 | "for t in range(10):\n", 244 | " model2 = lgb.LGBMClassifier(learning_rate=0.03, num_leaves=2,\n", 245 | " n_estimators=378,\n", 246 | " colsample_bytree=0.75, subsample=0.75, random_state=t)\n", 247 | " model2.fit(X, y2)\n", 248 | " a = model2.predict_proba(X_test)[:,1]\n", 249 | " a2 += a\n", 250 | "a2 = a2 / 10" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 13, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "a3 = 0.0\n", 260 | "for t in range(10):\n", 261 | " model3 = lgb.LGBMClassifier(learning_rate=0.01, num_leaves=4,\n", 262 | " n_estimators=543,\n", 263 | " colsample_bytree=0.75, subsample=0.75, random_state=t)\n", 264 | " model3.fit(X, y3)\n", 265 | " a = model3.predict_proba(X_test)[:,1]\n", 266 | " a3 += a\n", 267 | "a3 = a3 / 10" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 14, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "a4 = 0.0\n", 277 | "for t in range(10):\n", 278 | " model4 = 
lgb.LGBMClassifier(learning_rate=0.003, num_leaves=6,\n", 279 | " n_estimators=618,\n", 280 | " colsample_bytree=0.75, subsample=0.75, random_state=t)\n", 281 | " model4.fit(X, y4)\n", 282 | " a = model4.predict_proba(X_test)[:,1]\n", 283 | " a4 += a\n", 284 | "a4 = a4 / 10" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "a5 = 0.0\n", 294 | "for t in range(10):\n", 295 | " model5 = lgb.LGBMClassifier(learning_rate=0.002, num_leaves=3,\n", 296 | " n_estimators=516,\n", 297 | " colsample_bytree=0.75, subsample=0.75, random_state=t)\n", 298 | " model5.fit(X, y5)\n", 299 | " a = model5.predict_proba(X_test)[:,1]\n", 300 | " #print (a)\n", 301 | " a5 += a\n", 302 | "a5 = a5 / 10" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 16, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/html": [ 313 | "
\n", 314 | "\n", 327 | "\n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | "
id12345
000.3415970.3362500.2534920.3006790.428683
110.3458120.2271940.3009290.2826220.466563
220.4377190.4842280.2260820.3072570.444477
340.3222150.2744120.2440450.3012360.394336
470.3669070.2536260.4355250.3803120.456793
\n", 387 | "
" 388 | ], 389 | "text/plain": [ 390 | " id 1 2 3 4 5\n", 391 | "0 0 0.341597 0.336250 0.253492 0.300679 0.428683\n", 392 | "1 1 0.345812 0.227194 0.300929 0.282622 0.466563\n", 393 | "2 2 0.437719 0.484228 0.226082 0.307257 0.444477\n", 394 | "3 4 0.322215 0.274412 0.244045 0.301236 0.394336\n", 395 | "4 7 0.366907 0.253626 0.435525 0.380312 0.456793" 396 | ] 397 | }, 398 | "execution_count": 16, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "df = pd.DataFrame({'id': X1_test.id.values,\n", 405 | " '1': a1,\n", 406 | " '2': a2,\n", 407 | " '3': a3,\n", 408 | " '4': a4,\n", 409 | " '5': a5})\n", 410 | "df.to_csv('mmp_baseline_.csv', index=False)\n", 411 | "df.head()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [] 420 | } 421 | ], 422 | "metadata": { 423 | "kernelspec": { 424 | "display_name": "Python 3", 425 | "language": "python", 426 | "name": "python3" 427 | }, 428 | "language_info": { 429 | "codemirror_mode": { 430 | "name": "ipython", 431 | "version": 3 432 | }, 433 | "file_extension": ".py", 434 | "mimetype": "text/x-python", 435 | "name": "python", 436 | "nbconvert_exporter": "python", 437 | "pygments_lexer": "ipython3", 438 | "version": "3.7.3" 439 | } 440 | }, 441 | "nbformat": 4, 442 | "nbformat_minor": 2 443 | } 444 | -------------------------------------------------------------------------------- /dj_invest_GMSC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# этапы решения задачи на реальном примере\n", 8 | "\n", 9 | "для курса \"Машинное обучение и анализ данных\" https://github.com/Dyakonov/MLDM/\n", 10 | " \n", 11 | "2019, Александр Дьяконов https://dyakonov.org/ag/" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 
| "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "Populating the interactive namespace from numpy and matplotlib\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "%pylab inline\n", 31 | "plt.style.use('seaborn-dark')\n", 32 | "import warnings\n", 33 | "warnings.filterwarnings(\"ignore\") # отключение варнингов\n", 34 | "pd.set_option('display.max_columns', None) # pd.options.display.max_columns = None \n", 35 | "# pd.set_option('display.max_rows', None) # не прятать столбцы при выводе дата-фреймов\n", 36 | "import matplotlib.pyplot as plt\n", 37 | "import matplotlib as mpl\n", 38 | "plt.rc('font', size=14)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# загрузили данные" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "train = pd.read_csv('./data_GMSC/train.csv')\n", 55 | "test = pd.read_csv('./data_GMSC/test.csv')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "(112500, 11) (37500, 10)\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# размеры данных\n", 73 | "print(train.shape, test.shape)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# посмотрели" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 105 | "\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
плохой_клиентлиниивозрастповедение_30-59_днейDebt_Ratioдоходчисло_кредитовповедение_90_днейнедвижимостьповедение_60-89_днейсемья
2212600.3575965220.5320584600.0140104.0
5438700.05614560056.000000NaN40000.0
81500.4472244500.6536079009.0140303.0
1304300.0988105400.20373619166.0150204.0
7546900.6835543400.2641685416.090102.0
\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " плохой_клиент линии возраст поведение_30-59_дней Debt_Ratio \\\n", 199 | "22126 0 0.357596 52 2 0.532058 \n", 200 | "54387 0 0.056145 60 0 56.000000 \n", 201 | "815 0 0.447224 45 0 0.653607 \n", 202 | "13043 0 0.098810 54 0 0.203736 \n", 203 | "75469 0 0.683554 34 0 0.264168 \n", 204 | "\n", 205 | " доход число_кредитов поведение_90_дней недвижимость \\\n", 206 | "22126 4600.0 14 0 1 \n", 207 | "54387 NaN 4 0 0 \n", 208 | "815 9009.0 14 0 3 \n", 209 | "13043 19166.0 15 0 2 \n", 210 | "75469 5416.0 9 0 1 \n", 211 | "\n", 212 | " поведение_60-89_дней семья \n", 213 | "22126 0 4.0 \n", 214 | "54387 0 0.0 \n", 215 | "815 0 3.0 \n", 216 | "13043 0 4.0 \n", 217 | "75469 0 2.0 " 218 | ] 219 | }, 220 | "execution_count": 5, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "train.sample(5)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "# особенности" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Нам повезло: нет категориальных признаков - не надо думать о кодировках\n", 241 | " \n", 242 | "Но есть пропуски: пока не будем думать о них (попробуйте придумать что-то умнее) - заменим (-1)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 6, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "((112500, 10), (112500,))" 254 | ] 255 | }, 256 | "execution_count": 6, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "y = train.pop('плохой_клиент') # целевой вектор\n", 263 | "train.shape, y.shape" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 7, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "# заменить пропуски на -1\n", 273 | "train.fillna(-1, inplace=True)\n", 274 | "test.fillna(-1, inplace=True)" 275 | ] 276
| }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "у нас задача бинарной классификации:" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 8, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "array([0, 1])" 293 | ] 294 | }, 295 | "execution_count": 8, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "np.unique(y)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "перечислим подходящие алгоритмы для бинарной классификации (тут, кстати, не все алгоритмы):" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 16, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "from sklearn.linear_model import LogisticRegression\n", 318 | "from sklearn.svm import LinearSVC\n", 319 | "from sklearn.linear_model import SGDClassifier\n", 320 | "from sklearn.neighbors import KNeighborsClassifier\n", 321 | "from sklearn.ensemble import RandomForestClassifier\n", 322 | "from sklearn.ensemble import ExtraTreesClassifier\n", 323 | "from sklearn.ensemble import GradientBoostingClassifier\n", 324 | "\n", 325 | "models = {'лог_регрессия': LogisticRegression(),\n", 326 | " 'лин_svm': LinearSVC(),\n", 327 | " 'SGD': SGDClassifier(),\n", 328 | " 'knn': KNeighborsClassifier(),\n", 329 | " 'RF': RandomForestClassifier(),\n", 330 | " 'ETC': ExtraTreesClassifier(),\n", 331 | " 'GBM': GradientBoostingClassifier()} " 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "поэкспериментируем со всеми алгоритмами (параметры по умолчанию)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 22, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "лог_регрессия auc=0.697 std=0.011\n", 351 | "лин_svm auc=0.565 
std=0.029\n", 352 | "SGD auc=0.537 std=0.036\n", 353 | "knn auc=0.568 std=0.008\n", 354 | "RF auc=0.777 std=0.007\n", 355 | "ETC auc=0.778 std=0.01\n", 356 | "GBM auc=0.866 std=0.002\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "from sklearn.model_selection import cross_val_score\n", 362 | "from sklearn.model_selection import ShuffleSplit\n", 363 | "\n", 364 | "cv = ShuffleSplit(n_splits=5, test_size=0.1, train_size=None, random_state=1)\n", 365 | "\n", 366 | "for model_name in models:\n", 367 | " model = models[model_name]\n", 368 | " cvs = cross_val_score(model, train, y, cv=cv, scoring='roc_auc')\n", 369 | " print (model_name, f\"auc={np.round(np.mean(cvs), 3)}\", f\"std={np.round(np.std(cvs), 3)}\")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "некоторые алгоритмы долго обучаются\n", 377 | "\n", 378 | "совет: поймите какие и от чего это зависит!\n", 379 | "\n", 380 | "пока самый лучший алгоритм - **градиентный бустинг**\n", 381 | "\n", 382 | "здесь метрика качества - AUC ROC\n", 383 | "https://dyakonov.org/2017/07/28/auc-roc-%D0%BF%D0%BB%D0%BE%D1%89%D0%B0%D0%B4%D1%8C-%D0%BF%D0%BE%D0%B4-%D0%BA%D1%80%D0%B8%D0%B2%D0%BE%D0%B9-%D0%BE%D1%88%D0%B8%D0%B1%D0%BE%D0%BA/\n", 384 | "\n", 385 | "Метрик качества очень много! 
Вот некоторые из них:" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 14, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])" 397 | ] 398 | }, 399 | "execution_count": 14, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "from sklearn.metrics import SCORERS\n", 406 | "SCORERS.keys()" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "настроим параметры бустинга" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 28, 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "RandomizedSearchCV(cv=ShuffleSplit(n_splits=5, random_state=None, test_size=0.1, train_size=None),\n", 425 | " error_score='raise-deprecating',\n", 426 | " estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", 427 | " learning_rate=0.1, loss='deviance', max_depth=3,\n", 428 | " max_features=None, max_leaf_nodes=None,\n", 429 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 430 | " min_samples_leaf=1, min_sampl... 
subsample=1.0, tol=0.0001, validation_fraction=0.1,\n", 431 | " verbose=0, warm_start=False),\n", 432 | " fit_params=None, iid='warn', n_iter=10, n_jobs=-1,\n", 433 | " param_distributions={'learning_rate': [0.05, 0.1, 0.2], 'subsample': [0.5, 1.0], 'max_depth': [1, 2, 3, 4, 5], 'max_features': [0.5, 0.75, 1.0]},\n", 434 | " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", 435 | " return_train_score='warn', scoring='roc_auc', verbose=0)" 436 | ] 437 | }, 438 | "execution_count": 28, 439 | "metadata": {}, 440 | "output_type": "execute_result" 441 | } 442 | ], 443 | "source": [ 444 | "from sklearn.model_selection import RandomizedSearchCV\n", 445 | "\n", 446 | "params = {'learning_rate': [0.05, 0.1, 0.2], 'subsample': [0.5, 1.0], 'max_depth': [1, 2, 3, 4, 5], 'max_features': [0.5, 0.75, 1.0]}\n", 447 | "\n", 448 | "model = GradientBoostingClassifier()\n", 449 | "\n", 450 | "rs = RandomizedSearchCV(model, params, n_iter=10, scoring='roc_auc', n_jobs=-1, cv=cv)\n", 451 | "\n", 452 | "rs.fit(train, y)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 30, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/html": [ 463 | "
\n", 464 | "\n", 477 | "\n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " 
\n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | "
mean_fit_timestd_fit_timemean_score_timestd_score_timeparam_subsampleparam_max_featuresparam_max_depthparam_learning_rateparamssplit0_test_scoresplit1_test_scoresplit2_test_scoresplit3_test_scoresplit4_test_scoremean_test_scorestd_test_scorerank_test_scoresplit0_train_scoresplit1_train_scoresplit2_train_scoresplit3_train_scoresplit4_train_scoremean_train_scorestd_train_score
06.8368090.2890800.0189130.00014810.7540.05{'subsample': 1.0, 'max_features': 0.75, 'max_...0.8643990.8688790.8622850.8519240.8465870.8588150.00826560.8685080.8678890.8684000.8690920.8699010.8687580.000688
16.1396521.1586660.0201980.00786710.7530.05{'subsample': 1.0, 'max_features': 0.75, 'max_...0.8631620.8675770.8601940.8505980.8453860.8573830.00818790.8639490.8636990.8644720.8653490.8658110.8646560.000808
27.0810660.4551160.0194970.0009240.50.540.05{'subsample': 0.5, 'max_features': 0.5, 'max_d...0.8646760.8690230.8621860.8523090.8463780.8589140.00832950.8671700.8669710.8673520.8690770.8693990.8679940.001028
34.4717080.4410730.0134200.0040680.50.7520.2{'subsample': 0.5, 'max_features': 0.75, 'max_...0.8646680.8679070.8619870.8505040.8472440.8584620.00811670.8641270.8650110.8651980.8663550.8665360.8654450.000895
42.1144950.0467760.0092470.00098810.7510.05{'subsample': 1.0, 'max_features': 0.75, 'max_...0.8527360.8553340.8479090.8402060.8363360.8465040.007236100.8515550.8506050.8507730.8528620.8536010.8518790.001173
55.2276250.4268650.0153710.0002380.50.7530.05{'subsample': 0.5, 'max_features': 0.75, 'max_...0.8626990.8683070.8608500.8498810.8463550.8576190.00821380.8635110.8634610.8642450.8647700.8662660.8644510.001030
68.5503110.6369350.0235140.00062010.550.05{'subsample': 1.0, 'max_features': 0.5, 'max_d...0.8651470.8692980.8625800.8542520.8481930.8598940.00764240.8726830.8730400.8731560.8741110.8747010.8735380.000749
75.7458840.2324850.0178840.0002160.50.540.1{'subsample': 0.5, 'max_features': 0.5, 'max_d...0.8639340.8694590.8631710.8547910.8498440.8602400.00700030.8704330.8706150.8720360.8726330.8721470.8715730.000882
89.2472650.2957920.0215390.00023410.7550.1{'subsample': 1.0, 'max_features': 0.75, 'max_...0.8664980.8693210.8632940.8546220.8491500.8605770.00754920.8793760.8785380.8799750.8800510.8806070.8797090.000704
95.2083280.1497210.0176660.00075410.540.2{'subsample': 1.0, 'max_features': 0.5, 'max_d...0.8674510.8700340.8645940.8523940.8495550.8608060.00825810.8768570.8770020.8779720.8779810.8793700.8778360.000900
\n", 780 | "
" 781 | ], 782 | "text/plain": [ 783 | " mean_fit_time std_fit_time mean_score_time std_score_time \\\n", 784 | "0 6.836809 0.289080 0.018913 0.000148 \n", 785 | "1 6.139652 1.158666 0.020198 0.007867 \n", 786 | "2 7.081066 0.455116 0.019497 0.000924 \n", 787 | "3 4.471708 0.441073 0.013420 0.004068 \n", 788 | "4 2.114495 0.046776 0.009247 0.000988 \n", 789 | "5 5.227625 0.426865 0.015371 0.000238 \n", 790 | "6 8.550311 0.636935 0.023514 0.000620 \n", 791 | "7 5.745884 0.232485 0.017884 0.000216 \n", 792 | "8 9.247265 0.295792 0.021539 0.000234 \n", 793 | "9 5.208328 0.149721 0.017666 0.000754 \n", 794 | "\n", 795 | " param_subsample param_max_features param_max_depth param_learning_rate \\\n", 796 | "0 1 0.75 4 0.05 \n", 797 | "1 1 0.75 3 0.05 \n", 798 | "2 0.5 0.5 4 0.05 \n", 799 | "3 0.5 0.75 2 0.2 \n", 800 | "4 1 0.75 1 0.05 \n", 801 | "5 0.5 0.75 3 0.05 \n", 802 | "6 1 0.5 5 0.05 \n", 803 | "7 0.5 0.5 4 0.1 \n", 804 | "8 1 0.75 5 0.1 \n", 805 | "9 1 0.5 4 0.2 \n", 806 | "\n", 807 | " params split0_test_score \\\n", 808 | "0 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.864399 \n", 809 | "1 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.863162 \n", 810 | "2 {'subsample': 0.5, 'max_features': 0.5, 'max_d... 0.864676 \n", 811 | "3 {'subsample': 0.5, 'max_features': 0.75, 'max_... 0.864668 \n", 812 | "4 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.852736 \n", 813 | "5 {'subsample': 0.5, 'max_features': 0.75, 'max_... 0.862699 \n", 814 | "6 {'subsample': 1.0, 'max_features': 0.5, 'max_d... 0.865147 \n", 815 | "7 {'subsample': 0.5, 'max_features': 0.5, 'max_d... 0.863934 \n", 816 | "8 {'subsample': 1.0, 'max_features': 0.75, 'max_... 0.866498 \n", 817 | "9 {'subsample': 1.0, 'max_features': 0.5, 'max_d... 
0.867451 \n", 818 | "\n", 819 | " split1_test_score split2_test_score split3_test_score split4_test_score \\\n", 820 | "0 0.868879 0.862285 0.851924 0.846587 \n", 821 | "1 0.867577 0.860194 0.850598 0.845386 \n", 822 | "2 0.869023 0.862186 0.852309 0.846378 \n", 823 | "3 0.867907 0.861987 0.850504 0.847244 \n", 824 | "4 0.855334 0.847909 0.840206 0.836336 \n", 825 | "5 0.868307 0.860850 0.849881 0.846355 \n", 826 | "6 0.869298 0.862580 0.854252 0.848193 \n", 827 | "7 0.869459 0.863171 0.854791 0.849844 \n", 828 | "8 0.869321 0.863294 0.854622 0.849150 \n", 829 | "9 0.870034 0.864594 0.852394 0.849555 \n", 830 | "\n", 831 | " mean_test_score std_test_score rank_test_score split0_train_score \\\n", 832 | "0 0.858815 0.008265 6 0.868508 \n", 833 | "1 0.857383 0.008187 9 0.863949 \n", 834 | "2 0.858914 0.008329 5 0.867170 \n", 835 | "3 0.858462 0.008116 7 0.864127 \n", 836 | "4 0.846504 0.007236 10 0.851555 \n", 837 | "5 0.857619 0.008213 8 0.863511 \n", 838 | "6 0.859894 0.007642 4 0.872683 \n", 839 | "7 0.860240 0.007000 3 0.870433 \n", 840 | "8 0.860577 0.007549 2 0.879376 \n", 841 | "9 0.860806 0.008258 1 0.876857 \n", 842 | "\n", 843 | " split1_train_score split2_train_score split3_train_score \\\n", 844 | "0 0.867889 0.868400 0.869092 \n", 845 | "1 0.863699 0.864472 0.865349 \n", 846 | "2 0.866971 0.867352 0.869077 \n", 847 | "3 0.865011 0.865198 0.866355 \n", 848 | "4 0.850605 0.850773 0.852862 \n", 849 | "5 0.863461 0.864245 0.864770 \n", 850 | "6 0.873040 0.873156 0.874111 \n", 851 | "7 0.870615 0.872036 0.872633 \n", 852 | "8 0.878538 0.879975 0.880051 \n", 853 | "9 0.877002 0.877972 0.877981 \n", 854 | "\n", 855 | " split4_train_score mean_train_score std_train_score \n", 856 | "0 0.869901 0.868758 0.000688 \n", 857 | "1 0.865811 0.864656 0.000808 \n", 858 | "2 0.869399 0.867994 0.001028 \n", 859 | "3 0.866536 0.865445 0.000895 \n", 860 | "4 0.853601 0.851879 0.001173 \n", 861 | "5 0.866266 0.864451 0.001030 \n", 862 | "6 0.874701 0.873538 0.000749 \n", 863 
| "7 0.872147 0.871573 0.000882 \n", 864 | "8 0.880607 0.879709 0.000704 \n", 865 | "9 0.879370 0.877836 0.000900 " 866 | ] 867 | }, 868 | "execution_count": 30, 869 | "metadata": {}, 870 | "output_type": "execute_result" 871 | } 872 | ], 873 | "source": [ 874 | "pd.DataFrame(rs.cv_results_)" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "лучшие параметры" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 33, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "data": { 891 | "text/plain": [ 892 | "{'subsample': 1.0, 'max_features': 0.5, 'max_depth': 4, 'learning_rate': 0.2}" 893 | ] 894 | }, 895 | "execution_count": 33, 896 | "metadata": {}, 897 | "output_type": "execute_result" 898 | } 899 | ], 900 | "source": [ 901 | "rs.best_params_  # configuration with the highest mean CV score; index -1 would only be the last one sampled" 902 | ] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "metadata": {}, 907 | "source": [ 908 | "ожидаемое качество" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 36, 914 | "metadata": {}, 915 | "outputs": [ 916 | { 917 | "data": { 918 | "text/plain": [ 919 | "0.8608056678859954" 920 | ] 921 | }, 922 | "execution_count": 36, 923 | "metadata": {}, 924 | "output_type": "execute_result" 925 | } 926 | ], 927 | "source": [ 928 | "rs.best_score_  # mean cross-validated roc_auc of the best configuration" 929 | ] 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": {}, 934 | "source": [ 935 | "Кстати, хуже параметров по умолчанию;)\n", 936 | "\n", 937 | "Мало экспериментов..."
938 | ] 939 | }, 940 | { 941 | "cell_type": "markdown", 942 | "metadata": {}, 943 | "source": [ 944 | "# советы по улучшению\n", 945 | "\n", 946 | "раз лучшим оказался градиентный бустинг => смотрим его лучшие реализации\n", 947 | "\n", 948 | "* xgboost https://en.wikipedia.org/wiki/XGBoost\n", 949 | "* lightgbm https://github.com/Microsoft/LightGBM\n", 950 | "* catboost https://tech.yandex.ru/catboost/" 951 | ] 952 | } 953 | ], 954 | "metadata": { 955 | "kernelspec": { 956 | "display_name": "Python 3", 957 | "language": "python", 958 | "name": "python3" 959 | }, 960 | "language_info": { 961 | "codemirror_mode": { 962 | "name": "ipython", 963 | "version": 3 964 | }, 965 | "file_extension": ".py", 966 | "mimetype": "text/x-python", 967 | "name": "python", 968 | "nbconvert_exporter": "python", 969 | "pygments_lexer": "ipython3", 970 | "version": "3.6.8" 971 | } 972 | }, 973 | "nbformat": 4, 974 | "nbformat_minor": 2 975 | } 976 | -------------------------------------------------------------------------------- /dj_Benchmark_12trip.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Бенчмарк для задачи OneTwoTrip Contest\n", 8 | "\n", 9 | "https://boosters.pro/championship/onetwotrip_challenge/overview\n", 10 | " \n", 11 | " \n", 12 | "для студентов ВМК МГУ\n", 13 | "\n", 14 | "2019, Александр Дьяконов www.dyakonov.org/ag/" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "Populating the interactive namespace from numpy and matplotlib\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "# подгружаем все нужные пакеты\n", 32 | "import pandas as pd\n", 33 | "import numpy as np\n", 34 | "# для встроенных картинок\n", 35 | "%pylab inline\n", 36 | "# отключить предупреждения\n", 37 | "import warnings\n", 
38 | "warnings.filterwarnings('ignore')\n", 39 | "# прогресс-бар\n", 40 | "from tqdm import tqdm, tqdm_notebook" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## загрузка данных" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "размеры: (196056, 43) (455011, 37)\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "data_train = pd.read_csv('onetwotrip_challenge_train.csv')\n", 65 | "data_test = pd.read_csv('onetwotrip_challenge_test.csv')\n", 66 | "print ('размеры:', data_train.shape, data_test.shape)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## посмотрим на данные\n", 74 | "\n", 75 | "обратите внимание, как выводятся дата-фреймы" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
orderiduseridfield0field1field2field3field4field5field6field7...indicator_goal22indicator_goal23indicator_goal24indicator_goal25goal21goal22goal23goal24goal25goal1
0010d654494cbe97bbb25d51ead2600679aff9e097924add...0-0.62650811121101...1011010000
114aafc0391f72bbcf60537aece62923baf9ce644b64ac36...144-0.393794572002...1010000000
22bac8ffef46348f587c8d17137ab01fb24aef21547c647d...134-0.548937232001...1011000000
330392247b4b87674aba2c32bf2292b105771a6a376871be...0-0.23865110111132...1011000000
44d1aeefef311bbeb4bd84876c8d49421f276674527d5578...0-0.7040798111101...1001000000
\n", 250 | "

5 rows × 43 columns

\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " orderid userid field0 \\\n", 255 | "0 0 10d654494cbe97bbb25d51ead2600679aff9e097924add... 0 \n", 256 | "1 1 4aafc0391f72bbcf60537aece62923baf9ce644b64ac36... 144 \n", 257 | "2 2 bac8ffef46348f587c8d17137ab01fb24aef21547c647d... 134 \n", 258 | "3 3 0392247b4b87674aba2c32bf2292b105771a6a376871be... 0 \n", 259 | "4 4 d1aeefef311bbeb4bd84876c8d49421f276674527d5578... 0 \n", 260 | "\n", 261 | " field1 field2 field3 field4 field5 field6 field7 ... \\\n", 262 | "0 -0.626508 11 12 1 1 0 1 ... \n", 263 | "1 -0.393794 5 7 2 0 0 2 ... \n", 264 | "2 -0.548937 2 3 2 0 0 1 ... \n", 265 | "3 -0.238651 10 11 1 1 3 2 ... \n", 266 | "4 -0.704079 8 11 1 1 0 1 ... \n", 267 | "\n", 268 | " indicator_goal22 indicator_goal23 indicator_goal24 indicator_goal25 \\\n", 269 | "0 1 0 1 1 \n", 270 | "1 1 0 1 0 \n", 271 | "2 1 0 1 1 \n", 272 | "3 1 0 1 1 \n", 273 | "4 1 0 0 1 \n", 274 | "\n", 275 | " goal21 goal22 goal23 goal24 goal25 goal1 \n", 276 | "0 0 1 0 0 0 0 \n", 277 | "1 0 0 0 0 0 0 \n", 278 | "2 0 0 0 0 0 0 \n", 279 | "3 0 0 0 0 0 0 \n", 280 | "4 0 0 0 0 0 0 \n", 281 | "\n", 282 | "[5 rows x 43 columns]" 283 | ] 284 | }, 285 | "execution_count": 4, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "data_train.head()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 5, 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "data": { 301 | "text/html": [ 302 | "
\n", 303 | "\n", 316 | "\n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | "
orderiduseridfield0field1field2field3field4field5field6field7...field25field26field27field28field29indicator_goal21indicator_goal22indicator_goal23indicator_goal24indicator_goal25
003a6a6af55e097d3f92705936a7ea3ca8aef651f5966832...0-0.54893710101101...1110411001
112df10f61960163da94a4294773ed9c865296e37c330304...82-0.626508343001...1110111011
2220dc3fec5b5eb42fbfe08119063c3a0010a73c7ec94abb...0-0.548937681102...35110210011
33ed75b3496977bac207eccb59dc91fe9a8d6a27777a6422...60.3043487720101...1310311011
44a346d08351c5fd0bda82984ed7c8b12b6395829da5b857...115-0.471365332001...1110111011
\n", 466 | "

5 rows × 37 columns

\n", 467 | "
" 468 | ], 469 | "text/plain": [ 470 | " orderid userid field0 \\\n", 471 | "0 0 3a6a6af55e097d3f92705936a7ea3ca8aef651f5966832... 0 \n", 472 | "1 1 2df10f61960163da94a4294773ed9c865296e37c330304... 82 \n", 473 | "2 2 20dc3fec5b5eb42fbfe08119063c3a0010a73c7ec94abb... 0 \n", 474 | "3 3 ed75b3496977bac207eccb59dc91fe9a8d6a27777a6422... 6 \n", 475 | "4 4 a346d08351c5fd0bda82984ed7c8b12b6395829da5b857... 115 \n", 476 | "\n", 477 | " field1 field2 field3 field4 field5 field6 field7 ... field25 \\\n", 478 | "0 -0.548937 10 10 1 1 0 1 ... 1 \n", 479 | "1 -0.626508 3 4 3 0 0 1 ... 1 \n", 480 | "2 -0.548937 6 8 1 1 0 2 ... 35 \n", 481 | "3 0.304348 7 7 2 0 10 1 ... 1 \n", 482 | "4 -0.471365 3 3 2 0 0 1 ... 1 \n", 483 | "\n", 484 | " field26 field27 field28 field29 indicator_goal21 indicator_goal22 \\\n", 485 | "0 1 1 0 4 1 1 \n", 486 | "1 1 1 0 1 1 1 \n", 487 | "2 1 1 0 2 1 0 \n", 488 | "3 3 1 0 3 1 1 \n", 489 | "4 1 1 0 1 1 1 \n", 490 | "\n", 491 | " indicator_goal23 indicator_goal24 indicator_goal25 \n", 492 | "0 0 0 1 \n", 493 | "1 0 1 1 \n", 494 | "2 0 1 1 \n", 495 | "3 0 1 1 \n", 496 | "4 0 1 1 \n", 497 | "\n", 498 | "[5 rows x 37 columns]" 499 | ] 500 | }, 501 | "execution_count": 5, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "data_test.head()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "## получаем таблички для обучения" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 6, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "# удаляем ненужные признаки\n", 524 | "ids = data_test.pop('orderid') # сохраняем id для теста\n", 525 | "data_test.drop(['userid'], inplace=True, axis=1)" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 7, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "Index(['field0', 'field1', 'field2', 'field3', 'field4', 
'field5', 'field6',\n", 537 | " 'field7', 'field8', 'field9', 'field10', 'field11', 'field12',\n", 538 | " 'field13', 'field14', 'field15', 'field16', 'field17', 'field18',\n", 539 | " 'field19', 'field20', 'field21', 'field22', 'field23', 'field24',\n", 540 | " 'field25', 'field26', 'field27', 'field28', 'field29',\n", 541 | " 'indicator_goal21', 'indicator_goal22', 'indicator_goal23',\n", 542 | " 'indicator_goal24', 'indicator_goal25'],\n", 543 | " dtype='object')" 544 | ] 545 | }, 546 | "execution_count": 7, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "cols = data_test.columns # значимые колонки\n", 553 | "cols" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 8, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "y = data_train.pop('goal1') # целевая переменная для первой задачи" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 9, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "y = y.values # мне так спокойней - в numpy.array" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 10, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "data_train = data_train[cols] # оставить только нужные колонки" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 11, 586 | "metadata": {}, 587 | "outputs": [ 588 | { 589 | "data": { 590 | "text/html": [ 591 | "
\n", 592 | "\n", 605 | "\n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | "
field0field1field2field3field4field5field6field7field8field9...field25field26field27field28field29indicator_goal21indicator_goal22indicator_goal23indicator_goal24indicator_goal25
00-0.6265081112110110...1210411011
1144-0.39379457200210...41310211010
2134-0.54893723200110...11170111011
30-0.2386511011113210...18110411011
40-0.704079811110110...1110311001
\n", 755 | "

5 rows × 35 columns

\n", 756 | "
" 757 | ], 758 | "text/plain": [ 759 | " field0 field1 field2 field3 field4 field5 field6 field7 field8 \\\n", 760 | "0 0 -0.626508 11 12 1 1 0 1 1 \n", 761 | "1 144 -0.393794 5 7 2 0 0 2 1 \n", 762 | "2 134 -0.548937 2 3 2 0 0 1 1 \n", 763 | "3 0 -0.238651 10 11 1 1 3 2 1 \n", 764 | "4 0 -0.704079 8 11 1 1 0 1 1 \n", 765 | "\n", 766 | " field9 ... field25 field26 field27 field28 field29 indicator_goal21 \\\n", 767 | "0 0 ... 1 2 1 0 4 1 \n", 768 | "1 0 ... 41 3 1 0 2 1 \n", 769 | "2 0 ... 1 11 7 0 1 1 \n", 770 | "3 0 ... 18 1 1 0 4 1 \n", 771 | "4 0 ... 1 1 1 0 3 1 \n", 772 | "\n", 773 | " indicator_goal22 indicator_goal23 indicator_goal24 indicator_goal25 \n", 774 | "0 1 0 1 1 \n", 775 | "1 1 0 1 0 \n", 776 | "2 1 0 1 1 \n", 777 | "3 1 0 1 1 \n", 778 | "4 1 0 0 1 \n", 779 | "\n", 780 | "[5 rows x 35 columns]" 781 | ] 782 | }, 783 | "execution_count": 11, 784 | "metadata": {}, 785 | "output_type": "execute_result" 786 | } 787 | ], 788 | "source": [ 789 | "# что получилось\n", 790 | "data_train.head()" 791 | ] 792 | }, 793 | { 794 | "cell_type": "markdown", 795 | "metadata": {}, 796 | "source": [ 797 | "### Эксперименты\n", 798 | "\n", 799 | "сначала делим выборку на обучение и тест\n", 800 | "\n", 801 | "не самое лучшее решение, но для быстроты экспериментов сгодится" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": 12, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "from sklearn.model_selection import train_test_split\n", 811 | "X_train, X_test, y_train, y_test = train_test_split(data_train, y, test_size=0.3, random_state=1)" 812 | ] 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": {}, 817 | "source": [ 818 | "### Случайный лес\n", 819 | "\n", 820 | "\n", 821 | "строим по одному дереву и вычисляем метрику качества (ROC AUC)\n", 822 | "\n", 823 | "обратите внимание на прогресс-бар" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 14, 829 | "metadata": {}, 830 | "outputs": 
[ 831 | { 832 | "data": { 833 | "application/vnd.jupyter.widget-view+json": { 834 | "model_id": "f2be4ed53aa84261896b78ba8c029131", 835 | "version_major": 2, 836 | "version_minor": 0 837 | }, 838 | "text/plain": [ 839 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 840 | ] 841 | }, 842 | "metadata": {}, 843 | "output_type": "display_data" 844 | }, 845 | { 846 | "name": "stdout", 847 | "output_type": "stream", 848 | "text": [ 849 | "\n" 850 | ] 851 | }, 852 | { 853 | "data": { 854 | "text/plain": [ 855 | "[]" 856 | ] 857 | }, 858 | "execution_count": 14, 859 | "metadata": {}, 860 | "output_type": "execute_result" 861 | }, 862 | { 863 | "data": { 864 | "image/png": "\n", 865 | "text/plain": [ 866 | "
" 867 | ] 868 | }, 869 | "metadata": { 870 | "needs_background": "light" 871 | }, 872 | "output_type": "display_data" 873 | } 874 | ], 875 | "source": [ 876 | "from sklearn.ensemble import RandomForestClassifier\n", 877 | "from sklearn.metrics import roc_auc_score\n", 878 | "\n", 879 | "model = RandomForestClassifier(max_features=1, n_estimators=1, oob_score=False, warm_start=True, random_state=1)\n", 880 | "\n", 881 | "aucs = []\n", 882 | "for t in tqdm_notebook(list(range(1, 101))):\n", 883 | " model.set_params(n_estimators=t)\n", 884 | " model.fit(X_train, y_train)\n", 885 | " a = model.predict_proba(X_test)[:, 1]\n", 886 | " q = roc_auc_score(y_test, a)\n", 887 | " aucs.append(q)\n", 888 | " \n", 889 | "plt.plot(range(1, 101), aucs)" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": 46, 895 | "metadata": {}, 896 | "outputs": [ 897 | { 898 | "data": { 899 | "application/vnd.jupyter.widget-view+json": { 900 | "model_id": "08b2382018ab4201b3b2c5904899732f", 901 | "version_major": 2, 902 | "version_minor": 0 903 | }, 904 | "text/plain": [ 905 | "HBox(children=(IntProgress(value=0), HTML(value='')))" 906 | ] 907 | }, 908 | "metadata": {}, 909 | "output_type": "display_data" 910 | }, 911 | { 912 | "data": { 913 | "text/plain": [ 914 | "[]" 915 | ] 916 | }, 917 | "execution_count": 46, 918 | "metadata": {}, 919 | "output_type": "execute_result" 920 | }, 921 | { 922 | "data": { 923 | "image/png": "\n", 924 | "text/plain": [ 925 | "
" 926 | ] 927 | }, 928 | "metadata": { 929 | "needs_background": "light" 930 | }, 931 | "output_type": "display_data" 932 | } 933 | ], 934 | "source": [ 935 | "# аналогичные эксперименты с max_features=2\n", 936 | "\n", 937 | "model = RandomForestClassifier(max_features=2, n_estimators=1, oob_score=False, warm_start=True, random_state=1)\n", 938 | "\n", 939 | "aucs = []\n", 940 | "for t in tqdm_notebook(list(range(1, 101))):\n", 941 | " model.set_params(n_estimators=t)\n", 942 | " model.fit(X_train, y_train)\n", 943 | " a = model.predict_proba(X_test)[:, 1]\n", 944 | " q = roc_auc_score(y_test, a)\n", 945 | " aucs.append(q)\n", 946 | " \n", 947 | "plt.plot(range(1, 101), aucs) " 948 | ] 949 | }, 950 | { 951 | "cell_type": "markdown", 952 | "metadata": {}, 953 | "source": [ 954 | "# Козырь\n", 955 | "\n", 956 | "более хорошая модель из другой библиотеки" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": 15, 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [ 965 | "import lightgbm as lgb\n", 966 | "model = lgb.LGBMClassifier(num_leaves=6, learning_rate=0.1, n_estimators=300)" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": 16, 972 | "metadata": {}, 973 | "outputs": [ 974 | { 975 | "name": "stdout", 976 | "output_type": "stream", 977 | "text": [ 978 | "[50]\tvalid_0's auc: 0.68674\tvalid_1's auc: 0.665981\n", 979 | "[100]\tvalid_0's auc: 0.71142\tvalid_1's auc: 0.675922\n", 980 | "[150]\tvalid_0's auc: 0.727589\tvalid_1's auc: 0.676426\n", 981 | "[200]\tvalid_0's auc: 0.739755\tvalid_1's auc: 0.678309\n", 982 | "[250]\tvalid_0's auc: 0.748912\tvalid_1's auc: 0.678582\n", 983 | "[300]\tvalid_0's auc: 0.757664\tvalid_1's auc: 0.678602\n", 984 | "[350]\tvalid_0's auc: 0.764987\tvalid_1's auc: 0.677316\n", 985 | "[400]\tvalid_0's auc: 0.771289\tvalid_1's auc: 0.676125\n", 986 | "[450]\tvalid_0's auc: 0.77833\tvalid_1's auc: 0.675192\n", 987 | "[500]\tvalid_0's auc: 0.784456\tvalid_1's auc: 0.674098\n" 988 | 
] 989 | } 990 | ], 991 | "source": [ 992 | "param = {'num_leaves': 6, 'objective': 'binary', 'learning_rate': 0.1}\n", 993 | "param['metric'] = 'auc'\n", 994 | "\n", 995 | "q = lgb.train(param, train_set=lgb.Dataset(X_train, y_train), num_boost_round=500,\n", 996 | " valid_sets=[lgb.Dataset(X_train, y_train), lgb.Dataset(X_test, y_test)],\n", 997 | " verbose_eval=50)" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "markdown", 1002 | "metadata": {}, 1003 | "source": [ 1004 | "тут сразу качество лучше: ROC AUC на отложенной части (valid_1) ≈ 0.68; максимум достигается примерно к 300-й итерации, дальше качество на обучении (valid_0) продолжает расти, а на отложенной части падает (переобучение) — это согласуется с n_estimators=300 в итоговой модели" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "markdown", 1009 | "metadata": {}, 1010 | "source": [ 1011 | "### Обучение и формирование ответа" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 17, 1017 | "metadata": {}, 1018 | "outputs": [ 1019 | { 1020 | "data": { 1021 | "text/plain": [ 1022 | "LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n", 1023 | " learning_rate=0.1, max_depth=-1, min_child_samples=20,\n", 1024 | " min_child_weight=0.001, min_split_gain=0.0, n_estimators=300,\n", 1025 | " n_jobs=-1, num_leaves=6, objective=None, random_state=None,\n", 1026 | " reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,\n", 1027 | " subsample_for_bin=200000, subsample_freq=1)" 1028 | ] 1029 | }, 1030 | "execution_count": 17, 1031 | "metadata": {}, 1032 | "output_type": "execute_result" 1033 | } 1034 | ], 1035 | "source": [ 1036 | "model.fit(data_train, y)" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 18, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [ 1045 | "a = model.predict_proba(data_test)[:, 1] # probabilities of class 1" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 19, 1051 | "metadata": {}, 1052 | "outputs": [], 1053 | "source": [ 1054 | "# name the id column so the CSV header is 'orderid,proba' rather than ',proba'\n", "pd.DataFrame(a, columns=['proba'], index=ids.values).to_csv('dj1_01_.csv', index_label='orderid')" 1055 | ] 1056 | } 1057 | ], 1058 | "metadata": { 1059 | "kernelspec": { 1060 | "display_name": 
"Python 3", 1061 | "language": "python", 1062 | "name": "python3" 1063 | }, 1064 | "language_info": { 1065 | "codemirror_mode": { 1066 | "name": "ipython", 1067 | "version": 3 1068 | }, 1069 | "file_extension": ".py", 1070 | "mimetype": "text/x-python", 1071 | "name": "python", 1072 | "nbconvert_exporter": "python", 1073 | "pygments_lexer": "ipython3", 1074 | "version": "3.6.7" 1075 | } 1076 | }, 1077 | "nbformat": 4, 1078 | "nbformat_minor": 2 1079 | } 1080 | --------------------------------------------------------------------------------