├── hw01 ├── README.md └── Syntax.ipynb ├── hw02 ├── KNN.ipynb ├── NaiveBayes.ipynb ├── NumpyScipy.ipynb ├── Polynom.ipynb └── README.md ├── hw04 ├── DenseLayersCifar10.ipynb ├── README.md └── WideNeuralNetworkMNIST.ipynb ├── lecture01 └── L1_Introduction.pdf ├── lecture02 └── L2_MathAndSimpleMethods.pdf ├── lecture03 └── L3_LinearModels.pdf ├── lecture0405 ├── L4_DecisionTreesAndEnsembles.pdf └── L5_Metrics.pdf ├── lecture06 └── L6_Unsupervised.pdf ├── lecture0708 └── L7_8_Validation_and_features.pdf ├── lecture09 └── L9_NN_intro.pdf ├── lecture10 └── L10_CNN_and_RNN.pdf ├── seminar01 ├── SklearnFirstClassifiers.ipynb ├── SyntaxPart1.ipynb ├── SyntaxPart2.ipynb └── bioresponse.csv ├── seminar02 ├── 01_Numpy.ipynb ├── 02_Scipy.ipynb ├── 03_Matplotlib_Emeli_Intro.ipynb ├── Optional_Part_Naive_Bayes.ipynb ├── Optional_Part_kNN.ipynb ├── numpy_fancy_indexing.png └── numpy_indexing.png ├── seminar03 ├── Proximal_operator.pdf ├── bias_variance.pdf ├── pandas │ ├── .DS_Store │ ├── data_sample_example.tsv │ ├── dataset.tsv │ ├── iris_frame.csv │ ├── pandas_dataframe.ipynb │ ├── pandas_indexing_selection.ipynb │ └── updated_dataset.csv ├── pics │ ├── 3_datasets.png │ ├── bag_of_words.png │ ├── convex_function_3d.png │ ├── normalization.png │ ├── overfitting.png │ ├── regularization.png │ ├── sgd_vs_gd.png │ └── tfidf.png ├── sklearn.linear_model_part1.ipynb ├── sklearn.linear_model_part2.ipynb ├── sklearn │ ├── .DS_Store │ ├── sklearn-cross-validation.ipynb │ ├── sklearn-datasets.ipynb │ └── sklearn-knn-surfaces.ipynb └── text_classification.ipynb ├── seminar0405 ├── 01_TreeVisualization.ipynb ├── 02_Bagging.ipynb ├── 03_Boosting.ipynb ├── 04_ComparingRandomForestAndGradientBoosting.ipynb ├── 05_BiasVariance.ipynb ├── 06_ExtractFeaturesFromDecisionTree.ipynb ├── ClassificationMetrics.ipynb ├── HR.csv ├── RegressionMetrics.ipynb └── ml_bias_variance.png ├── seminar06 ├── 01_ClusteringIntroduction.ipynb ├── 02_ClusteringText.ipynb ├── 03_TopicModelling.ipynb ├── 
04_Word2Vec.ipynb ├── 05_CustomerSegmentation.ipynb ├── opencorpora_for_word2vec.txt └── transactions.csv.zip └── seminar07 ├── 1_approximation_theorem.ipynb ├── 2_simple_mnist.ipynb └── 3_dense_cifar10.ipynb /hw01/README.md: -------------------------------------------------------------------------------- 1 | ## Домашнее задание №1: синтаксис Python 2 | 3 | Задание приводится в ноутбуке, в каждой задаче есть контрольный код, вывод которого нужно ввести в [форму](https://goo.gl/forms/uPJiZVDlLVau4EbI2). 4 | 5 | Дедлайн по сдаче: 23:00 5 марта. 6 | -------------------------------------------------------------------------------- /hw01/Syntax.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Базовая часть" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Задание 1.1\n", 15 | "\n", 16 | "Напишите функцию, которая принимает список чисел my_list (количество товаров разных марок) и число $n$ и возвращает сумму всех чисел списка меньше либо равных $n$ (суммарное количество товаров, которых в наличие меньше либо равно $n$)." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "def function_1_1(my_list, n):\n", 26 | " # допишите код здесь\n", 27 | " return" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "function_1_1([1, 2, 3, 4, 5, 6, 3, 7], 7) == 31" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "function_1_1([1, 2, 3, 4, 4, 3, 2, 1], 7) == 20" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "function_1_1([1, 2, 3, 4, 4, 3, 2, 1], 3) == 12" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "function_1_1([1, 2, 3, 4, 5, 6, 3, 7], 5) == 18" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "function_1_1([1, 2, 3, 4, 5, 6, 3, 7], 3) == 9" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "function_1_1([1, 2, 3, 4, 5, 6, 3, 7], 0) == 0" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "#### Значение для формы" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "import random\n", 98 | "\n", 99 | "random.seed(42)\n", 100 | "print(function_1_1([random.randint(0, 1000) for _ in range(10000)], 700))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Задание 1.2\n", 108 | "\n", 109 | "Имеется список пар (фамилия, долг) $my\\_list$ и число $dept$. 
Нужно написать функцию, которая для каждой фамилии находит суммарный долг и выводит отсортированный список фамилий, у которых долг строго больше $dept$.\n", 110 | "\n", 111 | "Одна и та же фамилия может встречаться несколько раз, и тогда долги по ней суммируются." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "def function_1_2(my_list, dept):\n", 121 | " # допишите код здесь\n", 122 | " return" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 2), ('Sidorov', 4)], 1) == ['Ivanov', 'Petrov', 'Sidorov']" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 2), ('Sidorov', 4)], 2) == ['Ivanov', 'Sidorov']" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 2), ('Sidorov', 4)], 3) == ['Sidorov']" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 2), ('Sidorov', 4)], 4) == []" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 1), ('Sidorov', 4)], 2) == ['Sidorov']" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 2), ('Petrov', 4)], 2) == ['Ivanov', 
'Petrov']" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "function_1_2([('Ivanov', 1), ('Petrov', 2), ('Ivanov', 2), ('Petrov', 4)], 3) == ['Petrov']" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "#### Значение для формы" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "import random\n", 202 | "import numpy as np\n", 203 | "\n", 204 | "\n", 205 | "random.seed(42)\n", 206 | "print(','.join(function_1_2([\n", 207 | " (str(random.randint(1, 20)), random.randint(1, 1000) )\n", 208 | " for _ in range(1000)\n", 209 | "], 25000)))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "# Продвинутая часть" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Задание 2.1\n", 224 | "\n", 225 | "Напишите функцию, принимающую число $n$ и считающую $n!$, не используя модуль math\n", 226 | "\n", 227 | "$n! 
= 1 \\cdot 2 \\cdot 3 \\cdot \\dots \\cdot (n - 1) \\cdot n$\n", 228 | "\n", 229 | "Подсказка: воспользуйтесь циклами" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "def function_2_1(n):\n", 239 | " # допишите код здесь\n", 240 | " return" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "function_2_1(3) == 6" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "function_2_1(5) == 120" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "function_2_1(9) == 362880" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "function_2_1(13) == 6227020800" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "#### Значение для формы" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "print(function_2_1(100))" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Задание 2.2\n", 300 | "\n", 301 | "Есть два списка чисел: цены товаров в одном магазине и цены товаров в другом магазине.\n", 302 | "\n", 303 | "Нужно сравнить средние (среднее берётся по всем товарам) цены, минимальные цены и максимальные в магазинах. Если первое зачение меньше надо вернуть 'First', если второе, то 'Second', а если равны, то 'Equal'\n", 304 | "\n", 305 | "Например, function_2_2([1, 2, 3], [2, 2, 2])\n", 306 | "\n", 307 | "В первом магазине среднее значение 2, минимальное 1, максимальное 3. 
А во втором среднее 2, минимальное 2 и максимальное 2.\n", 308 | "\n", 309 | "Результаты сравнения $2 = 2,~1 < 2,~3 > 2$, поэтому нужно вернуть ('Equal', 'First', 'Second')" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "def function_2_2(list1, list2):\n", 319 | " return # допишите код здесь" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "function_2_2([1, 2, 3], [2, 2, 2]) == ('Equal', 'First', 'Second')" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "function_2_2([2, 2, 1], [4]) ==('First', 'First', 'First')" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "function_2_2([3, 5], [2, 4, 6]) == ('Equal', 'Second', 'First')" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "function_2_2([1, 7], [2, 4, 7]) == ('First', 'First', 'Equal')" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "#### Значение для формы" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "import random\n", 372 | "\n", 373 | "random.seed(47)\n", 374 | "print(','.join(function_2_2(\n", 375 | " [random.randint(0, 100000) for _ in range(1000)],\n", 376 | " [random.randint(0, 100000) for _ in range(1000)]\n", 377 | ")))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "# Сложная часть" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "## 
Задание 3.1\n", 392 | "\n", 393 | "Вам известны количество продаж разных товаров. Они представлены в виде списка, первое значение - число продаж первого товара, второе значение - второго, и так далее. Вы хотите посмотреть на список самых популярных товаров среди тех, у которых объём продаж не выше заданного числа.\n", 394 | "\n", 395 | "Напишите функцию, которая принимает список чисел my_list, число $n$ и число $k$ и возвращает сумму $k$ наибольших чисел среди всех чисел списка меньше либо равных $n$ ." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "def function_3_1(my_list, n, k):\n", 405 | " # допишите код здесь\n", 406 | " return" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "function_3_1([1, 2, 3, 4, 5, 6, 3, 7], 7, 1) == 7" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "function_3_1([1, 2, 3, 4, 5, 6, 3, 7], 7, 3) == 18" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "function_3_1([1, 2, 3, 4, 5, 6, 3, 7], 7, 5) == 25" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "function_3_1([1, 2, 3, 4, 5, 6, 3, 7], 7, 8) == 31" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "function_3_1([1, 2, 3, 4, 5, 6, 3, 7], 7, 10) == 31" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "function_3_1([1, 5, 3, 6, 5, 6, 3, 6], 6, 3) == 18" 461 | ] 462 | }, 463 | { 464 | 
"cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "function_3_1([1, 5, 3, 6, 5, 6, 3, 6], 5, 3) == 13" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "function_3_1([1, 5, 3, 6, 5, 6, 3, 6], 4, 3) == 7" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "function_3_1([1, 5, 3, 6, 5, 6, 3, 6], 6, 2) == 12" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "function_3_1([1, 5, 3, 6, 5, 6, 3, 6], 5, 2) == 10" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "function_3_1([1, 5, 3, 6, 5, 6, 3, 6], 4, 2) == 6" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": {}, 511 | "source": [ 512 | "#### Значение для формы" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "import random\n", 522 | "\n", 523 | "random.seed(42)\n", 524 | "print(function_3_1([random.randint(0, 1000) for _ in range(10000)], 700, 300))" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "## Задание 3.2\n", 532 | "\n", 533 | "Напишите списковое выражение, генерирующее все простые числа не больше заданного числа" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "def function_3_2(n):\n", 543 | " return [\n", 544 | " # допишите код здесь\n", 545 | " ]" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": {}, 552 | "outputs": 
[], 553 | "source": [ 554 | "function_3_2(3) == [2, 3]" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "function_3_2(10) == [2, 3, 5, 7]" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "function_3_2(20) == [2, 3, 5, 7, 11, 13, 17, 19]" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "#### Значение для формы" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "print(sum(function_3_2(10000)))" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "# По хардкору" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "## Задание 4.1\n", 603 | "\n", 604 | "Напишите функцию, которая по множеству, возвращает список всех его подмножеств" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "def function_4_1(my_set):\n", 614 | " # допишите код\n", 615 | " return " 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "def check_4_1(my_set, function_res):\n", 625 | " if len(function_res) != len(set(map(frozenset, function_res))):\n", 626 | " return False\n", 627 | " if len(function_res) != 2 ** len(my_set):\n", 628 | " return False\n", 629 | " return all(x <= my_set for x in function_res)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "check_4_1(set(range(10)), function_4_1(set(range(10))))" 639 | ] 640 | }, 641 | { 642 | "cell_type": 
"code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "check_4_1(set(range(1)), function_4_1(set(range(1))))" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "check_4_1(set(range(20)), function_4_1(set(range(20))))" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "#### Значение для формы" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "print(sum([\n", 673 | " len(subset) - sum(subset) + sum(subset) ** 2\n", 674 | " for subset in function_4_1(set(range(2, 21, 3)))\n", 675 | "]))" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "## Задание 4.2\n", 683 | "\n", 684 | "Напишите функцию раскладывающее число на простые множители\n", 685 | "\n", 686 | "по числу $n$ нужно вернуть список пар $(p_i, c_i)$ такой, что $\\prod_i~p_i^{c_i} = n$" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": null, 692 | "metadata": {}, 693 | "outputs": [], 694 | "source": [ 695 | "def is_prime(n):\n", 696 | " return n > 1 and all(n % i != 0 for i in range(2, n))\n", 697 | "\n", 698 | "def function_4_2(n):\n", 699 | " return [\n", 700 | " # допишите код здесь\n", 701 | " ]" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "def check_4_2(n, my_list):\n", 711 | " prod = 1\n", 712 | " for p, c in my_list:\n", 713 | " if not is_prime(p):\n", 714 | " return False\n", 715 | " prod *= p ** c\n", 716 | " return prod == n" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "check_4_2(10, function_4_2(10))" 726 | ] 727 | }, 
728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "check_4_2(11690, function_4_2(11690))" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "check_4_2(254242, function_4_2(254242))" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "#### Значение для формы" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "print(sum(\n", 760 | " (c + 1) ** (p - 1) * c - p\n", 761 | " for p, c in function_4_2(7648)\n", 762 | "))" 763 | ] 764 | } 765 | ], 766 | "metadata": { 767 | "kernelspec": { 768 | "display_name": "dmia", 769 | "language": "python", 770 | "name": "dmia" 771 | }, 772 | "language_info": { 773 | "codemirror_mode": { 774 | "name": "ipython", 775 | "version": 3 776 | }, 777 | "file_extension": ".py", 778 | "mimetype": "text/x-python", 779 | "name": "python", 780 | "nbconvert_exporter": "python", 781 | "pygments_lexer": "ipython3", 782 | "version": "3.6.6" 783 | } 784 | }, 785 | "nbformat": 4, 786 | "nbformat_minor": 2 787 | } 788 | -------------------------------------------------------------------------------- /hw02/KNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Реализуем метод predict_proba для KNN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Ниже реализован класс KNeighborsClassifier, который для поиска ближайших соседей использует sklearn.neighbors.NearestNeighbors\n", 15 | "\n", 16 | "Требуется реализовать метод predict_proba для вычисления ответа классификатора." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "\n", 27 | "from sklearn.base import BaseEstimator, ClassifierMixin\n", 28 | "from sklearn.neighbors import NearestNeighbors\n", 29 | "\n", 30 | "\n", 31 | "class KNeighborsClassifier(BaseEstimator, ClassifierMixin):\n", 32 | " '''\n", 33 | " Класс, который позволит нам изучить KNN\n", 34 | " '''\n", 35 | " def __init__(self, n_neighbors=5, weights='uniform', \n", 36 | " metric='minkowski', p=2):\n", 37 | " '''\n", 38 | " Инициализируем KNN с несколькими стандартными параметрами\n", 39 | " '''\n", 40 | " assert weights in ('uniform', 'distance')\n", 41 | " \n", 42 | " self.n_neighbors = n_neighbors\n", 43 | " self.weights = weights\n", 44 | " self.metric = metric\n", 45 | " \n", 46 | " self.NearestNeighbors = NearestNeighbors(\n", 47 | " n_neighbors = n_neighbors,\n", 48 | " metric = self.metric)\n", 49 | " \n", 50 | " def fit(self, X, y):\n", 51 | " '''\n", 52 | " Используем sklearn.neighbors.NearestNeighbors \n", 53 | " для запоминания обучающей выборки\n", 54 | " и последующего поиска соседей\n", 55 | " '''\n", 56 | " self.NearestNeighbors.fit(X)\n", 57 | " self.n_classes = len(np.unique(y))\n", 58 | " self.y = y\n", 59 | " \n", 60 | " def predict_proba(self, X, use_first_zero_distant_sample=True):\n", 61 | " '''\n", 62 | " Чтобы реализовать этот метод, \n", 63 | " изучите работу sklearn.neighbors.NearestNeighbors'''\n", 64 | " \n", 65 | " # получим здесь расстояния до соседей distances и их метки\n", 66 | " \n", 67 | " if self.weights == 'uniform':\n", 68 | " w = np.ones(distances.shape)\n", 69 | " else:\n", 70 | " # чтобы не делить на 0, \n", 71 | " # добавим небольшую константу, например 1e-3\n", 72 | " w = 1/(distances + 1e-3)\n", 73 | "\n", 74 | " # реализуем вычисление предсказаний:\n", 75 | " # выбрав один объект, для каждого класса посчитаем\n", 76 | " # суммарный вес голосующих за него 
объектов\n", 77 | " # затем нормируем эти веса на их сумму\n", 78 | " # и вернем это как предсказание KNN\n", 79 | " return probs" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "# Загрузим данные и обучим классификатор" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 2, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "from sklearn.datasets import load_iris\n", 96 | "X, y = load_iris(return_X_y=True)\n", 97 | "\n", 98 | "knn = KNeighborsClassifier(weights='distance')\n", 99 | "knn.fit(X, y)\n", 100 | "prediction = knn.predict_proba(X, )" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Поскольку мы используем одну и ту же выборку для обучения и предсказания, ближайшим соседом любого объекта будет он же сам. В качестве упражнения предлагаю реализовать метод transform, который реализует получение предсказаний для обучающей выборки, но для каждого объекта не будет учитывать его самого.\n", 108 | "\n", 109 | "Посмотрим, в каких объектах max(prediction) != 1:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 3, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "[ 56 68 70 72 77 83 106 110 119 123 127 133 134 138 146]\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "inds = np.arange(len(prediction))[prediction.max(1) != 1]\n", 127 | "print(inds)\n", 128 | "\n", 129 | "# [ 56 68 70 72 77 83 106 110 119 123 127 133 134 138 146]" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "Несколько примеров, на которых можно проверить правильность реализованного метода:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "68 [0. 
0.99816311 0.00183689]\n", 149 | "77 [0. 0.99527902 0.00472098]\n", 150 | "146 [0. 0.00239145 0.99760855]\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "for i in 1, 4, -1:\n", 156 | " print(inds[i], prediction[inds[i]])\n", 157 | "\n", 158 | "# 68 [0. 0.99816311 0.00183689]\n", 159 | "# 77 [0. 0.99527902 0.00472098]\n", 160 | "# 146 [0. 0.00239145 0.99760855]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "**Примечание:** отличие в третьем-четвертом знаке после запятой в тестах не должно повлиять на сдачу задания" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Ответы для формы" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "В форму требуется ввести max(prediction) для объекта. Если метод реализован верно, то ячейка ниже распечатает ответы, которые нужно ввести в форму" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "for i in 56, 83, 127:\n", 191 | " print('{:.2f}'.format(max(prediction[i])))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.6.5" 219 | } 220 | }, 221 | "nbformat": 4, 222 | "nbformat_minor": 2 223 | } 224 | -------------------------------------------------------------------------------- /hw02/NaiveBayes.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Реализуем методы для наивного байеса" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Сгенерируем выборку, в которой каждый признак имеет некоторое своё распределение, параметры которого отличаются для каждого класса. Затем реализуем несколько методов для класса, который уже частично написан ниже:\n", 15 | "- метод predict\n", 16 | "- метод \\_find\\_expon\\_params и \\_get\\_expon\\_density для экспоненциального распределения\n", 17 | "- метод \\_find\\_norm\\_params и \\_get\\_norm\\_probability для биномиального распределения\n", 18 | "\n", 19 | "Для имплементации \\_find\\_something\\_params изучите документацию функций для работы с этими распределениями в scipy.stats и используйте предоставленные там методы." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "import scipy\n", 30 | "import scipy.stats" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Сформируем параметры генерации для трех датасетов" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "((5000, 1), (5000,), ['bernoulli'])" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "func_params_set0 = [(scipy.stats.bernoulli, [dict(p=0.1), dict(p=0.5)]),\n", 58 | " ]\n", 59 | "\n", 60 | "func_params_set1 = [(scipy.stats.bernoulli, [dict(p=0.1), dict(p=0.5)]),\n", 61 | " (scipy.stats.expon, [dict(scale=1), dict(scale=0.3)]),\n", 62 | " ]\n", 63 | "\n", 64 | "func_params_set2 = [(scipy.stats.bernoulli, [dict(p=0.1), 
dict(p=0.5)]),\n", 65 | " (scipy.stats.expon, [dict(scale=1), dict(scale=0.3)]),\n", 66 | " (scipy.stats.norm, [dict(loc=0, scale=1), dict(loc=1, scale=2)]),\n", 67 | " ]\n", 68 | "\n", 69 | "def generate_dataset_for_nb(func_params_set=[], size = 2500, random_seed=0):\n", 70 | " '''\n", 71 | " Генерирует выборку с заданными параметрами распределений P(x|y).\n", 72 | " Число классов задается длиной списка с параметрами.\n", 73 | " Возвращает X, y, список с названиями распределений\n", 74 | " '''\n", 75 | " np.random.seed(random_seed)\n", 76 | "\n", 77 | " X = []\n", 78 | " names = []\n", 79 | " for func, params in func_params_set:\n", 80 | " names.append(func.name)\n", 81 | " f = []\n", 82 | " for i, param in enumerate(params):\n", 83 | " f.append(func.rvs(size=size, **param))\n", 84 | " f = np.concatenate(f).reshape(-1,1)\n", 85 | " X.append(f)\n", 86 | "\n", 87 | " X = np.concatenate(X, 1)\n", 88 | " y = np.array([0] * size + [1] * size)\n", 89 | "\n", 90 | " shuffle_inds = np.random.choice(range(len(X)), size=len(X), replace=False)\n", 91 | " X = X[shuffle_inds]\n", 92 | " y = y[shuffle_inds]\n", 93 | "\n", 94 | " return X, y, names \n", 95 | "\n", 96 | "X, y, distrubution_names = generate_dataset_for_nb(func_params_set0)\n", 97 | "X.shape, y.shape, distrubution_names" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 3, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from collections import defaultdict\n", 107 | "from sklearn.base import BaseEstimator, ClassifierMixin\n", 108 | "\n", 109 | "class NaiveBayes(BaseEstimator, ClassifierMixin):\n", 110 | " '''\n", 111 | " Реализация наивного байеса, которая помимо X, y\n", 112 | " принимает на вход во время обучения \n", 113 | " виды распределений значений признаков\n", 114 | " '''\n", 115 | " def __init__(self):\n", 116 | " pass\n", 117 | " \n", 118 | " def _find_bernoulli_params(self, x):\n", 119 | " '''\n", 120 | " метод возвращает найденный параметр `p`\n", 121 | " 
распределения scipy.stats.bernoulli\n", 122 | " '''\n", 123 | " return dict(p=np.mean(x))\n", 124 | " \n", 125 | " def _get_bernoulli_probability(self, x, params):\n", 126 | " '''\n", 127 | " метод возвращает вероятность x для данных\n", 128 | " параметров распределения\n", 129 | " '''\n", 130 | " return scipy.stats.bernoulli.pmf(x, **params)\n", 131 | "\n", 132 | " def _find_expon_params(self, x):\n", 133 | " # нужно определить параметры распределения\n", 134 | " # и вернуть их\n", 135 | " pass\n", 136 | " \n", 137 | " def _get_expon_density(self, x, params):\n", 138 | " # нужно вернуть плотность распределения в x\n", 139 | " pass\n", 140 | "\n", 141 | " def _find_norm_params(self, x):\n", 142 | " # нужно определить параметры распределения\n", 143 | " # и вернуть их\n", 144 | " pass\n", 145 | " \n", 146 | " def _get_norm_density(self, x, params):\n", 147 | " # нужно вернуть плотность распределения в x\n", 148 | " pass\n", 149 | "\n", 150 | " def _get_params(self, x, distribution):\n", 151 | " '''\n", 152 | " x - значения из распределения,\n", 153 | " distribution - название распределения в scipy.stats\n", 154 | " '''\n", 155 | " if distribution == 'bernoulli':\n", 156 | " return self._find_bernoulli_params(x)\n", 157 | " elif distribution == 'expon':\n", 158 | " return self._find_expon_params(x)\n", 159 | " elif distribution == 'norm':\n", 160 | " return self._find_norm_params(x)\n", 161 | " else:\n", 162 | " raise NotImplementedError('Unknown distribution')\n", 163 | " \n", 164 | " def _get_probability_or_density(self, x, distribution, params):\n", 165 | " '''\n", 166 | " x - значения,\n", 167 | " distribution - название распределения в scipy.stats,\n", 168 | " params - параметры распределения\n", 169 | " '''\n", 170 | " if distribution == 'bernoulli':\n", 171 | " return self._get_bernoulli_probability(x, params)\n", 172 | " elif distribution == 'expon':\n", 173 | " return self._get_expon_density(x, params)\n", 174 | " elif distribution == 'norm':\n", 175 | " 
return self._get_norm_density(x, params)\n", 176 | " else:\n", 177 | " raise NotImplementedError('Unknown distribution')\n", 178 | "\n", 179 | " def fit(self, X, y, distrubution_names):\n", 180 | " '''\n", 181 | " X - обучающая выборка,\n", 182 | " y - целевая переменная,\n", 183 | " feature_distributions - список названий распределений, \n", 184 | " по которым предположительно распределны значения P(x|y)\n", 185 | " ''' \n", 186 | " assert X.shape[1] == len(distrubution_names)\n", 187 | " assert set(y) == {0, 1}\n", 188 | " self.n_classes = len(np.unique(y))\n", 189 | " self.distrubution_names = distrubution_names\n", 190 | " \n", 191 | " self.y_prior = [(y == j).mean() for j in range(self.n_classes)]\n", 192 | " \n", 193 | " self.distributions_params = defaultdict(dict)\n", 194 | " for i in range(X.shape[1]):\n", 195 | " distribution = self.distrubution_names[i]\n", 196 | " for j in range(self.n_classes):\n", 197 | " values = X[y == j, i]\n", 198 | " self.distributions_params[j][i] = \\\n", 199 | " self._get_params(values, distribution)\n", 200 | " \n", 201 | " return self.distributions_params\n", 202 | " \n", 203 | " def predict(self, X):\n", 204 | " '''\n", 205 | " X - тестовая выборка\n", 206 | " '''\n", 207 | " assert X.shape[1] == len(self.distrubution_names)\n", 208 | " \n", 209 | " # нужно реализовать подсчет аргмаксной формулы, по которой \n", 210 | " # наивный байес принимает решение о принадлежности объекта классу\n", 211 | " # и применить её для каждого объекта в X\n", 212 | " #\n", 213 | " # примечание: обычно подсчет этой формулы реализуют через \n", 214 | " # её логарифмирование, то есть, через сумму логарифмов вероятностей, \n", 215 | " # поскольку перемножение достаточно малых вероятностей будет вести\n", 216 | " # к вычислительным неточностям\n", 217 | " \n", 218 | " return preds" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "Проверим результат на примере первого распределения" 226 | ] 227 
| }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 4, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "defaultdict(dict, {0: {0: {'p': 0.1128}}, 1: {0: {'p': 0.482}}})" 237 | ] 238 | }, 239 | "execution_count": 4, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "nb = NaiveBayes()\n", 246 | "nb.fit(X, y, ['bernoulli'])" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 5, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "0.6045\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "from sklearn.metrics import f1_score\n", 264 | "\n", 265 | "prediction = nb.predict(X)\n", 266 | "score = f1_score(y, prediction)\n", 267 | "print('{:.2f}'.format(score))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# Ответы для формы" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Ответом для формы должны служить числа, которые будут выведены ниже. Все ответы проверены: в этих примерах получается одинаковый результат и через сумму логарифмов, и через произведение вероятностей." 
282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "scipy.stats.bernoulli.name\n", 291 | "\n", 292 | "for fps in (func_params_set0 * 2,\n", 293 | " func_params_set1, \n", 294 | " func_params_set2):\n", 295 | " \n", 296 | "\n", 297 | " X, y, distrubution_names = generate_dataset_for_nb(fps)\n", 298 | " \n", 299 | " nb = NaiveBayes()\n", 300 | " nb.fit(X, y, distrubution_names)\n", 301 | " prediction = nb.predict(X)\n", 302 | " score = f1_score(y, prediction)\n", 303 | " print('{:.2f}'.format(score))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 3", 317 | "language": "python", 318 | "name": "python3" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 3 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython3", 330 | "version": "3.6.5" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 2 335 | } 336 | -------------------------------------------------------------------------------- /hw02/NumpyScipy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import scipy" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Задача 1\n", 18 | "\n", 19 | "Дан массив $arr$, требуется для каждой позиции $i$ найти номер элемента $arr_i$ в массиве $arr$, отсортированном по убыванию. Все значения массива $arr$ различны." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "def function_1(arr):\n", 29 | " return #TODO" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "(function_1([1, 2, 3]) == [2, 1, 0]).all()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "(function_1([-2, 1, 0]) == [2, 0, 1]).all()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "(function_1([-2, 1, 0, -1]) == [3, 0, 1, 2]).all()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "**Значение для формы**" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "np.random.seed(42)\n", 73 | "arr = function_1(np.random.uniform(size=1000000))\n", 74 | "print(arr[7] + arr[42] + arr[445677] + arr[53422])" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Задача 2\n", 82 | "\n", 83 | "Дана матрица $X$, нужно найти след матрицы $X X^T$" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def function_2(matrix):\n", 93 | " return #TODO" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "function_2(np.array([\n", 103 | " [1, 2],\n", 104 | " [3, 4]\n", 105 | "])) == 30" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "function_2(np.array([\n", 115 | " [1, 0],\n", 116 | " [0, 1]\n", 117 | "])) == 2" 118 | ] 
119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "function_2(np.array([\n", 127 | " [2, 0],\n", 128 | " [0, 2]\n", 129 | "])) == 8" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "function_2(np.array([\n", 139 | " [2, 1, 1],\n", 140 | " [1, 2, 1]\n", 141 | "])) == 12" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "**Значение для формы**" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "np.random.seed(42)\n", 158 | "arr1 = np.random.uniform(size=(1, 100000))\n", 159 | "arr2 = np.random.uniform(size=(100000, 1))\n", 160 | "print(int(function_2(arr1) + function_2(arr2)))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### Задача 3\n", 168 | "\n", 169 | "Дан набор точек с координатам точек points_x и points_y. Нужно найти такую точку $p$ с нулевой координатой $y$ (то есть с координатами вида $(x, 0)$), что расстояние от неё до самой удалённой точки из исходного набора (растояние евклидово) минимально" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "def function_3(points_x, points_y):\n", 179 | " return #TODO" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "np.abs(function_3([0, 2], [1, 1]) - 1.) < 1e-3" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "np.abs(function_3([0, 2, 4], [1, 1, 1]) - 2.) 
< 1e-3" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "np.abs(function_3([0, 4, 4], [1, 1, 1]) - 2.) < 1e-3" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "**Значение для формы**" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "np.random.seed(42)\n", 223 | "arr1 = np.random.uniform(-56, 100, size=100000)\n", 224 | "arr2 = np.random.uniform(-100, 100, size=100000)\n", 225 | "print(int(round((function_3(arr1, arr2) * 100))))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "dmia", 239 | "language": "python", 240 | "name": "dmia" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.6.6" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 2 257 | } 258 | -------------------------------------------------------------------------------- /hw02/Polynom.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# Задание\n", 20 | "\n", 21 | "Допишите реализацию класса для обучения полиномиальной регресии, то есть по точкам $x_1, x_2, 
\\dots, x_n$ и $y_1, y_2, \\dots, y_n$ и заданному числу $d$ решить оптимизационную задачу:\n", 22 | "\n", 23 | "$$ \\sum_{i=1}^n (~f(x_i) - y_i~)^2 \\min_f,$$ где f – полином степени не выше $d$." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "**Примечание:** в этом задании оптимизационную задачу можно решать как с помощью scipy.optimize, так и сводя задачу к линейной регрессии и используя готовую формулу весов из нее. Предпочтительней второй путь, но первый вариант проще, и его можно использовать для проверки. Независимо от того, как вы решите эту задачу, сдавайте в форму ответ, в котором будете больше всего уверенны.\n", 31 | "\n", 32 | "**Предупреждение:** проверка этого задания **не предполагает**, что вы решите его с помощью SGD, т.к. получить таким способом тот же ответ *очень* сложно." 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "class PolynomialRegression(object):\n", 42 | " \n", 43 | " def __init__(self, max_degree=1):\n", 44 | " self.max_degree = max_degree\n", 45 | " \n", 46 | " def fit(self, points_x, points_y):\n", 47 | " # insert your code here to fit the model\n", 48 | " \n", 49 | " return self\n", 50 | " \n", 51 | " def predict(self, points_x):\n", 52 | " # insert your code here to predict the values\n", 53 | " \n", 54 | " return values" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "np.random.seed(42)\n", 64 | "points_x = np.random.uniform(-10, 10, size=10)\n", 65 | "# we use list comprehesion but think about how to write it using np.array operations\n", 66 | "points_y = np.array([4 - x + x ** 2 + 0.1 * x ** 3 + np.random.uniform(-20, 20) for x in points_x])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAloAAAEyCAYAAAAiFH5AAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGWhJREFUeJzt3X+MXXd55/H3s2ODRhm00zZ0GjthDVrX0ibZtetR6IoumtlAHSJETFSFWBVNgF2TVVm1WuQWQ1Sipgha80OqaGnNJiIsNBNEHOONQk02dDZUWrOx42ycX9M6aVI8jpKSxAkDI2qbZ/+YM2HGzPGM58537px73y9pNPd+z7nnPPPoXOeT8z3n3shMJEmStPT+RbsLkCRJ6lQGLUmSpEIMWpIkSYUYtCRJkgoxaEmSJBVi0JIkSSrEoCVJklSIQUuSJKkQg5YkSVIhq+ZbISJuBd4JPJ+Zl1RjdwAbqlX6gROZuTEi1gGPA2PVsgOZecN8+zj//PNz3bp151x8p/jhD3/Ieeed1+4yViR7U8/e1LM39exNPXtTz97MdujQoe9n5usXsu68QQv4EvB54MvTA5n5nunHEfEZ4OUZ6z+ZmRsXVuqUdevWcfDgwXN5SUcZHR1laGio3WWsSPamnr2pZ2/q2Zt69qaevZktIp5Z6LrzBq3MvL86UzXXjgK4BviPC92hJElSt4iFfKl0FbTunp46nDH+VuCzmTk4Y71Hgb8DXgFuzMzv1GxzO7AdYGBgYPPIyMhi/4bGm5iYoK+vr91lrEj2pp69qWdv6tmbevamnr2ZbXh4+NB09pnPQqYOz2YbcPuM588Cb8jMFyJiM7A3Ii7OzFfOfGFm7gZ2AwwODmY3n5L0lGw9e1PP3tSzN/XsTT17U8/eLN6i7zqMiFXA1cAd02OZ+ePMfKF6fAh4EvjlVouUJElqolY+3uFtwBOZeWx6ICJeHxE91eM3AeuBp1orUZIkqZnmDVoRcTvwf4ANEXEsIj5QLbqW2dOGAG8FHo6Ih4CvAzdk5otLWbAkSVJTLOSuw20149fPMXYncGfrZUmSJDVfqxfDS5Iktd3ew+Ps2j/G8ROTrOnvZceWDWzdtLbdZRm0JElSs+09PM7OPUeYPHkagPETk+zccwSg7WHL7zqUJEmNtmv/2Ksha9rkydPs2j9W84rlY9CSJEmNdvzE5DmNLyeDliRJarQ1/b3nNL6cDFqSJKnRdmzZQO/qnlljvat72LFlQ5sq+ikvhpckSY02fcG7dx1KkiQVsHXT2hURrM7k1KEkSVIhBi1JkqRCDFqSJEmFGLQkSZIKMWhJkiQVYtCSJEkqxKAlSZJUiEFLkiSpEIOWJElSIQYtSZKkQgxakiRJhRi0JEmSCjFoSZIkFWLQkiRJKsSgJUmSVIhBS5IkqRCDliRJUiEGLUmSpELmDVoRcWtEPB8Rj8wYuykixiPioernyhnLdkbE0YgYi4gtpQqXJEla6RZyRutLwBVzjH8uMzdWP/cARMS/Aa4FLq5e8+cR0bNUxUqSJDXJvEErM+8HXlzg9q4CRjLzx5n5D8BR4LIW6pMkSWqsyMz5V4pYB9ydmZdUz28CrgdeAQ4CH87MlyLi88CBzPxKtd4twDcz8+tzbHM7sB1gYGBg88jIyBL8Oc00MTFBX19fu8tYkexNPXtTz97Uszf17E09ezPb8PDwocwcXMi6qxa5jy8ANwNZ/f4M8P5z2UBm7gZ2AwwODubQ0NAiS2m+0dFRuvnvPxt7U8/e1LM39exNPXtTz94s3qLuOszM5zLzdGb+BPgiP50eHAcumrHqhdWYJElS11lU0IqIC2Y8fTcwfUfiPuDaiHhtRLwRWA/839ZKlCRJaqZ5pw4j4nZgCDg/Io4BHweGImIjU1OHTwMfBMjMRyPia8BjwCngtzPzdJnSJUmSVrZ5g1Zmbptj+JazrP8J4BOtFCVJktQJ/GR4SZKkQgxakiRJhRi
0JEmSCjFoSZIkFWLQkiRJKsSgJUmSVIhBS5IkqRCDliRJUiEGLUmSpEIMWpIkSYUYtCRJkgoxaEmSJBVi0JIkSSrEoCVJklSIQUuSJKkQg5YkSVIhBi1JkqRCDFqSJEmFGLQkSZIKMWhJkiQVYtCSJEkqxKAlSZJUiEFLkiSpEIOWJElSIQYtSZKkQgxakiRJhcwbtCLi1oh4PiIemTG2KyKeiIiHI+KuiOivxtdFxGREPFT9/EXJ4iVJklayhZzR+hJwxRlj9wKXZOa/Bf4O2Dlj2ZOZubH6uWFpypQkSWqeeYNWZt4PvHjG2Lcy81T19ABwYYHaJEmSGi0yc/6VItYBd2fmJXMs+5/AHZn5lWq9R5k6y/UKcGNmfqdmm9uB7QADAwObR0ZGFvcXdICJiQn6+vraXcaKZG/q2Zt69qaevalnb+rZm9mGh4cPZebgQtZd1cqOIuJjwCngq9XQs8AbMvOFiNgM7I2IizPzlTNfm5m7gd0Ag4ODOTQ01EopjTY6Oko3//1nY2/q2Zt69qaevalnb+rZm8Vb9F2HEXE98E7gN7M6LZaZP87MF6rHh4AngV9egjolSZIaZ1FBKyKuAH4PeFdm/mjG+Osjoqd6/CZgPfDUUhQqSZLUNPNOHUbE7cAQcH5EHAM+ztRdhq8F7o0IgAPVHYZvBf4wIk4CPwFuyMwX59ywJElSh5s3aGXmtjmGb6lZ907gzlaLkiRJ6gR+MrwkSVIhBi1JkqRCDFqSJEmFGLQkSZIKMWhJkiQVYtCSJEkqxKAlSZJUiEFLkiSpEIOWJElSIQYtSZKkQgxakiRJhRi0JEmSCjFoSZIkFWLQkiRJKsSgJUmSVIhBS5IkqRCDliRJUiEGLUmSpEIMWpIkSYUYtCRJkgoxaEmSJBVi0JIkSSrEoCVJklSIQUuSJKkQg5YkSVIhBi1JkqRCFhS0IuLWiHg+Ih6ZMfbzEXFvRPx99fvnqvGIiD+NiKMR8XBE/Eqp4iVJklayhZ7R+hJwxRljHwHuy8z1wH3Vc4B3AOurn+3AF1ovU5IkqXkWFLQy837gxTOGrwJuqx7fBmydMf7lnHIA6I+IC5aiWEmSpCaJzFzYihHrgLsz85Lq+YnM7K8eB/BSZvZHxN3ApzLzb6tl9wG/n5kHz9jedqbOeDEwMLB5ZGRkaf6iBpqYmKCvr6/dZaxI9qaevalnb+rZm3r2pp69mW14ePhQZg4uZN1VS7HDzMyIWFhi++lrdgO7AQYHB3NoaGgpSmmk0dFRuvnvPxt7U8/e1LM39exNPXtTz94sXit3HT43PSVY/X6+Gh8HLpqx3oXVmCRJUldpJWjtA66rHl8HfGPG+G9Vdx/+KvByZj7bwn4kSZIaaUFThxFxOzAEnB8Rx4CPA58CvhYRHwCeAa6pVr8HuBI4CvwIeN8S1yxJktQICwpambmtZtHlc6ybwG+3UpQkSVIn8JPhJUmSCjFoSZIkFWLQkiRJKsSgJUmSVMiSfGCpJElqtr2Hx9m1f4zjJyZZ09/Lji0b2LppbbvLajyDliRJXW7v4XF27jnC5MnTAIyfmGTnniMAhq0WOXUoSVKX27V/7NWQNW3y5Gl27R9rU0Wdw6AlSVKXO35i8pzGtXAGLUmSutya/t5zGtfCGbQkSepyO7ZsoHd1z6yx3tU97NiyoU0VdQ4vhpckqctNX/DuXYdLz6AlSZLYummtwaoApw4lSZIKMWhJkiQVYtCSJEkqxKAlSZJUiEFLkiSpEIOWJElSIQYtSZKkQgxakiRJhRi0JEmSCjFoSZIkFWLQkiRJKsSgJUmSVIhBS5IkqRCDliRJUiGrFvvCiNgA3DFj6E3AHwD9wH8G/qka/2hm3rPoCiVJkhpq0UErM8eAjQAR0QOMA3cB7wM+l5mfXpIKJUmSGmqppg4vB57MzGeWaHuSJEmNF5nZ+kYibgUezMzPR8RNwPXAK8BB4MOZ+dIcr9kObAcYGBjYPDIy0nIdTTUxMUFfX1+
7y1iR7E09e1PP3tSzN/XsTT17M9vw8PChzBxcyLotB62IeA1wHLg4M5+LiAHg+0ACNwMXZOb7z7aNwcHBPHjwYEt1NNno6ChDQ0PtLmNFsjf17E09e1PP3tSzN/XszWwRseCgtRRTh+9g6mzWcwCZ+Vxmns7MnwBfBC5bgn1IkiQ1zlIErW3A7dNPIuKCGcveDTyyBPuQJElqnEXfdQgQEecBbwc+OGP4TyJiI1NTh0+fsUySJKlrtBS0MvOHwC+cMfbeliqSJEnqEH4yvCRJUiEGLUmSpEIMWpIkSYUYtCRJkgoxaEmSJBVi0JIkSSrEoCVJklSIQUuSJKkQg5YkSVIhBi1JkqRCDFqSJEmFGLQkSZIKMWhJkiQVYtCSJEkqxKAlSZJUiEFLkiSpEIOWJElSIQYtSZKkQgxakiRJhRi0JEmSCjFoSZIkFWLQkiRJKsSgJUmSVIhBS5IkqRCDliRJUiEGLUmSpEJWtbqBiHga+AFwGjiVmYMR8fPAHcA64Gngmsx8qdV9SZIkNclSndEazsyNmTlYPf8IcF9mrgfuq55LkiR1lVJTh1cBt1WPbwO2FtqPJEnSihWZ2doGIv4BeAlI4C8zc3dEnMjM/mp5AC9NP5/xuu3AdoCBgYHNIyMjLdXRZBMTE/T19bW7jBXJ3tSzN/XsTT17U8/e1LM3sw0PDx+aMYt3Vi1fowX8WmaOR8QvAvdGxBMzF2ZmRsTPpLnM3A3sBhgcHMyhoaElKKWZRkdH6ea//2zsTT17U8/e1LM39exNPXuzeC1PHWbmePX7eeAu4DLguYi4AKD6/Xyr+5EkSWqaloJWRJwXEa+bfgz8OvAIsA+4rlrtOuAbrexHkiSpiVqdOhwA7pq6DItVwF9l5l9HxAPA1yLiA8AzwDUt7keSJKlxWgpamfkU8O/mGH8BuLyVbUuS1Iq9h8fZtX+M4ycmWdPfy44tG9i6aW27y1KXWYqL4SVJWlH2Hh5n554jTJ48DcD4iUl27jkCYNjSsvIreCRJHWfX/rFXQ9a0yZOn2bV/rE0VqVsZtCRJHef4iclzGpdKMWhJkjrOmv7ecxqXSjFoSZI6zo4tG+hd3TNrrHd1Dzu2bGhTRepWXgwvSeo40xe8e9eh2s2gJUnqSFs3rTVYqe2cOpQkSSrEoCVJklSIQUuSJKkQg5YkSVIhBi1JkqRCDFqSJEmFGLQkSZIKMWhJkiQVYtCSJEkqxKAlSZJUiEFLkiSpEIOWJElSIQYtSZKkQgxakiRJhRi0JEmSCjFoSZIkFWLQkiRJKsSgJUmSVIhBS5IkqZBFB62IuCgi/iYiHouIRyPid6rxmyJiPCIeqn6uXLpyJUmSmmNVC689BXw4Mx+MiNcBhyLi3mrZ5zLz062XJ0mS1FyLDlqZ+SzwbPX4BxHxOLB2qQqTJElqusjM1jcSsQ64H7gE+G/A9cArwEGmznq9NMdrtgPbAQYGBjaPjIy0XEdTTUxM0NfX1+4yViR7U8/e1LM39exNPXtTz97MNjw8fCgzBxeybstBKyL6gP8NfCIz90TEAPB9IIGbgQsy8/1n28bg4GAePHiwpTqabHR0lKGhoXaXsSLZm3r2pp69qWdv6tmbevZmtohYcNBq6a7DiFgN3Al8NTP3AGTmc5l5OjN/AnwRuKyVfUiSJDVVK3cdBnAL8HhmfnbG+AUzVns38Mjiy5MkSWquVu46fAvwXuBIRDxUjX0U2BYRG5maOnwa+GBLFUqSJDVUK3cd/i0Qcyy6Z/HlSJIkdQ4/GV6SJKkQg5YkSVIhBi1JkqRCDFqSJEmFtHLXoSRJxe09PM6u/WMcPzHJmv5edmzZwNZNfuObmsGgJUlasfYeHmfnniNMnjwNwPiJSXbuOQJg2FIjOHUoSVqxdu0fezVkTZs8eZpd+8faVJF0brrijJannSWpmY6fmDyncWml6fig1emnnW/ce4Tbv/s9TmfSE8G2N1/EH229tN1lSdKSWNPfy/gcoWpNf28
bqpHOXcdPHXbyaecb9x7hKwf+kdOZAJzO5CsH/pEb9x5pc2WStDR2bNlA7+qeWWO9q3vYsWVDmyqSzk3HB61OPu18+3e/d07jktQ0Wzet5ZNXX8ra/l4CWNvfyyevvrQjZiTUHTp+6rCTTztPn8la6LgkNdHWTWsNVmqsjj+j1cmnnXtiru/0rh+XJEnLq+ODViefdt725ovOaVySJC2vjp86hM497Tx9d6F3HUqStDJ1RdDqZH+09VKDlSRJK1THTx1KkiS1i0FLkiSpEIOWJElSIQYtSZKkQrwYXpIaZO/hcXbtH+P4iUnW9PeyY8uGjryrWuoUBi1Jaoi9h8fZuefIq9/fOn5ikp17pr7b1LAlrUxOHUpSQ+zaP/ZqyJo2efI0u/aPtakiSfMxaElSQxyf43tbzzYuqf0MWpLUEGv6e89pXFL7GbQkqSF2bNlA7+qeWWO9q3vYsWVDmyqSNJ9iQSsiroiIsYg4GhEfKbUfSeoWWzet5ZNXX8ra/l4CWNvfyyevvtQL4aUVrMhdhxHRA/wZ8HbgGPBAROzLzMdK7E+SusXWTWsNVlKDlDqjdRlwNDOfysx/BkaAqwrtS5IkaUWKzFz6jUb8BnBFZv6n6vl7gTdn5odmrLMd2A4wMDCweWRkZMnraIqJiQn6+vraXcaKZG/q2Zt69qaevalnb+rZm9mGh4cPZebgQtZt2weWZuZuYDfA4OBgDg0NtauUthsdHaWb//6zsTf17E09e1PP3tSzN/XszeKVmjocBy6a8fzCakySJKlrlApaDwDrI+KNEfEa4FpgX6F9SZIkrUhFpg4z81REfAjYD/QAt2bmoyX2JUndwi+Ulpqn2DVamXkPcE+p7UtSN/ELpaVm8pPhJakB/EJpqZkMWpLUAH6htNRMBi2pYfYeHuctn/o2R8Zf5i2f+jZ7D3tDbzfwC6WlZjJoSQ0yfZ3OeHUWY/o6HcNW5/MLpaVmMmhJDeJ1Ot3LL5SWmqltnwwv6dx5nU538wulpebxjJbUIF6nI0nNYtCSGsTrdCSpWZw6lBpketpo6pqsH7DWTweXpBXNoCU1zPR1OqOjo/zX3xxqdzmSpLNw6lCSJKkQg5YkSVIhBi1JkqRCDFqSJEmFGLQkSZIKMWhJkiQVYtCSJEkqxKAlSZJUSGRmu2sgIv4JeKbddbTR+cD3213ECmVv6tmbevamnr2pZ2/q2ZvZ/lVmvn4hK66IoNXtIuJgZg62u46VyN7Uszf17E09e1PP3tSzN4vn1KEkSVIhBi1JkqRCDForw+52F7CC2Zt69qaevalnb+rZm3r2ZpG8RkuSJKkQz2hJkiQVYtCSJEkqxKDVBhFxR0Q8VP08HREP1az3dEQcqdY7uNx1tkNE3BQR4zP6c2XNeldExFhEHI2Ijyx3ne0QEbsi4omIeDgi7oqI/pr1uua4me84iIjXVu+3oxHx3YhYt/xVLr+IuCgi/iYiHouIRyPid+ZYZygiXp7xXvuDdtTaDvO9R2LKn1bHzcMR8SvtqHO5RcSGGcfDQxHxSkT87hnrdO1xs1ir2l1AN8rM90w/jojPAC+fZfXhzOy2D4n7XGZ+um5hRPQAfwa8HTgGPBAR+zLzseUqsE3uBXZm5qmI+GNgJ/D7Net2/HGzwOPgA8BLmfmvI+Ja4I+B9/zs1jrOKeDDmflgRLwOOBQR987xHvlOZr6zDfWtBGd7j7wDWF/9vBn4QvW7o2XmGLARXn1/jQN3zbFqNx8358wzWm0UEQFcA9ze7loa5jLgaGY+lZn/DIwAV7W5puIy81uZeap6egC4sJ31rAALOQ6uAm6rHn8duLx633W0zHw2Mx+sHv8AeBxY296qGuUq4Ms55QDQHxEXtLuoZXY58GRmdvO3tiwJg1Z7/Qfgucz8+5rlCXwrIg5FxPZlrKvdPlSdrr81In5ujuVrge/NeH6M7vuPyPuBb9Ys65bjZiHHwavrVCH1ZeAXlqW6FaKaLt0EfHeOxf8+Iv5
fRHwzIi5e1sLaa773iP/GwLXUnwTo1uNmUZw6LCQi/hfwS3Ms+lhmfqN6vI2zn836tcwcj4hfBO6NiCcy8/6lrnW5na03TJ2iv5mpfwhvBj7DVKjoCgs5biLiY0xNDX21ZjMdedzo3EVEH3An8LuZ+coZix9k6vvaJqprIfcyNVXWDXyPnEVEvAZ4F1OXJ5ypm4+bRTFoFZKZbzvb8ohYBVwNbD7LNsar389HxF1MTZU0/h+D+XozLSK+CNw9x6Jx4KIZzy+sxhpvAcfN9cA7gcuz5kPwOvW4mcNCjoPpdY5V77l/CbywPOW1V0SsZipkfTUz95y5fGbwysx7IuLPI+L8Tr+2Dxb0HunYf2MW6B3Ag5n53JkLuvm4WSynDtvnbcATmXlsroURcV51ESsRcR7w68Ajy1hfW5xxHcS7mftvfgBYHxFvrP7P61pg33LU104RcQXwe8C7MvNHNet003GzkONgH3Bd9fg3gG/XBdROUl2HdgvweGZ+tmadX5q+Xi0iLmPqvwcdH0IX+B7ZB/xWdffhrwIvZ+azy1xqO9XOtnTrcdMKz2i1z8/Mf0fEGuC/Z+aVwABwV3U8rwL+KjP/etmrXH5/EhEbmZo6fBr4IMzuTXXX3YeA/UAPcGtmPtqugpfR54HXMjXVAXAgM2/o1uOm7jiIiD8EDmbmPqbCxv+IiKPAi0y977rBW4D3Akfipx8f81HgDQCZ+RdMBc//EhGngEng2m4IodS8RyLiBni1N/cAVwJHgR8B72tTrcuuCp9vp/q3txqb2ZtuPW4Wza/gkSRJKsSpQ0mSpEIMWpIkSYUYtCRJkgoxaEmSJBVi0JIkSSrEoCVJklSIQUuSJKmQ/w+dgKdwolc3OgAAAABJRU5ErkJggg==\n", 77 | "text/plain": [ 78 | "
" 79 | ] 80 | }, 81 | "metadata": { 82 | "needs_background": "light" 83 | }, 84 | "output_type": "display_data" 85 | } 86 | ], 87 | "source": [ 88 | "plt.figure(figsize=(10, 5))\n", 89 | "plt.scatter(points_x, points_y)\n", 90 | "plt.grid()\n", 91 | "plt.show()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 19, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "def plot_model(max_degree):\n", 101 | " plt.figure(figsize=(10, 5))\n", 102 | " plt.scatter(points_x, points_y)\n", 103 | " model = PolynomialRegression(max_degree).fit(points_x, points_y)\n", 104 | " all_x = np.arange(-10, 10.1, 0.1)\n", 105 | " plt.plot(all_x, model.predict(all_x))\n", 106 | " plt.grid()\n", 107 | " plt.show()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "for i in range(10):\n", 117 | " plot_model(i)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Объясните почему графики меняются таким образом" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "**Значение для формы**" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "print(int(\n", 148 | " PolynomialRegression(7).fit(points_x, points_y).predict([10])[0]\n", 149 | " + PolynomialRegression(1).fit(points_x, points_y).predict([-5])[0]\n", 150 | " + PolynomialRegression(4).fit(points_x, points_y).predict([-15])[0]\n", 151 | "))" 152 | ] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": 
"ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.6.5" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 2 176 | } 177 | -------------------------------------------------------------------------------- /hw02/README.md: -------------------------------------------------------------------------------- 1 | Выполните задачи в домашних ноутбуках, в каждой задаче будет контрольный код, вывод которого нужно ввести в [форму](https://goo.gl/forms/aPXg2ZL0KyuUFRFn2). 2 | 3 | Дедлайн по сдаче: 23:00 25 марта. 4 | -------------------------------------------------------------------------------- /hw04/DenseLayersCifar10.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Полносвязная сеть на Cifar10" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "В этом задании мы попробуем применить несколько моделей на датасете cifar10 и понять, какие из них лучше" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Загрузим данные" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import keras\n", 31 | "import numpy as np\n", 32 | "\n", 33 | "from keras.datasets import cifar10\n", 34 | "from keras.utils import np_utils\n", 35 | "\n", 36 | "from keras.layers import Dense, Dropout\n", 37 | "from keras.layers.core import Activation\n", 38 | "from keras.models import Sequential\n", 39 | "\n", 40 | "import matplotlib.pyplot as plt" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Для воспроизводимости расчетов воспользуемся стандартным разбиением на 
обучающую и тестовую выборки" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "(x_train, y_train), (x_test, y_test) = cifar10.load_data()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Смотрим на данные глазами" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "images_and_labels = list(zip(x_train, y_train))\n", 73 | "for index, (image, label) in enumerate(images_and_labels[:12]):\n", 74 | " plt.subplot(5, 4, index + 1)\n", 75 | " plt.axis('off')\n", 76 | " plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')\n", 77 | " plt.title('label: %i' % label )" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Преобразуем их в подходящем нас в виде" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "x_train = x_train.reshape(50000, 32*32*3)\n", 94 | "x_test = x_test.reshape(10000, 32*32*3)\n", 95 | "x_train = x_train.astype('float32')\n", 96 | "x_test = x_test.astype('float32')\n", 97 | "x_train /= 255\n", 98 | "x_test /= 255\n", 99 | "print(x_train.shape[0], 'train samples')\n", 100 | "print(x_test.shape[0], 'test samples')\n", 101 | "\n", 102 | "# convert class vectors to binary class matrices\n", 103 | "num_classes = 10\n", 104 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 105 | "y_test = keras.utils.to_categorical(y_test, num_classes)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# Построим модель для обучения" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Напишем функцию, которая возвращает качество у сети с заданной структурой " 120 | ] 121 | }, 122 | { 123 | 
"cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "def get_accuracy_on_deep_model(layers, data, batch_size=128, epochs=10):\n", 129 | "\n", 130 | " \"\"\"\n", 131 | "\n", 132 | " Функция создает модель с несколькими скрытыми слояи (количество вершин каждого слоя\n", 133 | " указано в массиве layers). Каждый слой (кроме последнего, на 10 выходов)\n", 134 | " сопровождается дропаутом с долей удаленных ребер 0.2\n", 135 | "\n", 136 | " Модель обучается на данных data с гиперпараметрами batch_size и epochs.\n", 137 | "\n", 138 | " Для функции активации используйте relu, на последнем слое используйте softmax.\n", 139 | "\n", 140 | " Функция возвращает качество на тестовое выборке в последней эпохе.\n", 141 | "\n", 142 | " :param layers: массив, каждый элемент равен количеству вершин на слое.\n", 143 | " :param data: кортеж, (x_train, y_train, x_test, y_test)\n", 144 | " :param batch_size: размер батча (берем по дефолту)\n", 145 | " :param epochs: количество эпох (берем по дефолту)\n", 146 | " :return: финальное качество на тестовой выборке.\n", 147 | " \"\"\"\n", 148 | "\n", 149 | " x_train, y_train, x_test, y_test = data\n", 150 | "\n", 151 | " model = Sequential()\n", 152 | "\n", 153 | " # добавляем первый слой\n", 154 | " model.add(...)\n", 155 | " model.add(Dropout(0.2))\n", 156 | "\n", 157 | " # с помощью цикла добавляем остальные слои\n", 158 | " for layer in layers[1:]:\n", 159 | " pass\n", 160 | "\n", 161 | " # добавляем последний слой\n", 162 | " model.add(...)\n", 163 | "\n", 164 | " # компилируем модель\n", 165 | " model.compile(loss='categorical_crossentropy',\n", 166 | " optimizer='adam',\n", 167 | " metrics=['accuracy'])\n", 168 | "\n", 169 | " # обучаемся\n", 170 | " history = model.fit()\n", 171 | "\n", 172 | " # возвращаем только последнее значение на валидационной выборке\n", 173 | " return history.history[...]" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": 
{}, 179 | "source": [ 180 | "Посмотрим качество с двумя скрытыми слоями" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "data = x_train, y_train, x_test, y_test" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "scrolled": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "get_accuracy_on_deep_model([150,512], data)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "get_accuracy_on_deep_model([700,150], data)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "get_accuracy_on_deep_model([150,700], data)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "Посмотрим качество с тремя скрытыми слоями" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "get_accuracy_on_deep_model([100,100,100], data)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "get_accuracy_on_deep_model([30,150,30], data)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "get_accuracy_on_deep_model([200,100,50], data)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "Посмотрим качество с четырьмя скрытыми слоями" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "get_accuracy_on_deep_model([80,80,80,80], data)" 269 | ] 270 | }, 271 | 
{ 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "get_accuracy_on_deep_model([120,40,120,40], data)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "# Ответы на форму" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "Ответ на форму - выбор структуры, которая показала наибольшее качество среди остальных структур с тем же количеством слоев" 292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "kernelspec": { 297 | "display_name": "my_env", 298 | "language": "python", 299 | "name": "env_chukrello" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 2 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython2", 311 | "version": "2.7.10" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 2 316 | } 317 | -------------------------------------------------------------------------------- /hw04/README.md: -------------------------------------------------------------------------------- 1 | Выполните задачи в домашних ноутбуках и в соответствии с ними пройдите [форму](https://docs.google.com/forms/d/e/1FAIpQLSdpuV-Wg7nv29mMHehg1_rJCouJ8lK50qEZxvWXLLv02hjnQA/viewform). 2 | 3 | Дедлайн по сдаче: 23:00 22 апреля. 4 | -------------------------------------------------------------------------------- /hw04/WideNeuralNetworkMNIST.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Нейросеть с одним скрытым слоем на MNIST" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Попробуем применить модель с одним скрытым слоем на датасете MNIST. 
Необходимо будет реализовать функцию, обучающую модель и понять, в какое качество мы \"упираемся\"." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Загрузим данные" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "scrolled": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import keras\n", 33 | "import numpy as np\n", 34 | "\n", 35 | "from keras.datasets import mnist\n", 36 | "from keras.utils import np_utils\n", 37 | "\n", 38 | "from keras.layers import Dense\n", 39 | "from keras.layers.core import Activation\n", 40 | "from keras.models import Sequential\n", 41 | "\n", 42 | "import matplotlib.pyplot as plt" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Для воспроизводимости расчетов воспользуемся стандартным разбиением на обучающую и тестовую выборки" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "train, test = mnist.load_data()\n", 59 | "\n", 60 | "x_train = train[0]\n", 61 | "y_train = train[1]\n", 62 | "\n", 63 | "x_test = test[0]\n", 64 | "y_test = test[1]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Перед тем, как начать работу, посмотрите на данные глазами" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "images_and_labels = list(zip(x_train, y_train))\n", 81 | "for index, (image, label) in enumerate(images_and_labels[:12]):\n", 82 | " plt.subplot(5, 4, index + 1)\n", 83 | " plt.axis('off')\n", 84 | " plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')\n", 85 | " plt.title('label: %i' % label )" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Преобразуем данные: сделаем так, чтобы мы работали с матрицей, 
у которой значения от 0 до 1" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "x_train = x_train.reshape(60000, 28*28)\n", 102 | "x_test = x_test.reshape(10000, 28*28)\n", 103 | "x_train = x_train.astype('float32')\n", 104 | "x_test = x_test.astype('float32')\n", 105 | "x_train /= 255\n", 106 | "x_test /= 255\n", 107 | "print(x_train.shape[0], 'train samples')\n", 108 | "print(x_test.shape[0], 'test samples')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "А таргет сделаем категориальной переменной (то есть значение таргета по индексу k будет говорить, является ли эта цифра k)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "num_classes = 10\n", 125 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 126 | "y_test = keras.utils.to_categorical(y_test, num_classes)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Построим модель для обучения" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Зафиксируем гиперпараметры сети" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "batch_size = 128\n", 150 | "epochs = 10" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "scrolled": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "def get_accuracies_of_wide_model(units, data, batch_size=128, epochs=10, n_iterations=5):\n", 162 | " \"\"\"\n", 163 | " Функция создает модель с одним скрытым слоем с количеством вершин units,\n", 164 | " обучается на данных data с гиперпараметрами batch_size и epochs количество\n", 165 | " раз, равное 
n_iterations и возвращает массив метрик качества на каждой итерации.\n", 166 | " \n", 167 | " Для функции активации используйте relu, на последнем слое используйте softmax.\n", 168 | " \n", 169 | " :param units: количество вершин (регулируем)\n", 170 | " :param batch_size: размер батча (берем по дефолту)\n", 171 | " :param epochs: количество эпох (берем по дефолту)\n", 172 | " :param n_iterations: количество итераций (берем по дефолту)\n", 173 | " :param data: кортеж, (x_train, y_train, x_test, y_test)\n", 174 | " :return: массив, качество на тестовой выборке по каждой итерации\n", 175 | " \"\"\"\n", 176 | " \n", 177 | " x_train, y_train, x_test, y_test = data\n", 178 | " accuracies = []\n", 179 | " \n", 180 | " for i in range(n_iterations):\n", 181 | " model = Sequential()\n", 182 | " # добавление необходимых слоев\n", 183 | " # ...\n", 184 | " # компилируем модель (не меняйте параметры)\n", 185 | " model.compile(optimizer=\"adam\",\n", 186 | " loss='categorical_crossentropy',\n", 187 | " metrics=[\"accuracy\"])\n", 188 | " \n", 189 | " # обучение (указать правильные параметры)\n", 190 | " history = model.fit()\n", 191 | " #получение конечного качества на тестовой выборке\n", 192 | " accuracy = \n", 193 | " \n", 194 | " accuracies.append(accuracy)\n", 195 | " \n", 196 | " return accuracies" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Переберем разное количество вершин в нашей сети. 
Чтобы сильно не напрягать компьютер, не берите больше 3000 вершин.\n", 204 | "\n", 205 | "Задача со звездочкой: постройте график качества и посмотрите, изменяется ли оно" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# делаем массив вершин (рекомендуем брать до 3000)\n", 215 | "units_list = []\n", 216 | " \n", 217 | "data = (x_train, y_train, x_test, y_test)\n", 218 | "results = [get_accuracies_of_wide_model(units, data) for units in units_list]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "# Ответы для формы" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "Ответом для формы должно служить максимальное качество на тестовой выборке. Поскольку в keras результаты разнятся от запуска к запуску, правильный ответ будет засчитан как интервал" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "max_results = max([max(result) for result in results])\n", 242 | "print('{:.4f}'.format(max_results))" 243 | ] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "my_env", 249 | "language": "python", 250 | "name": "env_chukrello" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 2 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython2", 262 | "version": "2.7.10" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /lecture01/L1_Introduction.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture01/L1_Introduction.pdf -------------------------------------------------------------------------------- /lecture02/L2_MathAndSimpleMethods.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture02/L2_MathAndSimpleMethods.pdf -------------------------------------------------------------------------------- /lecture03/L3_LinearModels.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture03/L3_LinearModels.pdf -------------------------------------------------------------------------------- /lecture0405/L4_DecisionTreesAndEnsembles.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture0405/L4_DecisionTreesAndEnsembles.pdf -------------------------------------------------------------------------------- /lecture0405/L5_Metrics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture0405/L5_Metrics.pdf -------------------------------------------------------------------------------- /lecture06/L6_Unsupervised.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture06/L6_Unsupervised.pdf -------------------------------------------------------------------------------- 
/lecture0708/L7_8_Validation_and_features.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture0708/L7_8_Validation_and_features.pdf -------------------------------------------------------------------------------- /lecture09/L9_NN_intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture09/L9_NN_intro.pdf -------------------------------------------------------------------------------- /lecture10/L10_CNN_and_RNN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/lecture10/L10_CNN_and_RNN.pdf -------------------------------------------------------------------------------- /seminar01/SklearnFirstClassifiers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Обучаем первые классификаторы в sklearn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Данные" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "\n", 22 | "По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).\n", 23 | "\n", 24 | "Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается." 
25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Готовим обучающую и тестовую выборки" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pandas as pd\n", 41 | "\n", 42 | "bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/html": [ 53 | "
\n", 54 | "\n", 67 | "\n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | "
ActivityD1D2D3D4D5D6D7D8D9...D1767D1768D1769D1770D1771D1772D1773D1774D1775D1776
010.0000000.4970090.100.00.1329560.6780310.2731660.5854450.743663...0000000000
110.3666670.6062910.050.00.1112090.8034550.1061050.4117540.836582...1111010010
210.0333000.4801240.000.00.2097910.6103500.3564530.5177200.679051...0000000000
310.0000000.5388250.000.50.1963440.7242300.2356060.2887640.805110...0000000000
400.1000000.5177940.000.00.4947340.7814220.1543610.3038090.812646...0000000000
\n", 217 | "

5 rows × 1777 columns

\n", 218 | "
" 219 | ], 220 | "text/plain": [ 221 | " Activity D1 D2 D3 D4 D5 D6 D7 \\\n", 222 | "0 1 0.000000 0.497009 0.10 0.0 0.132956 0.678031 0.273166 \n", 223 | "1 1 0.366667 0.606291 0.05 0.0 0.111209 0.803455 0.106105 \n", 224 | "2 1 0.033300 0.480124 0.00 0.0 0.209791 0.610350 0.356453 \n", 225 | "3 1 0.000000 0.538825 0.00 0.5 0.196344 0.724230 0.235606 \n", 226 | "4 0 0.100000 0.517794 0.00 0.0 0.494734 0.781422 0.154361 \n", 227 | "\n", 228 | " D8 D9 ... D1767 D1768 D1769 D1770 D1771 D1772 D1773 \\\n", 229 | "0 0.585445 0.743663 ... 0 0 0 0 0 0 0 \n", 230 | "1 0.411754 0.836582 ... 1 1 1 1 0 1 0 \n", 231 | "2 0.517720 0.679051 ... 0 0 0 0 0 0 0 \n", 232 | "3 0.288764 0.805110 ... 0 0 0 0 0 0 0 \n", 233 | "4 0.303809 0.812646 ... 0 0 0 0 0 0 0 \n", 234 | "\n", 235 | " D1774 D1775 D1776 \n", 236 | "0 0 0 0 \n", 237 | "1 0 1 0 \n", 238 | "2 0 0 0 \n", 239 | "3 0 0 0 \n", 240 | "4 0 0 0 \n", 241 | "\n", 242 | "[5 rows x 1777 columns]" 243 | ] 244 | }, 245 | "execution_count": 2, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "bioresponce.head(5)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 3, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "y = bioresponce.Activity.values" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 4, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "X = bioresponce.iloc[:, 1:]" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 6, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "from sklearn.model_selection import train_test_split\n", 279 | "\n", 280 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "### Строим модель и оцениваем качество" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | 
"execution_count": 7, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "from sklearn.linear_model import LogisticRegression" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "model = LogisticRegression()\n", 306 | "model.fit(X_train, y_train)\n", 307 | "preds = model.predict(X_test)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 9, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "numpy.ndarray" 319 | ] 320 | }, 321 | "execution_count": 9, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "type(preds)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 10, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "1" 339 | ] 340 | }, 341 | "execution_count": 10, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "10 // 9" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 11, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "0.7560581583198708\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "print(sum(preds == y_test) / len(preds))" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 12, 370 | "metadata": {}, 371 | "outputs": [ 372 | { 373 | "name": "stdout", 374 | "output_type": "stream", 375 | "text": [ 376 | "0.7560581583198708\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "print(sum(preds == y_test) / float(len(preds)))" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 13, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "0.7560581583198708\n" 394 | ] 395 | } 396 | ], 397 | 
"source": [ 398 | "from sklearn.metrics import accuracy_score\n", 399 | "\n", 400 | "print(accuracy_score(preds, y_test))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "### Качество на кросс-валидации" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 14, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "[0.74404762 0.73956262 0.72310757 0.75099602 0.75896414]\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "from sklearn.model_selection import cross_val_score\n", 425 | "\n", 426 | "print(cross_val_score(model, X_train, y_train, cv=5))" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 15, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "name": "stdout", 436 | "output_type": "stream", 437 | "text": [ 438 | "0.7433355944771515\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "print(cross_val_score(model, X_train, y_train, cv=5).mean())" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "### Пробуем другие классификаторы" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 16, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "from sklearn.neighbors import KNeighborsClassifier\n", 460 | "from sklearn.tree import DecisionTreeClassifier\n", 461 | "from sklearn.svm import LinearSVC\n", 462 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 17, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "0.7189014539579968 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 475 | " metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n", 476 | " weights='uniform')\n", 477 | 
"0.7059773828756059 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", 478 | " max_features=None, max_leaf_nodes=None,\n", 479 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 480 | " min_samples_leaf=1, min_samples_split=2,\n", 481 | " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", 482 | " splitter='best')\n", 483 | "0.7431340872374798 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 484 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 485 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 486 | " verbose=0)\n", 487 | "0.789983844911147 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 488 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 489 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 490 | " min_samples_leaf=1, min_samples_split=2,\n", 491 | " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n", 492 | " oob_score=False, random_state=None, verbose=0,\n", 493 | " warm_start=False)\n", 494 | "0.778675282714055 GradientBoostingClassifier(criterion='friedman_mse', init=None,\n", 495 | " learning_rate=0.1, loss='deviance', max_depth=3,\n", 496 | " max_features=None, max_leaf_nodes=None,\n", 497 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 498 | " min_samples_leaf=1, min_samples_split=2,\n", 499 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 500 | " presort='auto', random_state=None, subsample=1.0, verbose=0,\n", 501 | " warm_start=False)\n", 502 | "CPU times: user 25.9 s, sys: 900 ms, total: 26.8 s\n", 503 | "Wall time: 25.9 s\n" 504 | ] 505 | } 506 | ], 507 | "source": [ 508 | "%%time\n", 509 | "\n", 510 | "models = [\n", 511 | " KNeighborsClassifier(),\n", 512 | " DecisionTreeClassifier(),\n", 513 | " LinearSVC(),\n", 514 | " RandomForestClassifier(n_estimators=100), \n", 515 | " GradientBoostingClassifier(n_estimators=100)\n", 516 | "]\n", 517 | "\n", 
518 | "for model in models:\n", 519 | " model.fit(X_train, y_train)\n", 520 | " preds = model.predict(X_test)\n", 521 | " print(accuracy_score(preds, y_test), model)" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [] 530 | } 531 | ], 532 | "metadata": { 533 | "anaconda-cloud": {}, 534 | "kernelspec": { 535 | "display_name": "dmia", 536 | "language": "python", 537 | "name": "dmia" 538 | }, 539 | "language_info": { 540 | "codemirror_mode": { 541 | "name": "ipython", 542 | "version": 3 543 | }, 544 | "file_extension": ".py", 545 | "mimetype": "text/x-python", 546 | "name": "python", 547 | "nbconvert_exporter": "python", 548 | "pygments_lexer": "ipython3", 549 | "version": "3.6.6" 550 | } 551 | }, 552 | "nbformat": 4, 553 | "nbformat_minor": 1 554 | } 555 | -------------------------------------------------------------------------------- /seminar02/numpy_fancy_indexing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar02/numpy_fancy_indexing.png -------------------------------------------------------------------------------- /seminar02/numpy_indexing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar02/numpy_indexing.png -------------------------------------------------------------------------------- /seminar03/Proximal_operator.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/Proximal_operator.pdf -------------------------------------------------------------------------------- 
/seminar03/bias_variance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/bias_variance.pdf -------------------------------------------------------------------------------- /seminar03/pandas/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pandas/.DS_Store -------------------------------------------------------------------------------- /seminar03/pandas/data_sample_example.tsv: -------------------------------------------------------------------------------- 1 | Name Birth City Position Иванов А.А. 22.03.1980 Москва Сорокин И.В. 07.08.1965 Волгоград инженер Белов М.М. 13.02.1980 Ростов менеджер Мельникова Д.С. 15.04.1985 Ростов Рыбина Е.П. 19.11.1985 Москва инженер Костров С.О. 31.05.1985 Москва стажер -------------------------------------------------------------------------------- /seminar03/pandas/dataset.tsv: -------------------------------------------------------------------------------- 1 | Name Birth City Position Иванов А.А. 22.03.1980 Москва Сорокин И.В. 07.08.1965 Волгоград инженер Белов М.М. 13.02.1980 Ростов менеджер Мельникова Д.С. 15.04.1985 Ростов Рыбина Е.П. 19.11.1985 Москва инженер Костров С.О. 
31.05.1985 Москва стажер -------------------------------------------------------------------------------- /seminar03/pandas/iris_frame.csv: -------------------------------------------------------------------------------- 1 | sepal length (cm);sepal width (cm);petal length (cm);petal width (cm);target;target_name 2 | 5.1;3.5;1.4;0.2;0;setosa 3 | 4.9;3.0;1.4;0.2;0;setosa 4 | 4.7;3.2;1.3;0.2;0;setosa 5 | 4.6;3.1;1.5;0.2;0;setosa 6 | 5.0;3.6;1.4;0.2;0;setosa 7 | 5.4;3.9;1.7;0.4;0;setosa 8 | 4.6;3.4;1.4;0.3;0;setosa 9 | 5.0;3.4;1.5;0.2;0;setosa 10 | 4.4;2.9;1.4;0.2;0;setosa 11 | 4.9;3.1;1.5;0.1;0;setosa 12 | 5.4;3.7;1.5;0.2;0;setosa 13 | 4.8;3.4;1.6;0.2;0;setosa 14 | 4.8;3.0;1.4;0.1;0;setosa 15 | 4.3;3.0;1.1;0.1;0;setosa 16 | 5.8;4.0;1.2;0.2;0;setosa 17 | 5.7;4.4;1.5;0.4;0;setosa 18 | 5.4;3.9;1.3;0.4;0;setosa 19 | 5.1;3.5;1.4;0.3;0;setosa 20 | 5.7;3.8;1.7;0.3;0;setosa 21 | 5.1;3.8;1.5;0.3;0;setosa 22 | 5.4;3.4;1.7;0.2;0;setosa 23 | 5.1;3.7;1.5;0.4;0;setosa 24 | 4.6;3.6;1.0;0.2;0;setosa 25 | 5.1;3.3;1.7;0.5;0;setosa 26 | 4.8;3.4;1.9;0.2;0;setosa 27 | 5.0;3.0;1.6;0.2;0;setosa 28 | 5.0;3.4;1.6;0.4;0;setosa 29 | 5.2;3.5;1.5;0.2;0;setosa 30 | 5.2;3.4;1.4;0.2;0;setosa 31 | 4.7;3.2;1.6;0.2;0;setosa 32 | 4.8;3.1;1.6;0.2;0;setosa 33 | 5.4;3.4;1.5;0.4;0;setosa 34 | 5.2;4.1;1.5;0.1;0;setosa 35 | 5.5;4.2;1.4;0.2;0;setosa 36 | 4.9;3.1;1.5;0.1;0;setosa 37 | 5.0;3.2;1.2;0.2;0;setosa 38 | 5.5;3.5;1.3;0.2;0;setosa 39 | 4.9;3.1;1.5;0.1;0;setosa 40 | 4.4;3.0;1.3;0.2;0;setosa 41 | 5.1;3.4;1.5;0.2;0;setosa 42 | 5.0;3.5;1.3;0.3;0;setosa 43 | 4.5;2.3;1.3;0.3;0;setosa 44 | 4.4;3.2;1.3;0.2;0;setosa 45 | 5.0;3.5;1.6;0.6;0;setosa 46 | 5.1;3.8;1.9;0.4;0;setosa 47 | 4.8;3.0;1.4;0.3;0;setosa 48 | 5.1;3.8;1.6;0.2;0;setosa 49 | 4.6;3.2;1.4;0.2;0;setosa 50 | 5.3;3.7;1.5;0.2;0;setosa 51 | 5.0;3.3;1.4;0.2;0;setosa 52 | 7.0;3.2;4.7;1.4;1;versicolor 53 | 6.4;3.2;4.5;1.5;1;versicolor 54 | 6.9;3.1;4.9;1.5;1;versicolor 55 | 5.5;2.3;4.0;1.3;1;versicolor 56 | 6.5;2.8;4.6;1.5;1;versicolor 57 | 
5.7;2.8;4.5;1.3;1;versicolor 58 | 6.3;3.3;4.7;1.6;1;versicolor 59 | 4.9;2.4;3.3;1.0;1;versicolor 60 | 6.6;2.9;4.6;1.3;1;versicolor 61 | 5.2;2.7;3.9;1.4;1;versicolor 62 | 5.0;2.0;3.5;1.0;1;versicolor 63 | 5.9;3.0;4.2;1.5;1;versicolor 64 | 6.0;2.2;4.0;1.0;1;versicolor 65 | 6.1;2.9;4.7;1.4;1;versicolor 66 | 5.6;2.9;3.6;1.3;1;versicolor 67 | 6.7;3.1;4.4;1.4;1;versicolor 68 | 5.6;3.0;4.5;1.5;1;versicolor 69 | 5.8;2.7;4.1;1.0;1;versicolor 70 | 6.2;2.2;4.5;1.5;1;versicolor 71 | 5.6;2.5;3.9;1.1;1;versicolor 72 | 5.9;3.2;4.8;1.8;1;versicolor 73 | 6.1;2.8;4.0;1.3;1;versicolor 74 | 6.3;2.5;4.9;1.5;1;versicolor 75 | 6.1;2.8;4.7;1.2;1;versicolor 76 | 6.4;2.9;4.3;1.3;1;versicolor 77 | 6.6;3.0;4.4;1.4;1;versicolor 78 | 6.8;2.8;4.8;1.4;1;versicolor 79 | 6.7;3.0;5.0;1.7;1;versicolor 80 | 6.0;2.9;4.5;1.5;1;versicolor 81 | 5.7;2.6;3.5;1.0;1;versicolor 82 | 5.5;2.4;3.8;1.1;1;versicolor 83 | 5.5;2.4;3.7;1.0;1;versicolor 84 | 5.8;2.7;3.9;1.2;1;versicolor 85 | 6.0;2.7;5.1;1.6;1;versicolor 86 | 5.4;3.0;4.5;1.5;1;versicolor 87 | 6.0;3.4;4.5;1.6;1;versicolor 88 | 6.7;3.1;4.7;1.5;1;versicolor 89 | 6.3;2.3;4.4;1.3;1;versicolor 90 | 5.6;3.0;4.1;1.3;1;versicolor 91 | 5.5;2.5;4.0;1.3;1;versicolor 92 | 5.5;2.6;4.4;1.2;1;versicolor 93 | 6.1;3.0;4.6;1.4;1;versicolor 94 | 5.8;2.6;4.0;1.2;1;versicolor 95 | 5.0;2.3;3.3;1.0;1;versicolor 96 | 5.6;2.7;4.2;1.3;1;versicolor 97 | 5.7;3.0;4.2;1.2;1;versicolor 98 | 5.7;2.9;4.2;1.3;1;versicolor 99 | 6.2;2.9;4.3;1.3;1;versicolor 100 | 5.1;2.5;3.0;1.1;1;versicolor 101 | 5.7;2.8;4.1;1.3;1;versicolor 102 | 6.3;3.3;6.0;2.5;2;virginica 103 | 5.8;2.7;5.1;1.9;2;virginica 104 | 7.1;3.0;5.9;2.1;2;virginica 105 | 6.3;2.9;5.6;1.8;2;virginica 106 | 6.5;3.0;5.8;2.2;2;virginica 107 | 7.6;3.0;6.6;2.1;2;virginica 108 | 4.9;2.5;4.5;1.7;2;virginica 109 | 7.3;2.9;6.3;1.8;2;virginica 110 | 6.7;2.5;5.8;1.8;2;virginica 111 | 7.2;3.6;6.1;2.5;2;virginica 112 | 6.5;3.2;5.1;2.0;2;virginica 113 | 6.4;2.7;5.3;1.9;2;virginica 114 | 6.8;3.0;5.5;2.1;2;virginica 115 | 
5.7;2.5;5.0;2.0;2;virginica 116 | 5.8;2.8;5.1;2.4;2;virginica 117 | 6.4;3.2;5.3;2.3;2;virginica 118 | 6.5;3.0;5.5;1.8;2;virginica 119 | 7.7;3.8;6.7;2.2;2;virginica 120 | 7.7;2.6;6.9;2.3;2;virginica 121 | 6.0;2.2;5.0;1.5;2;virginica 122 | 6.9;3.2;5.7;2.3;2;virginica 123 | 5.6;2.8;4.9;2.0;2;virginica 124 | 7.7;2.8;6.7;2.0;2;virginica 125 | 6.3;2.7;4.9;1.8;2;virginica 126 | 6.7;3.3;5.7;2.1;2;virginica 127 | 7.2;3.2;6.0;1.8;2;virginica 128 | 6.2;2.8;4.8;1.8;2;virginica 129 | 6.1;3.0;4.9;1.8;2;virginica 130 | 6.4;2.8;5.6;2.1;2;virginica 131 | 7.2;3.0;5.8;1.6;2;virginica 132 | 7.4;2.8;6.1;1.9;2;virginica 133 | 7.9;3.8;6.4;2.0;2;virginica 134 | 6.4;2.8;5.6;2.2;2;virginica 135 | 6.3;2.8;5.1;1.5;2;virginica 136 | 6.1;2.6;5.6;1.4;2;virginica 137 | 7.7;3.0;6.1;2.3;2;virginica 138 | 6.3;3.4;5.6;2.4;2;virginica 139 | 6.4;3.1;5.5;1.8;2;virginica 140 | 6.0;3.0;4.8;1.8;2;virginica 141 | 6.9;3.1;5.4;2.1;2;virginica 142 | 6.7;3.1;5.6;2.4;2;virginica 143 | 6.9;3.1;5.1;2.3;2;virginica 144 | 5.8;2.7;5.1;1.9;2;virginica 145 | 6.8;3.2;5.9;2.3;2;virginica 146 | 6.7;3.3;5.7;2.5;2;virginica 147 | 6.7;3.0;5.2;2.3;2;virginica 148 | 6.3;2.5;5.0;1.9;2;virginica 149 | 6.5;3.0;5.2;2.0;2;virginica 150 | 6.2;3.4;5.4;2.3;2;virginica 151 | 5.9;3.0;5.1;1.8;2;virginica 152 | -------------------------------------------------------------------------------- /seminar03/pandas/updated_dataset.csv: -------------------------------------------------------------------------------- 1 | Name,Birth,City,Position 2 | Иванов А.А.,22.03.1980,Москва, 3 | Сорокин И.В.,07.08.1965,Волгоград,инженер 4 | Белов М.М.,13.02.1980,Ростов,менеджер 5 | Мельникова Д.С.,15.04.1985,Ростов, 6 | Рыбина Е.П.,19.11.1985,Москва,инженер 7 | -------------------------------------------------------------------------------- /seminar03/pics/3_datasets.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/3_datasets.png -------------------------------------------------------------------------------- /seminar03/pics/bag_of_words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/bag_of_words.png -------------------------------------------------------------------------------- /seminar03/pics/convex_function_3d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/convex_function_3d.png -------------------------------------------------------------------------------- /seminar03/pics/normalization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/normalization.png -------------------------------------------------------------------------------- /seminar03/pics/overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/overfitting.png -------------------------------------------------------------------------------- /seminar03/pics/regularization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/regularization.png -------------------------------------------------------------------------------- 
/seminar03/pics/sgd_vs_gd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/sgd_vs_gd.png -------------------------------------------------------------------------------- /seminar03/pics/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/pics/tfidf.png -------------------------------------------------------------------------------- /seminar03/sklearn/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar03/sklearn/.DS_Store -------------------------------------------------------------------------------- /seminar03/sklearn/sklearn-cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sklearn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## sklearn.cross_validation" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "документация: http://scikit-learn.org/stable/modules/cross_validation.html" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 16, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from sklearn import cross_validation, datasets\n", 33 | "\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Разовое разбиение данных на обучение и тест с помощью train_test_split" 42 | ] 43 | }, 44 | { 
45 | "cell_type": "code", 46 | "execution_count": 17, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "iris = datasets.load_iris()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 18, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(iris.data, iris.target, \n", 64 | " test_size = 0.3)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 19, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "0.3" 78 | ] 79 | }, 80 | "execution_count": 19, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных\n", 87 | "float(len(test_labels))/len(iris.data)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Размер обучающей выборки: 105 объектов \n", 102 | "Размер тестовой выборки: 45 объектов\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "print 'Размер обучающей выборки: {} объектов \\nРазмер тестовой выборки: {} объектов'.format(len(train_data),\n", 108 | " len(test_data))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Обучающая выборка:\n", 123 | "[[ 4.6 3.4 1.4 0.3]\n", 124 | " [ 5. 2. 3.5 1. ]\n", 125 | " [ 6.4 3.2 5.3 2.3]\n", 126 | " [ 5. 3.3 1.4 0.2]\n", 127 | " [ 5.6 2.5 3.9 1.1]]\n", 128 | "\n", 129 | "\n", 130 | "Тестовая выборка:\n", 131 | "[[ 5.2 4.1 1.5 0.1]\n", 132 | " [ 7.1 3. 
5.9 2.1]\n", 133 | " [ 5.1 3.5 1.4 0.2]\n", 134 | " [ 4.9 3.1 1.5 0.1]\n", 135 | " [ 5.1 3.7 1.5 0.4]]\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "print 'Обучающая выборка:\\n', train_data[:5]\n", 141 | "print '\\n'\n", 142 | "print 'Тестовая выборка:\\n', test_data[:5]" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Метки классов на обучающей выборке:\n", 157 | "[0 1 2 0 1 1 2 1 0 1 1 1 2 0 1 0 1 1 0 0 0 1 2 2 0 0 2 1 1 1 0 2 2 1 1 0 0\n", 158 | " 2 2 0 1 1 1 0 0 2 2 0 2 2 0 2 0 2 2 0 0 2 2 0 1 2 0 1 1 1 1 2 1 1 0 0 2 2\n", 159 | " 0 1 2 2 1 0 1 1 0 2 0 0 1 2 0 1 0 1 2 2 2 1 2 1 2 2 1 0 2 2 1]\n", 160 | "\n", 161 | "\n", 162 | "Метки классов на тестовой выборке:\n", 163 | "[0 2 0 0 0 0 0 2 2 1 2 1 1 0 2 2 2 0 1 1 1 2 2 1 2 0 0 2 2 0 0 2 1 2 0 0 0\n", 164 | " 2 1 0 1 0 1 1 1]\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print 'Метки классов на обучающей выборке:\\n', train_labels\n", 170 | "print '\\n'\n", 171 | "print 'Метки классов на тестовой выборке:\\n', test_labels" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "### Стратегии проведения кросс-валидации" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "#### KFold" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 23, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "[2 3 4 5 6 7 8 9] [0 1]\n", 200 | "[0 1 4 5 6 7 8 9] [2 3]\n", 201 | "[0 1 2 3 6 7 8 9] [4 5]\n", 202 | "[0 1 2 3 4 5 8 9] [6 7]\n", 203 | "[0 1 2 3 4 5 6 7] [8 9]\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "for train_indices, test_indices in cross_validation.KFold(10, n_folds = 5):\n", 209 | 
" print train_indices, test_indices" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 22, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "[0 2 3 7 8] [1 4 5 6 9]\n", 224 | "[1 4 5 6 9] [0 2 3 7 8]\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "#многократный запуск приводит к различным разбиениям\n", 230 | "for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True):\n", 231 | " print train_indices, test_indices" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 24, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "[1 3 5 7 8] [0 2 4 6 9]\n", 246 | "[0 2 4 6 9] [1 3 5 7 8]\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "#многократный запуск приводит к одному и тому же разбиению, результат запуска детерминированный\n", 252 | "for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True, random_state = 1):\n", 253 | " print train_indices, test_indices" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "#### StratifiedKFold" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 25, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "[0 0 0 0 0 1 1 1 1 1]\n", 275 | "[3 4 8 9] [0 1 2 5 6 7]\n", 276 | "[0 1 2 5 6 7] [3 4 8 9]\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "target = np.array([0] * 5 + [1] * 5)\n", 282 | "print target\n", 283 | "for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2, shuffle = True, random_state = 0):\n", 284 | " print train_indices, test_indices" 285 | ] 286 | }, 287 | { 288 | 
"cell_type": "code", 289 | "execution_count": 26, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "[0 1 0 1 0 1 0 1 0 1]\n", 299 | "[1 2 3 4] [0 5 6 7 8 9]\n", 300 | "[0 5 6 7 8 9] [1 2 3 4]\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "target = np.array([0, 1] * 5)\n", 306 | "print target\n", 307 | "for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2,shuffle = True):\n", 308 | " print train_indices, test_indices" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "#### ShuffleSplit" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 27, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "[4 3 0 7 8 1 5 6] [9 2]\n", 330 | "[5 7 4 2 9 8 0 6] [1 3]\n", 331 | "[3 2 6 8 4 9 1 7] [0 5]\n", 332 | "[0 9 5 6 7 8 1 2] [3 4]\n", 333 | "[4 1 7 6 9 0 5 2] [3 8]\n", 334 | "[4 0 1 5 9 3 8 2] [6 7]\n", 335 | "[2 7 8 9 3 0 1 5] [4 6]\n", 336 | "[4 8 0 2 7 9 3 5] [6 1]\n", 337 | "[1 8 0 2 4 6 3 7] [9 5]\n", 338 | "[7 9 2 8 4 5 0 1] [6 3]\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "for train_indices, test_indices in cross_validation.ShuffleSplit(10, n_iter = 10, test_size = 0.2):\n", 344 | " print train_indices, test_indices" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "#### StratifiedShuffleSplit" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 14, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "[0 0 0 0 0 1 1 1 1 1]\n", 366 | "[6 8 3 1 9 7 4 0] [5 2]\n", 367 | "[9 4 7 3 1 6 8 0] [5 2]\n", 368 | "[5 1 2 8 0 3 6 9] [4 7]\n", 369 | "[7 3 4 8 9 0 5 2] 
[1 6]\n" 370 | ] 371 | } 372 | ], 373 | "source": [ 374 | "target = np.array([0] * 5 + [1] * 5)\n", 375 | "print target\n", 376 | "for train_indices, test_indices in cross_validation.StratifiedShuffleSplit(target, n_iter = 4, test_size = 0.2):\n", 377 | " print train_indices, test_indices" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "#### Leave-One-Out" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 15, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "[1 2 3 4 5 6 7 8 9] [0]\n", 399 | "[0 2 3 4 5 6 7 8 9] [1]\n", 400 | "[0 1 3 4 5 6 7 8 9] [2]\n", 401 | "[0 1 2 4 5 6 7 8 9] [3]\n", 402 | "[0 1 2 3 5 6 7 8 9] [4]\n", 403 | "[0 1 2 3 4 6 7 8 9] [5]\n", 404 | "[0 1 2 3 4 5 7 8 9] [6]\n", 405 | "[0 1 2 3 4 5 6 8 9] [7]\n", 406 | "[0 1 2 3 4 5 6 7 9] [8]\n", 407 | "[0 1 2 3 4 5 6 7 8] [9]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "for train_indices, test_index in cross_validation.LeaveOneOut(10):\n", 413 | " print train_indices, test_index" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators" 421 | ] 422 | } 423 | ], 424 | "metadata": { 425 | "anaconda-cloud": {}, 426 | "kernelspec": { 427 | "display_name": "Python [default]", 428 | "language": "python", 429 | "name": "python2" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 2 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython2", 441 | "version": "2.7.12" 442 | } 443 | }, 444 | "nbformat": 4, 445 | "nbformat_minor": 0 446 | } 447 | 
-------------------------------------------------------------------------------- /seminar0405/02_Bagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Бэггинг" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Бэггинг над решающими деревьями\n", 15 | "\n", 16 | "В этом блокноте мы увидим на примере, насколько позволяет улучшить качество по сравнению с одиночным деревом использование бэггинга и добавление в деревьях рандомизации при выборе признаков для разбиений." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "from sklearn.model_selection import cross_val_score, train_test_split\n", 28 | "from sklearn.ensemble import BaggingClassifier\n", 29 | "from sklearn.tree import DecisionTreeClassifier\n", 30 | "from sklearn.ensemble import RandomForestClassifier\n", 31 | "from sklearn.linear_model import LogisticRegression # для сравнения с линейной моделью" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "scrolled": true 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "['last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "data = pd.read_csv('HR.csv')\n", 51 | "\n", 52 | "target = 'left'\n", 53 | "features = [c for c in data if c != target]\n", 54 | "print(features)\n", 55 | "\n", 56 | "X, y = data[features], data[target]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 8, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "d3 = DecisionTreeClassifier() # Обычное решающее дерево" 66 
| ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Качество классификации решающим деревом с настройками по-умолчанию:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 12, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "Decision tree: 0.6536431686337267\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "print(\"Decision tree:\", cross_val_score(d3, X, y).mean())" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "Бэггинг над решающими деревьями:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 13, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "D3 bagging: 0.7174495299059812\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "print(\"D3 bagging:\", cross_val_score(BaggingClassifier(d3, random_state=42), X, y).mean())" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "Усредненная модель оказалась заметно лучше. У решающих деревьев есть существенный недостаток - нестабильность получаемого дерева при небольших изменениях в выборке. Но бэггинг обращает этот недостаток в достоинство, ведь усредненная модель работает лучше, когда базовые модели слабо скоррелированы (это обстоятельство будет пояснено дополнительно ниже - в теоретической части)." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Изучив параметры DecisionTreeClassifier, можно найти хороший способ сделать деревья еще более различными - при построении каждого узла отбирать случайные max_features признаков и искать информативное разбиение только по одному из них." 
128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# Решающее дерево с рандомизацией в сплитах\n", 137 | "rnd_d3 = DecisionTreeClassifier(max_features=int(len(features) ** 0.5))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "?DecisionTreeClassifier" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 14, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "Randomized D3 Bagging: 0.7194494632259785\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "print(\"Randomized D3 Bagging:\", cross_val_score(BaggingClassifier(rnd_d3, random_state=42), X, y).mean())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "В среднем, качество получается еще лучше. Для выбора числа признаков использовалась часто применяемая на практике эвристика - брать корень из общего числа признаков. Если бы мы решали задачу регрессии - брали бы треть от общего числа." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 15, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "Random Forest: 0.7232495965859839\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "print(\"Random Forest:\", cross_val_score(RandomForestClassifier(random_state=42), X, y).mean())" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "Кстати, давайте посмотрим, нужно ли выбирать случайные признаки в каждом сплите или достаточно выбрать их один раз для каждого дерева. 
В этом нам поможет параметр max_features в BaggingClassifier:" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 5, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "?BaggingClassifier" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 9, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "D3 bagging: 0.7175827298793092\n" 216 | ] 217 | }, 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | "/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n", 223 | " warnings.warn(CV_WARNING, FutureWarning)\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "print(\"D3 bagging:\", cross_val_score(BaggingClassifier(d3, random_state=42, max_features=int(len(features) ** 0.5)), X, y).mean())" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 16, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "Logistic Regression: 0.6287053143962126\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "print(\"Logistic Regression:\", cross_val_score(LogisticRegression(), X, y).mean())" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "## Bonus-track" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "#### Опциональное практическое задание\n", 260 | "Повторные запуски cross_val_score будут показывать различное качество модели.\n", 261 | "\n", 262 | "Это зависит от параметра рандомизации модели \"random_state\" в DecisionTreeClassifier, BaggingClassifie или RandomForest.\n", 263 | "\n", 264 | "Чтобы определить, действительно ли одна модель 
лучше другой, можно посмотреть на её качество в среднем, то есть усредняя запуски с разным random_state. Попробуйте сравнить качество и понять, действительно ли BaggingClassifier(d3) лучше BaggingClassifier(rnd_d3).\n", 265 | "\n", 266 | "Также попробуйте подумать, чем здесь отличается BaggingClassifier(rnd_d3) от RandomForestClassifier()?" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "#### Немного теории: почему усреднение увеличивает \"устойчивость\" модели" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "Пусть есть случайные одинаково распределённые величины $\\xi_1, \\xi_2, \\dots, \\xi_n$, скоррелированные с коэффициентом корреляции $\\rho$ и дисперсией $\\sigma^2$. Какова будет дисперсия величины $\\frac1n \\sum_{i=1}^n \\xi_i$?\n", 290 | "\n", 291 | "$$\\mathbf{D} \\frac1n \\sum_{i=1}^n \\xi_i = \\frac1{n^2}\\mathbf{cov} (\\sum_{i=1}^n \\xi_i, \\sum_{i=1}^n \\xi_i) = \\frac1{n^2} \\sum_{i=1, j=1}^n \\mathbf{cov}(\\xi_i, \\xi_j) = \\frac1{n^2} \\sum_{i=1}^n \\mathbf{cov}(\\xi_i, \\xi_i) + \\frac1{n^2} \\sum_{i=1, j=1, i\\neq j}^n \\mathbf{cov}(\\xi_i, \\xi_j) = \\frac1{n^2} \\sum_{i=1}^n \\sigma^2+ \\frac1{n^2} \\sum_{i=1, j=1, i\\neq j}^n \\rho \\sigma^2 =$$\n", 292 | "$$ = \\frac1{n^2} n \\sigma^2 + \\frac1{n^2} n(n-1) \\rho \\sigma^2 = \\frac{\\sigma^2( 1 + \\rho(n-1))}{n}$$\n", 293 | "\n", 294 | "Таким образом, чем менее величины скоррелированы между собой, тем меньше будет дисперсия после их усреднения. В этом и состоит усреднения прогнозов моделей: давайте сделаем много моделей, желательно с не очень большой корреляцией ответов, а потом усредим их прогнозы, тогда предсказание станет более устойчивым, чем у отдельных моделей." 
295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "**Опциональные теоретические задачи**\n", 302 | "\n", 303 | "1.\n", 304 | "\n", 305 | "Покажите, что матожидание квадрата ошибки модели, полученной усреднением $M$ регрессионных моделей $a_i(x)$, ответы коротых не скоррелированы, а матожидание $\\mathbf{E}_{x,y}\\left( a_i(x) - y \\right)$ равно нулю для каждой модели, будет в $M$ раз меньше, чем матожидание квадрата ошибки каждой модели $a_i(x)$\n", 306 | "\n", 307 | "2.\n", 308 | "\n", 309 | "Покажите, что средний квадрат отклонения от правильных ответов на обучающей выборке (**MSE**, mean squared error) у усредненной модели будет всегда не больше, чем среднее значение **MSE** на обучающей выборке по всем усредняемым моделям. *Подсказка: вам помогут неравенство Коши-Буняковского (оно же неравенство Шварца) или неравенство Йенсена.*\n", 310 | "\n", 311 | "3.\n", 312 | "\n", 313 | "Обобщите результат задачи 2 для любой выпуклой функции потерь." 
314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [] 322 | } 323 | ], 324 | "metadata": { 325 | "anaconda-cloud": {}, 326 | "kernelspec": { 327 | "display_name": "Python 3", 328 | "language": "python", 329 | "name": "python3" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 3 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython3", 341 | "version": "3.7.1" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 1 346 | } 347 | -------------------------------------------------------------------------------- /seminar0405/03_Boosting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Градиентный бустинг" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Бустинг это метод построения компизиции алгоритмов, в котором базовые алгоритмы строятся последовательно один за другим, причем каждый следующий алгоритм строится таким образом, чтобы уменьшить ошибку предыдущего." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Положим, что алгоритм это сумма некоторых базовых алгоритмов:\n", 22 | " $$a_N(x) = \\sum_{n=1}^N b_n(x)$$" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Пусть задана некоторая функция потерь, которую мы оптимизируем\n", 30 | "$$\\sum_{i=1}^l L(\\hat y_i, y_i) \\rightarrow min$$ \n", 31 | "\n", 32 | "\n", 33 | "Зададимся вопросом: а что если мы хотим добавить ещё один алгоритм в эту композицию, но не просто добавить, а как можно оптимальнее с точки зрения исходной оптимизационной задачи. 
То есть уже есть какой-то алгоритм $a_N(x)$ и мы хотим прибавить к нему базовый алгоритм $b_{N+1}(x)$:\n", 34 | "\n", 35 | "$$\\sum_{i=1}^l L(a_{N}(x_i) + b_{N+1}(x_i), y_i) \\to \\min_{b_{N+1}}$$\n", 36 | "\n", 37 | "Сначала имеет смысл решить более простую задачу: определить, какие значения $r_1 ,r_2 ..., r_l$ должен принимать алгоритм $b_N(x_i) = r_i$ на объектах обучающей выборки, чтобы ошибка на обучающей выборке была минимальной:\n", 38 | "\n", 39 | "$$F(r) = \\sum_{i=1}^l L(a_{N}(x_i) + r_i, y_i) \\to \\min_{r},$$\n", 40 | "\n", 41 | "где $r = (r_1, r_2, \\dots, r_l)$ - вектор сдвигов." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Поскольку направление наискорейшего убывания функции задается направлением антиградиента, его можно принять в качестве вектора $r$:\n", 49 | "$$r = -\\nabla F \\\\$$\n", 50 | "$$r_i = \\frac{\\partial{L}(a_N(x_i), y_i))}{\\partial{a_N(x_i)}}, \\ \\ \\ i = \\overline{1,l}$$" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Компоненты вектора $r$, фактически, являются теми значениями, которые на объектах обучающей выборки должен принимать новый алгоритм $b_{N+1}(x)$, чтобы минимизировать ошибку строящейся композиции. \n", 58 | "Обучение $b_{N+1}(x)$, таким образом, представляет собой *задачу обучения на размеченных данных*, в которой ${(x_i , r_i )}_{i=1}^l$ — обучающая выборка, и используется, например, квадратичная функция ошибки:\n", 59 | "$$b_{N+1}(x) = arg \\min_{b}\\sum_{i=1}^l(b(x_i) - r_i)^2$$" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Таким образом, можно подобрать неплохое улучшение текущего алгоритма $a_N(x)$, а потом ещё раз и ещё, в итоге получив комбинацию алгоритмов, которая будет минимизировать исходный функционал." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Если говорить более точно, в градиентном бустинге итоговый алгоритм строится не просто как сумма базовых алгоритмов, а как взвешенная сумма:\n", 74 | " $$a_N(x) = \\sum_{n=1}^N \\alpha_n b_n(x)$$\n", 75 | " \n", 76 | "Статегии подбора весов $\\alpha_n$ тоже могут быть разными по аналогии с подбором шага в градиентном спуске." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# Градиентный бустинг над решающими деревьями\n", 84 | "\n", 85 | "Наиболее популярное семейство алгоритмов для бустинга это деревья. Рассмотрим популярные библиотеки" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 1, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stderr", 95 | "output_type": "stream", 96 | "text": [ 97 | "/anaconda3/lib/python3.7/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n", 98 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 99 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 100 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 101 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "import pandas as pd\n", 107 | "import numpy as np\n", 108 | "from sklearn.model_selection import cross_val_score, train_test_split\n", 109 | "\n", 110 | "from xgboost import XGBClassifier\n", 111 | "from catboost import CatBoostClassifier\n", 112 | "from lightgbm import LGBMClassifier\n", 113 | "\n", 114 | "import warnings\n", 
115 | "warnings.filterwarnings('ignore')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 15, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "'2.2.3'" 127 | ] 128 | }, 129 | "execution_count": 15, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "import lightgbm\n", 136 | "lightgbm.__version__" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 3, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "data = pd.read_csv('HR.csv')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 4, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "(14999, 7)" 157 | ] 158 | }, 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "output_type": "execute_result" 162 | } 163 | ], 164 | "source": [ 165 | "data.shape" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/html": [ 176 | "
\n", 177 | "\n", 190 | "\n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | "
last_evaluationnumber_projectaverage_montly_hourstime_spend_companyWork_accidentleftpromotion_last_5years
00.5321573010
10.8652626000
20.8872724010
30.8752235010
40.5221593010
\n", 256 | "
" 257 | ], 258 | "text/plain": [ 259 | " last_evaluation number_project average_montly_hours time_spend_company \\\n", 260 | "0 0.53 2 157 3 \n", 261 | "1 0.86 5 262 6 \n", 262 | "2 0.88 7 272 4 \n", 263 | "3 0.87 5 223 5 \n", 264 | "4 0.52 2 159 3 \n", 265 | "\n", 266 | " Work_accident left promotion_last_5years \n", 267 | "0 0 1 0 \n", 268 | "1 0 0 0 \n", 269 | "2 0 1 0 \n", 270 | "3 0 1 0 \n", 271 | "4 0 1 0 " 272 | ] 273 | }, 274 | "execution_count": 3, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "data.head()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 4, 286 | "metadata": { 287 | "scrolled": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "X, y = data.drop('left', axis=1).values, data['left'].values" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "Качество классификации решающим деревом с настройками по-умолчанию:" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 5, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "XGBClassifier: 0.7791\n", 311 | "CPU times: user 1.32 s, sys: 7.94 ms, total: 1.33 s\n", 312 | "Wall time: 1.34 s\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "%%time\n", 318 | "print(\"XGBClassifier: {:.4f}\".format(cross_val_score(XGBClassifier(), X, y).mean()))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "CatBoostClassifier: 0.7644\n", 331 | "CPU times: user 27.1 s, sys: 5.33 s, total: 32.4 s\n", 332 | "Wall time: 9.66 s\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "%%time\n", 338 | "print(\"CatBoostClassifier: {:.4f}\".format(cross_val_score(CatBoostClassifier(iterations=100, verbose=False), X, y).mean()))" 339 | 
] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 7, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "LGBMClassifier: 0.7790\n", 351 | "CPU times: user 1.54 s, sys: 1.85 s, total: 3.39 s\n", 352 | "Wall time: 2 s\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "%%time\n", 358 | "print(\"LGBMClassifier: {:.4f}\".format(cross_val_score(LGBMClassifier(), X, y).mean()))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 12, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "?LGBMClassifier" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## Опциональное задание\n", 375 | "Поэкспериментируйте с основными параметрами алгоритмов, чтобы максимизировать качество" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [] 386 | } 387 | ], 388 | "metadata": { 389 | "anaconda-cloud": {}, 390 | "kernelspec": { 391 | "display_name": "Python 3", 392 | "language": "python", 393 | "name": "python3" 394 | }, 395 | "language_info": { 396 | "codemirror_mode": { 397 | "name": "ipython", 398 | "version": 3 399 | }, 400 | "file_extension": ".py", 401 | "mimetype": "text/x-python", 402 | "name": "python", 403 | "nbconvert_exporter": "python", 404 | "pygments_lexer": "ipython3", 405 | "version": "3.7.1" 406 | } 407 | }, 408 | "nbformat": 4, 409 | "nbformat_minor": 1 410 | } 411 | -------------------------------------------------------------------------------- /seminar0405/05_BiasVariance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Bias Variance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | 
"![title](ml_bias_variance.png)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Мы не будем выписывать строгие формулы, но попробуем объяснить идею этих понятий.\n", 22 | "\n", 23 | "Пусть у нас есть алгоритм обучения, который по данным может создать модель.\n", 24 | "\n", 25 | "Ошибка этих моделей может быть разложена на три части:\n", 26 | "* **Noise** – шум данных, непредсказуем, теоретический минимум ошибки\n", 27 | "* **Bias** – смещение, насколько хорошо работает средний алгоритм. Средний алгоритм это \"возьмём случайные данные, обучим алгоритм, сделаем предсказания\", **Bias** – это ошибка средних предсказаний.\n", 28 | "* **Variance** – разброс, насколько устойчиво работает алгоритм. Опять же \"возьмём случайные данные, обучим алгоритм, сделаем предсказания\", **Variance** – это разброс этих предсказаний." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Бустинг и Бэггинг в терминах Bias и Variance" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Как вы думаете, на какую составляющую Бустинг и Бэггинг влияют, а на какую нет?"
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stderr", 52 | "output_type": "stream", 53 | "text": [ 54 | "/anaconda3/lib/python3.7/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n", 55 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 56 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 57 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 58 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "import pandas as pd\n", 64 | "import numpy as np\n", 65 | "from sklearn.model_selection import cross_val_score, train_test_split\n", 66 | "from xgboost import XGBRegressor\n", 67 | "from catboost import CatBoostRegressor\n", 68 | "from lightgbm import LGBMRegressor\n", 69 | "from sklearn.ensemble import RandomForestRegressor\n", 70 | "from sklearn.tree import DecisionTreeRegressor\n", 71 | "from sklearn.linear_model import LinearRegression\n", 72 | "\n", 73 | "import warnings\n", 74 | "warnings.filterwarnings('ignore')" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "data = pd.read_csv('HR.csv')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 108 | "\n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | "
last_evaluationnumber_projectaverage_montly_hourstime_spend_companyWork_accidentleftpromotion_last_5years
00.5321573010
10.8652626000
20.8872724010
30.8752235010
40.5221593010
\n", 174 | "
" 175 | ], 176 | "text/plain": [ 177 | " last_evaluation number_project average_montly_hours time_spend_company \\\n", 178 | "0 0.53 2 157 3 \n", 179 | "1 0.86 5 262 6 \n", 180 | "2 0.88 7 272 4 \n", 181 | "3 0.87 5 223 5 \n", 182 | "4 0.52 2 159 3 \n", 183 | "\n", 184 | " Work_accident left promotion_last_5years \n", 185 | "0 0 1 0 \n", 186 | "1 0 0 0 \n", 187 | "2 0 1 0 \n", 188 | "3 0 1 0 \n", 189 | "4 0 1 0 " 190 | ] 191 | }, 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "data.head()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 4, 204 | "metadata": { 205 | "scrolled": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "X, y = data.drop('left', axis=1).values, data['left'].values" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "array([1, 0])" 221 | ] 222 | }, 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "data['left'].unique()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 6, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 7, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "def sample_model(seed, model):\n", 248 | " random_gen = np.random.RandomState(seed)\n", 249 | " indices = random_gen.choice(len(y_train), size=len(y_train), replace=True)\n", 250 | " model.fit(X_train[indices, :], y_train[indices])\n", 251 | " return model\n", 252 | "\n", 253 | "def estimate_bias_variance(model, iters_count=100):\n", 254 | " y_preds = []\n", 255 | " for seed in range(iters_count):\n", 256 | " model = sample_model(seed, model)\n", 257 | " 
y_preds.append(model.predict(X_test))\n", 258 | " y_preds = np.array(y_preds)\n", 259 | " \n", 260 | " print('Bias:', np.mean((y_test - y_preds.mean(axis=0)) ** 2))\n", 261 | " print('Variance:', y_preds.std(axis=0).mean())" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "**Линейная регрессия**" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 8, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "Bias: 0.226828932947683\n", 281 | "Variance: 0.010869101720111095\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "estimate_bias_variance(LinearRegression())" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "**Решающее дерево с max_depth=5**" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 9, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "Bias: 0.17064044012499532\n", 306 | "Variance: 0.040088790535583806\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "estimate_bias_variance(DecisionTreeRegressor(max_depth=5))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "**Решающее дерево с max_depth=10**" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 10, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "Bias: 0.16967137873478577\n", 331 | "Variance: 0.11481867526048847\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "estimate_bias_variance(DecisionTreeRegressor(max_depth=10))" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "**Решающее дерево с max_depth=15**" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 11, 349 
| "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "Bias: 0.17844968625676377\n", 356 | "Variance: 0.21862362924116188\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "estimate_bias_variance(DecisionTreeRegressor(max_depth=15))" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "**Решающее дерево без ограничения глубины**" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 12, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "Bias: 0.20331480319602732\n", 381 | "Variance: 0.3219244559334149\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "estimate_bias_variance(DecisionTreeRegressor(max_depth=None))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "**Случайный лес n_estimators=1**" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 13, 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "Bias: 0.19262199584781653\n", 406 | "Variance: 0.3556203158413508\n" 407 | ] 408 | } 409 | ], 410 | "source": [ 411 | "estimate_bias_variance(RandomForestRegressor(n_estimators=1, random_state=42))" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "**Случайный лес n_estimators=10**" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 14, 424 | "metadata": {}, 425 | "outputs": [ 426 | { 427 | "name": "stdout", 428 | "output_type": "stream", 429 | "text": [ 430 | "Bias: 0.1904910510615455\n", 431 | "Variance: 0.17132363783477825\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "estimate_bias_variance(RandomForestRegressor(n_estimators=10, random_state=42))" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | 
"metadata": {}, 442 | "source": [ 443 | "**Случайный лес n_estimators=50**" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 15, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "Bias: 0.19041806467037617\n", 456 | "Variance: 0.14149560741452163\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "estimate_bias_variance(RandomForestRegressor(n_estimators=50, random_state=42))" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "**XGBRegressor**" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "**Бустинг над деревьями max_depth=20**" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 16, 481 | "metadata": {}, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "Bias: 0.23488428479777498\n", 488 | "Variance: 0.023060445\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "estimate_bias_variance(XGBRegressor(n_estimators=1, max_depth=20))" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "**Бустинг над деревьями max_depth=10**" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 17, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "name": "stdout", 510 | "output_type": "stream", 511 | "text": [ 512 | "Bias: 0.23432487076945913\n", 513 | "Variance: 0.010548848\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "estimate_bias_variance(XGBRegressor(n_estimators=1, max_depth=10))" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "**Бустинг над деревьями max_depth=5**" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 18, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 
| "text": [ 537 | "Bias: 0.23513262193915907\n", 538 | "Variance: 0.0039053415\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "estimate_bias_variance(XGBRegressor(n_estimators=1, max_depth=5))" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "**Бустинг над деревьями n_estimators=10**" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 19, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "name": "stdout", 560 | "output_type": "stream", 561 | "text": [ 562 | "Bias: 0.1790620428039229\n", 563 | "Variance: 0.018647127\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "estimate_bias_variance(XGBRegressor(n_estimators=10, max_depth=5))" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "**Бустинг над деревьями n_estimators=100**" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 20, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "Bias: 0.16982377865413528\n", 588 | "Variance: 0.055819906\n" 589 | ] 590 | } 591 | ], 592 | "source": [ 593 | "estimate_bias_variance(XGBRegressor(n_estimators=100, max_depth=5))" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "**CatBoostRegressor**" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 21, 606 | "metadata": {}, 607 | "outputs": [ 608 | { 609 | "name": "stdout", 610 | "output_type": "stream", 611 | "text": [ 612 | "Bias: 0.34771886990134276\n", 613 | "Variance: 0.0012572709037321653\n" 614 | ] 615 | } 616 | ], 617 | "source": [ 618 | "estimate_bias_variance(CatBoostRegressor(n_estimators=1, max_depth=6, verbose=False))" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 22, 624 | "metadata": {}, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 
| "text": [ 630 | "Bias: 0.279855374689732\n", 631 | "Variance: 0.005055356404026022\n" 632 | ] 633 | } 634 | ], 635 | "source": [ 636 | "estimate_bias_variance(CatBoostRegressor(n_estimators=10, max_depth=6, verbose=False))" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 23, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "Bias: 0.1737535013844747\n", 649 | "Variance: 0.019539522639503766\n" 650 | ] 651 | } 652 | ], 653 | "source": [ 654 | "estimate_bias_variance(CatBoostRegressor(n_estimators=100, max_depth=6, verbose=False))" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "**LGBMRegressor**" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 24, 667 | "metadata": {}, 668 | "outputs": [ 669 | { 670 | "name": "stdout", 671 | "output_type": "stream", 672 | "text": [ 673 | "Bias: 0.21951401376098226\n", 674 | "Variance: 0.006486320224716261\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "estimate_bias_variance(LGBMRegressor(n_estimators=1, max_depth=5))" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 25, 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "name": "stdout", 689 | "output_type": "stream", 690 | "text": [ 691 | "Bias: 0.17805318308924556\n", 692 | "Variance: 0.019661935967302976\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "estimate_bias_variance(LGBMRegressor(n_estimators=10, max_depth=5))" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 26, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "name": "stdout", 707 | "output_type": "stream", 708 | "text": [ 709 | "Bias: 0.1703419744585601\n", 710 | "Variance: 0.05454402902000024\n" 711 | ] 712 | } 713 | ], 714 | "source": [ 715 | "estimate_bias_variance(LGBMRegressor(n_estimators=100, max_depth=5))" 716 | ] 717 | }, 718 | { 719 | 
"cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [] 724 | } 725 | ], 726 | "metadata": { 727 | "anaconda-cloud": {}, 728 | "kernelspec": { 729 | "display_name": "Python 3", 730 | "language": "python", 731 | "name": "python3" 732 | }, 733 | "language_info": { 734 | "codemirror_mode": { 735 | "name": "ipython", 736 | "version": 3 737 | }, 738 | "file_extension": ".py", 739 | "mimetype": "text/x-python", 740 | "name": "python", 741 | "nbconvert_exporter": "python", 742 | "pygments_lexer": "ipython3", 743 | "version": "3.7.1" 744 | } 745 | }, 746 | "nbformat": 4, 747 | "nbformat_minor": 1 748 | } 749 | -------------------------------------------------------------------------------- /seminar0405/06_ExtractFeaturesFromDecisionTree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Извлечение признаков, выделяемых деревьями" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pydotplus \n", 17 | "import pandas as pd\n", 18 | "import numpy as np\n", 19 | "\n", 20 | "from IPython.display import Image\n", 21 | "from sklearn import tree\n", 22 | "from sklearn.datasets import load_iris\n", 23 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 24 | "from sklearn.metrics import accuracy_score, classification_report\n", 25 | "from sklearn.linear_model import LogisticRegression" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "['last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years']\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "data = 
pd.read_csv('HR.csv')\n", 43 | "\n", 44 | "target = 'left'\n", 45 | "features = [c for c in data if c != target]\n", 46 | "print(features)\n", 47 | "\n", 48 | "X, y = data[features], data[target]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.tree import _tree\n", 58 | "\n", 59 | "def tree_to_code(tree, feature_names, return_class=True):\n", 60 | " \n", 61 | " '''\n", 62 | " Outputs a decision tree model as a Python function\n", 63 | " \n", 64 | " Parameters:\n", 65 | " -----------\n", 66 | " tree: decision tree model\n", 67 | " The decision tree to represent as a function\n", 68 | " feature_names: list\n", 69 | " The feature names of the dataset used for building the decision tree\n", 70 | " return class:\n", 71 | " Return most frequent class rather than number of elements of each class in the node\n", 72 | " '''\n", 73 | "\n", 74 | " tree_ = tree.tree_\n", 75 | " feature_name = [\n", 76 | " feature_names[i] if i != _tree.TREE_UNDEFINED else \"undefined!\"\n", 77 | " for i in tree_.feature\n", 78 | " ]\n", 79 | " print(\"def tree({}):\".format(\", \".join(feature_names)))\n", 80 | "\n", 81 | " def recurse(node, depth):\n", 82 | " indent = \" \" * depth\n", 83 | " if tree_.feature[node] != _tree.TREE_UNDEFINED:\n", 84 | " name = feature_name[node]\n", 85 | " threshold = tree_.threshold[node]\n", 86 | " print(\"{}if {} <= {}:\".format(indent, name, threshold))\n", 87 | " recurse(tree_.children_left[node], depth + 1)\n", 88 | " print(\"{}else:\".format(indent, name, threshold))\n", 89 | " recurse(tree_.children_right[node], depth + 1)\n", 90 | " else:\n", 91 | " value = tree_.value[node][0]\n", 92 | " if return_class and len(value) > 1:\n", 93 | " value = np.argmax(tree_.value[node])\n", 94 | "\n", 95 | " print(\"{}return {}\".format(indent, value))\n", 96 | "\n", 97 | " recurse(0, 1)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 
103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", 109 | " max_features=None, max_leaf_nodes=None,\n", 110 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 111 | " min_samples_leaf=1, min_samples_split=2,\n", 112 | " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", 113 | " splitter='best')" 114 | ] 115 | }, 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "data_sample = data.sample(200)\n", 123 | "model = tree.DecisionTreeClassifier(max_depth=2)\n", 124 | "model.fit(data_sample[features], data_sample[target])" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "scrolled": true 132 | }, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "def tree(last_evaluation, number_project, average_montly_hours, time_spend_company, Work_accident, promotion_last_5years):\n", 139 | " if number_project <= 2.5:\n", 140 | " if average_montly_hours <= 159.5:\n", 141 | " return 1\n", 142 | " else:\n", 143 | " return 0\n", 144 | " else:\n", 145 | " if number_project <= 6.5:\n", 146 | " return 0\n", 147 | " else:\n", 148 | " return 1\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "tree_to_code(model, features)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "Logistic Regression with base features: 0.6287053143962126\n" 166 | ] 167 | }, 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | "/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. 
The default value will change from 3 to 5 in version 0.22.\n", 173 | " warnings.warn(CV_WARNING, FutureWarning)\n", 174 | "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 175 | " FutureWarning)\n", 176 | "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 177 | " FutureWarning)\n", 178 | "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 179 | " FutureWarning)\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "print('Logistic Regression with base features:', cross_val_score(LogisticRegression(), data[features], data[target]).mean())" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 7, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "data['new_feature_01'] = (data.number_project <= 2.5) & (data.last_evaluation < 0.53)\n", 194 | "data['new_feature_02'] = 0 # your code" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 8, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/html": [ 205 | "
\n", 206 | "\n", 219 | "\n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | "
number_projectlast_evaluationnew_feature_01new_feature_02
020.53False0
150.86False0
270.88False0
350.87False0
\n", 260 | "
" 261 | ], 262 | "text/plain": [ 263 | " number_project last_evaluation new_feature_01 new_feature_02\n", 264 | "0 2 0.53 False 0\n", 265 | "1 5 0.86 False 0\n", 266 | "2 7 0.88 False 0\n", 267 | "3 5 0.87 False 0" 268 | ] 269 | }, 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "data[['number_project', 'last_evaluation', 'new_feature_01', 'new_feature_02']][:4]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 9, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "Logistic Regression with new features: 0.6749115556444623\n" 289 | ] 290 | }, 291 | { 292 | "name": "stderr", 293 | "output_type": "stream", 294 | "text": [ 295 | "/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n", 296 | " warnings.warn(CV_WARNING, FutureWarning)\n", 297 | "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 298 | " FutureWarning)\n", 299 | "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 300 | " FutureWarning)\n", 301 | "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. 
Specify a solver to silence this warning.\n", 302 | " FutureWarning)\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "new_features = [c for c in data if c != target]\n", 308 | "print('Logistic Regression with new features:', cross_val_score(LogisticRegression(), data[new_features], data[target]).mean())" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "## Опциональная задача\n", 316 | "Попробуйте добавить такие признаки, чтобы accuracy линейной модели стало сравнимым с RandomForest на 15 деревьях, то есть дошло до 0.73" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.7.1" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /seminar0405/ml_bias_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar0405/ml_bias_variance.png -------------------------------------------------------------------------------- /seminar06/02_ClusteringText.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Кластеризация текстов\n", 8 | "\n", 9 | "**Вопросы:**\n", 10 | "- В 
чём задача кластеризации текстов? \n", 11 | "- Что является объектами (samples), что такое признаки для этих объектов?" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 29, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import itertools\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "from sklearn.metrics import accuracy_score\n", 25 | "from sklearn.model_selection import cross_val_score\n", 26 | "\n", 27 | "from matplotlib import pyplot as plt\n", 28 | "from matplotlib.colors import ListedColormap\n", 29 | "\n", 30 | "from IPython.display import Image, SVG\n", 31 | "\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Выборка" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 30, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from sklearn.datasets import fetch_20newsgroups" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 31, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "train_all = fetch_20newsgroups(subset='train')\n", 66 | "print (train_all.target_names)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 32, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "simple_dataset = fetch_20newsgroups(\n", 76 | " subset='train', \n", 77 | " categories=['comp.sys.mac.hardware', 'soc.religion.christian', 
'rec.sport.hockey'])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Пример текста" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 33, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "From: erik@cheshire.oxy.edu (Erik Adams)\n", 97 | "Subject: HELP!! My Macintosh \"luggable\" has lines on its screen!\n", 98 | "Organization: Occidental College, Los Angeles, CA 90041 USA.\n", 99 | "Distribution: comp\n", 100 | "Lines: 20\n", 101 | "\n", 102 | "Okay, I don't use it very much, but I would like for it to keep working\n", 103 | "correctly, at least as long as Apple continues to make System software\n", 104 | "that will run on it, if slowly :-)\n", 105 | "\n", 106 | "Here is the problem: When the screen is tilted too far back, vertical\n", 107 | "lines appear on the screen. They are every 10 pixels or so, and seem\n", 108 | "to be affected somewhat by opening windows and pulling down menus.\n", 109 | "It looks to a semi-technical person like there is a loose connection\n", 110 | "between the screen and the rest of the computer.\n", 111 | "\n", 112 | "I am open to suggestions that do not involve buying a new computer,\n", 113 | "or taking this one to the shop. I would also like to not have\n", 114 | "to buy one of Larry Pina's books. 
I like Larry, but I'm not sure\n", 115 | "I feel strongly enough about the computer to buy a service manual\n", 116 | "for it.\n", 117 | "\n", 118 | "On a related note: what does the monitor connector connect to?\n", 119 | "\n", 120 | "Erik\n", 121 | "\n", 122 | "\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "print(simple_dataset.data[0])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "### Признаки" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 34, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "(1777, 3767)" 146 | ] 147 | }, 148 | "execution_count": 34, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 155 | "\n", 156 | "vectorizer = TfidfVectorizer(max_df=500, min_df=10)\n", 157 | "matrix = vectorizer.fit_transform(simple_dataset.data)\n", 158 | "matrix.shape" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Аггломеративная кластеризация" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 35, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "from sklearn.cluster.hierarchical import AgglomerativeClustering\n", 175 | "\n", 176 | "model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')\n", 177 | "preds = model.fit_predict(matrix.toarray())" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 36, 183 | "metadata": { 184 | "scrolled": true 185 | }, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "print(list(preds) [:10])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 37, 202 | 
"metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "array([0, 0, 1, ..., 0, 1, 2])" 208 | ] 209 | }, 210 | "execution_count": 37, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "simple_dataset.target" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 38, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "array([0, 0, 0, ..., 0, 2, 1])" 228 | ] 229 | }, 230 | "execution_count": 38, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "preds" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 39, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "0.3590320765334834\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "# Assessement\n", 254 | "mapping = {2 : 1, 1: 2, 0: 0}\n", 255 | "mapped_preds = [mapping[pred] for pred in preds]\n", 256 | "# print (float(sum(mapped_preds != simple_dataset.target)) / len(simple_dataset.target))\n", 257 | "print(accuracy_score(mapped_preds, simple_dataset.target))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 40, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "0.3590320765334834" 269 | ] 270 | }, 271 | "execution_count": 40, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "def validate_with_mappings(preds, target):\n", 278 | " permutations = itertools.permutations([0, 1, 2])\n", 279 | " accuracy_history = []\n", 280 | " for a, b, c in permutations:\n", 281 | " mapping = {2 : a, 1: b, 0: c}\n", 282 | " mapped_preds = [mapping[pred] for pred in preds]\n", 283 | " accuracy_history.append(accuracy_score(mapped_preds, target))\n", 284 | " return np.max(accuracy_history)\n", 285 | " \n", 286 | 
"validate_with_mappings(preds, simple_dataset.target)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## KMeans" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 41, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "[0 0 2 ... 0 2 1]\n", 306 | "[0 0 1 ... 0 1 2]\n" 307 | ] 308 | }, 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "0.9527293190770962" 313 | ] 314 | }, 315 | "execution_count": 41, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "from sklearn.cluster import KMeans\n", 322 | "\n", 323 | "model = KMeans(n_clusters=3, random_state=1)\n", 324 | "preds = model.fit_predict(matrix.toarray())\n", 325 | "print (preds)\n", 326 | "print (simple_dataset.target)\n", 327 | "validate_with_mappings(preds, simple_dataset.target)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 42, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "0.9853603185880773\n" 340 | ] 341 | }, 342 | { 343 | "name": "stderr", 344 | "output_type": "stream", 345 | "text": [ 346 | "/Users/vladvo/p3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n", 347 | " warnings.warn(CV_WARNING, FutureWarning)\n", 348 | "/Users/vladvo/p3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 349 | " FutureWarning)\n", 350 | "/Users/vladvo/p3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. 
Specify the multi_class option to silence this warning.\n", 351 | " \"this warning.\", FutureWarning)\n" 352 | ] 353 | } 354 | ], 355 | "source": [ 356 | "# Compare with Linear Regression\n", 357 | "from sklearn.linear_model import LogisticRegression\n", 358 | "clf = LogisticRegression()\n", 359 | "print (cross_val_score(clf, matrix, simple_dataset.target).mean())" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "**Вопрос:** очень высокая точность кластеризации текстов, очень близкая к точности Supervised алгоритма. Почему?" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## Более сложная выборка" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 43, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "noteasy_dataset = fetch_20newsgroups(\n", 383 | " subset='train', \n", 384 | " categories=['comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'comp.graphics'])\n", 385 | "matrix = vectorizer.fit_transform(noteasy_dataset.data)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 44, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "[0 1 2 ... 0 2 0]\n", 398 | "[2 1 1 ... 
2 0 2]\n" 399 | ] 400 | }, 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "0.753565316600114" 405 | ] 406 | }, 407 | "execution_count": 44, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "model = KMeans(n_clusters=3, random_state=1)\n", 414 | "preds = model.fit_predict(matrix.toarray())\n", 415 | "print (preds)\n", 416 | "print (noteasy_dataset.target)\n", 417 | "validate_with_mappings(preds, noteasy_dataset.target)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 45, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "0.917279226713189\n" 430 | ] 431 | }, 432 | { 433 | "name": "stderr", 434 | "output_type": "stream", 435 | "text": [ 436 | "/Users/vladvo/p3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n", 437 | " warnings.warn(CV_WARNING, FutureWarning)\n", 438 | "/Users/vladvo/p3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 439 | " FutureWarning)\n", 440 | "/Users/vladvo/p3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. 
Specify the multi_class option to silence this warning.\n", 441 | " \"this warning.\", FutureWarning)\n" 442 | ] 443 | } 444 | ], 445 | "source": [ 446 | "clf = LogisticRegression()\n", 447 | "print (cross_val_score(clf, matrix, noteasy_dataset.target).mean())" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "## SVD + KMeans" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 46, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/plain": [ 465 | "0.793496862521392" 466 | ] 467 | }, 468 | "execution_count": 46, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "from sklearn.decomposition import TruncatedSVD\n", 475 | "\n", 476 | "model = KMeans(n_clusters=3, random_state=42)\n", 477 | "svd = TruncatedSVD(n_components=1000, random_state=123)\n", 478 | "features = svd.fit_transform(matrix)\n", 479 | "preds = model.fit_predict(features)\n", 480 | "validate_with_mappings(preds, noteasy_dataset.target)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 47, 486 | "metadata": {}, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "0.7347404449515117" 492 | ] 493 | }, 494 | "execution_count": 47, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "model = KMeans(n_clusters=3, random_state=42)\n", 501 | "svd = TruncatedSVD(n_components=200, random_state=321)\n", 502 | "features = svd.fit_transform(matrix)\n", 503 | "preds = model.fit_predict(features)\n", 504 | "validate_with_mappings(preds, noteasy_dataset.target)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "\n", 512 | "**Вопрос:** всё равно сумели добиться довольно высокой точности. В чем причина?" 
513 | ] 514 | } 515 | ], 516 | "metadata": { 517 | "anaconda-cloud": {}, 518 | "kernelspec": { 519 | "display_name": "Python 3", 520 | "language": "python", 521 | "name": "python3" 522 | }, 523 | "language_info": { 524 | "codemirror_mode": { 525 | "name": "ipython", 526 | "version": 3 527 | }, 528 | "file_extension": ".py", 529 | "mimetype": "text/x-python", 530 | "name": "python", 531 | "nbconvert_exporter": "python", 532 | "pygments_lexer": "ipython3", 533 | "version": "3.7.2" 534 | } 535 | }, 536 | "nbformat": 4, 537 | "nbformat_minor": 1 538 | } 539 | -------------------------------------------------------------------------------- /seminar06/04_Word2Vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Word2Vec" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Будем обучать word2vec из пакета gensim на корпусе opencorpora.
\n", 15 | "Этот корпус небольшой, поэтому выдающихся результатов ждать не стоит, но зато обучение будет происходить быстро." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 33, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import gensim\n", 25 | "from gensim.models.word2vec import LineSentence, Word2Vec\n", 26 | "\n", 27 | "sentences = LineSentence('opencorpora_for_word2vec.txt')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 32, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "108904" 39 | ] 40 | }, 41 | "execution_count": 32, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "len(list(sentences))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 11, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "['в', 'история', 'программа', 'это', 'уже', 'не', 'первый', 'ребрендинг']" 59 | ] 60 | }, 61 | "execution_count": 11, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "list(sentences)[3]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 12, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "model = Word2Vec(sentences, size=300, window=5, min_count=5, workers=4, iter=20)\n", 77 | "model.init_sims(replace=True)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 23, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "Microsoft 0.7615377902984619\n", 90 | "Apple 0.7513457536697388\n", 91 | "Facebook 0.7315123081207275\n", 92 | "сервис 0.7023224830627441\n", 93 | "яндекс 0.656139612197876\n", 94 | "пользователь 0.6541165709495544\n", 95 | "YouTube 0.6529853343963623\n", 96 | "Chrome 0.6462385654449463\n", 97 | "рекламодатель 0.6389379501342773\n", 98 | "Yahoo 0.6317485570907593\n" 99 | ] 100 | } 
101 | ], 102 | "source": [ 103 | "for w, sim in model.wv.most_similar(positive=['Google']):\n", 104 | " print(w, sim)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 24, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "сотовый 0.7519944906234741\n", 117 | "мтс 0.7208024263381958\n", 118 | "мегафон 0.7087516784667969\n", 119 | "билайн 0.6689847707748413\n", 120 | "фирма 0.6550787687301636\n", 121 | "корпорация 0.6267942786216736\n", 122 | "абонент 0.6221226453781128\n", 123 | "вымпелком 0.61030513048172\n", 124 | "роснефть 0.6077439188957214\n", 125 | "аэрофлот 0.5987288355827332\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "for w, sim in model.wv.most_similar(positive=['оператор']):\n", 131 | " print(w, sim)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 25, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "девочка 0.6356844902038574\n", 144 | "старик 0.5878387093544006\n", 145 | "дама 0.5665048360824585\n", 146 | "муж 0.5535680055618286\n", 147 | "дядя 0.5503150224685669\n", 148 | "сестра 0.5483002662658691\n", 149 | "саша 0.5471621155738831\n", 150 | "пилю 0.5389496684074402\n", 151 | "мать 0.5370166897773743\n", 152 | "ратибор 0.5336532592773438\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "for w, sim in model.wv.most_similar(positive=['мальчик', 'женщина'], negative=['мужчина']):\n", 158 | " print(w, sim)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 26, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "петь 0.6344984769821167\n", 171 | "слушать 0.623060405254364\n", 172 | "гулять 0.6168084740638733\n", 173 | "бегать 0.6123511791229248\n", 174 | "этак 0.6120802164077759\n", 175 | "ездить 0.6004507541656494\n", 176 | "съездить 
0.5993729829788208\n", 177 | "танцевать 0.5975024700164795\n", 178 | "куда-то 0.5929892659187317\n", 179 | "сидеть 0.5858666896820068\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "for w, sim in model.wv.most_similar(positive=['ходить']):\n", 185 | " print(w, sim)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 27, 191 | "metadata": { 192 | "scrolled": true 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "сестра 0.6414291262626648\n", 200 | "двоюродный 0.6089733839035034\n", 201 | "дочь 0.5852426290512085\n", 202 | "дочка 0.5746685862541199\n", 203 | "мэри 0.567757785320282\n", 204 | "принцесса 0.5600994825363159\n", 205 | "сын 0.5554524660110474\n", 206 | "внук 0.5502660870552063\n", 207 | "аллах 0.5491375923156738\n", 208 | "надзиратель 0.5426605343818665\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "for w, sim in model.wv.most_similar(positive=[u'брат', u'жена'], negative=[u'муж']):\n", 214 | " print(w, sim)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 28, 220 | "metadata": { 221 | "scrolled": true 222 | }, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "машина\n" 229 | ] 230 | }, 231 | { 232 | "name": "stderr", 233 | "output_type": "stream", 234 | "text": [ 235 | "/Users/vladvo/p3/lib/python3.7/site-packages/gensim/models/keyedvectors.py:858: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. 
Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n", 236 | " vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "print(model.wv.doesnt_match(\"книга журнал машина\".split()))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 29, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "0.1479371" 253 | ] 254 | }, 255 | "execution_count": 29, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "model.wv.similarity('книга', 'телефон')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 30, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "array([ 0.08200409, 0.01317941, 0.03823986, -0.01235516, 0.08277431,\n", 273 | " 0.02984009, 0.05065874, -0.00751448, -0.06812365, -0.06842718,\n", 274 | " 0.00220764, 0.05788858, -0.01837883, -0.07313525, -0.10592317,\n", 275 | " -0.02601339, -0.05450147, 0.02554367, 0.06465277, -0.0061393 ,\n", 276 | " -0.01153112, 0.0003752 , 0.09866779, 0.0207594 , -0.06646861,\n", 277 | " 0.00194739, 0.09728996, -0.0094161 , 0.02228041, 0.0660712 ,\n", 278 | " -0.00196059, -0.0338694 , -0.0103764 , 0.01204176, -0.02358722,\n", 279 | " -0.07796294, -0.01034137, 0.04901107, -0.0022719 , -0.03768355,\n", 280 | " -0.05120618, -0.1436158 , -0.04697924, -0.03163331, -0.05223419,\n", 281 | " 0.07652589, 0.02253028, -0.08444852, 0.0453871 , -0.05328702,\n", 282 | " 0.04900654, -0.11324712, 0.06064839, -0.08099993, 0.03223145,\n", 283 | " -0.06607263, 0.00569813, 0.01862273, -0.00043284, -0.08484566,\n", 284 | " 0.04645833, 0.02002369, -0.04478256, -0.02011684, 0.05238428,\n", 285 | " -0.04248028, 0.0170895 , 0.03993165, -0.04428427, 0.09463288,\n", 286 | " -0.03971099, -0.09415146, 0.00749432, -0.03918408, 
0.09070554,\n", 287 | " -0.03251258, -0.00924277, 0.02876737, 0.06756766, -0.06433778,\n", 288 | " -0.04986037, 0.01585588, -0.08229308, 0.1475776 , -0.03404282,\n", 289 | " -0.01391482, -0.06584385, -0.00226703, 0.04444695, 0.1207349 ,\n", 290 | " 0.00283512, -0.00140148, -0.01686629, 0.1227438 , -0.10993808,\n", 291 | " 0.09023455, 0.07178912, 0.00825507, -0.03198511, -0.06751874,\n", 292 | " 0.01503683, 0.1140482 , 0.0038303 , 0.09342017, 0.00191473,\n", 293 | " -0.00428296, 0.05904026, -0.02730846, 0.00991113, 0.00437523,\n", 294 | " 0.06706332, 0.01135903, 0.01363238, 0.04106431, -0.01147577,\n", 295 | " 0.02052998, 0.09942389, 0.00793607, -0.04374001, 0.0690505 ,\n", 296 | " -0.05472995, -0.02318553, 0.02937523, -0.01143023, 0.01856702,\n", 297 | " 0.09057596, -0.11675534, -0.00631548, 0.01513159, -0.01193609,\n", 298 | " 0.05027112, -0.04742159, -0.02600596, -0.0039276 , -0.0028742 ,\n", 299 | " -0.00283522, 0.00618316, 0.01369382, 0.05576802, -0.04248947,\n", 300 | " -0.09040287, 0.08484566, 0.11228466, 0.01483243, -0.02098966,\n", 301 | " 0.07054627, -0.03229892, 0.05649905, 0.068694 , -0.00196452,\n", 302 | " 0.04367778, 0.0204044 , -0.02711828, 0.0921466 , 0.01139489,\n", 303 | " 0.02803337, 0.0134455 , 0.03076508, 0.04276306, -0.01909847,\n", 304 | " -0.05072219, 0.07204027, 0.08959574, -0.05955283, -0.05037621,\n", 305 | " 0.04592594, -0.01075466, 0.02027609, 0.05590595, 0.03214094,\n", 306 | " -0.10974209, -0.21272269, -0.00879799, -0.07040294, -0.0757833 ,\n", 307 | " 0.09261775, -0.006126 , -0.03810675, -0.0428365 , -0.02298157,\n", 308 | " 0.0370854 , 0.06754287, 0.07551537, 0.00562134, -0.05401517,\n", 309 | " 0.00856058, -0.02560093, 0.03903861, 0.05372471, 0.07085399,\n", 310 | " 0.03959638, -0.1378293 , -0.01988894, 0.04398787, 0.06145015,\n", 311 | " -0.04273438, -0.06192921, -0.00778198, -0.01894488, 0.04101792,\n", 312 | " 0.12011923, -0.00987583, -0.10845054, -0.05997426, -0.00474409,\n", 313 | " 0.03305182, -0.05851914, 0.07112689, 
-0.02893843, -0.04338023,\n", 314 | " -0.09425677, -0.05960175, 0.01402743, -0.04919544, -0.02071212,\n", 315 | " -0.00580984, 0.13776027, 0.097374 , -0.01311934, -0.0281004 ,\n", 316 | " 0.10519646, 0.00739926, -0.06161033, -0.00803537, 0.11364392,\n", 317 | " 0.0428979 , 0.03419597, 0.05728316, -0.0277433 , -0.07029855,\n", 318 | " 0.02340626, -0.07528384, -0.01912317, 0.00649373, 0.04975617,\n", 319 | " 0.03719334, 0.00321584, -0.11119477, 0.00780036, 0.04632334,\n", 320 | " 0.01579599, -0.12207319, -0.04174818, 0.09228403, 0.01668 ,\n", 321 | " -0.03713252, -0.01159889, 0.0689313 , -0.03544318, 0.02113092,\n", 322 | " 0.02040517, 0.03616699, -0.05464631, 0.01552263, -0.10600254,\n", 323 | " 0.05624925, -0.00167578, 0.05194965, 0.07514734, -0.08151952,\n", 324 | " -0.04583972, 0.08516677, 0.03505033, 0.03246606, 0.00691416,\n", 325 | " 0.02186404, -0.04164722, -0.00520659, 0.04597766, 0.09491676,\n", 326 | " 0.09678355, -0.05411908, -0.07278382, -0.08139971, -0.02455598,\n", 327 | " 0.0990073 , 0.09243791, -0.02848556, -0.04100103, -0.04105701,\n", 328 | " -0.01744788, 0.08472517, 0.00674182, 0.02118251, -0.0542472 ,\n", 329 | " -0.04408932, 0.11950909, -0.01991587, -0.07013162, 0.04673743,\n", 330 | " -0.03936329, -0.01744512, -0.02671363, -0.05457728, 0.05656848,\n", 331 | " -0.01160341, 0.01564648, 0.10300156, -0.0136303 , -0.03384643],\n", 332 | " dtype=float32)" 333 | ] 334 | }, 335 | "execution_count": 30, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "model.wv['книга']" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 
| "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.7.2" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 1 373 | } 374 | -------------------------------------------------------------------------------- /seminar06/transactions.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data-mining-in-action/DMIA_Base_2019_Spring/d0e3273b7967aae5d30bed53b921d96d66ab41e1/seminar06/transactions.csv.zip --------------------------------------------------------------------------------