├── .gitignore
└── Logistic regression.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
--------------------------------------------------------------------------------
/Logistic regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Логистическая регрессия"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Логистическая регрессия - метод, который позволяет исследовать взаимосвязи между\n",
17 | "# номинативной зависимой переменной (ЗП) и различными предикторами.\n",
18 | "\n",
19 | "# Уравнение логистической регрессии:"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "\\begin{equation}\n",
27 | "log(odds) = B_0 + B_1 x_1 + B_2 x_2 + ... + B_k x_k\n",
28 | "\\end{equation}"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# log - натуральный логарифм.\n",
38 | "# odds - шансы положительного исхода (далее просто \"шансы\")."
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "\\begin{equation}\n",
46 | "p = \\frac{e^{log(odds)}}{1 + e^{log(odds)}} = \\frac{exp(log(odds))}{1 + exp(log(odds))}\n",
47 | "\\end{equation}"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "# p - вероятность положительного исхода (далее просто \"вероятность\")."
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "## Шансы и вероятность"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "import pandas as pd\n",
73 | "import numpy as np\n",
74 | "import matplotlib.pyplot as plt\n",
75 | "import seaborn as sns\n",
76 | "import statsmodels.api as sm\n",
77 | "import statsmodels.formula.api as smf"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/html": [
88 | "
\n",
89 | "\n",
102 | "
\n",
103 | " \n",
104 | " \n",
105 | " | \n",
106 | " coin_state | \n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " | 0 | \n",
112 | " 0 | \n",
113 | "
\n",
114 | " \n",
115 | " | 1 | \n",
116 | " 0 | \n",
117 | "
\n",
118 | " \n",
119 | " | 2 | \n",
120 | " 1 | \n",
121 | "
\n",
122 | " \n",
123 | " | 3 | \n",
124 | " 0 | \n",
125 | "
\n",
126 | " \n",
127 | " | 4 | \n",
128 | " 0 | \n",
129 | "
\n",
130 | " \n",
131 | " | 5 | \n",
132 | " 0 | \n",
133 | "
\n",
134 | " \n",
135 | " | 6 | \n",
136 | " 1 | \n",
137 | "
\n",
138 | " \n",
139 | " | 7 | \n",
140 | " 0 | \n",
141 | "
\n",
142 | " \n",
143 | " | 8 | \n",
144 | " 0 | \n",
145 | "
\n",
146 | " \n",
147 | " | 9 | \n",
148 | " 0 | \n",
149 | "
\n",
150 | " \n",
151 | "
\n",
152 | "
"
153 | ],
154 | "text/plain": [
155 | " coin_state\n",
156 | "0 0\n",
157 | "1 0\n",
158 | "2 1\n",
159 | "3 0\n",
160 | "4 0\n",
161 | "5 0\n",
162 | "6 1\n",
163 | "7 0\n",
164 | "8 0\n",
165 | "9 0"
166 | ]
167 | },
168 | "execution_count": 5,
169 | "metadata": {},
170 | "output_type": "execute_result"
171 | }
172 | ],
173 | "source": [
174 | "# Разберем на примере подбрасывания монетки. Пусть у нас было 10 бросков и мы получили следующие результаты:\n",
175 | "# 1 - орел (в примере мы хотим, чтобы выпадали орлы, это будет положительный исход)\n",
176 | "# 0 - решка (отрицательный исход)\n",
177 | "\n",
178 | "df = pd.DataFrame({'coin_state': [0, 0, 1, 0, 0, 0, 1, 0, 0, 0]})\n",
179 | "df"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 6,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | "0 8\n",
191 | "1 2\n",
192 | "Name: coin_state, dtype: int64"
193 | ]
194 | },
195 | "execution_count": 6,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "# 2 орла, 8 решек\n",
202 | "df.coin_state.value_counts()"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 7,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "# Рассчитаем вероятность:"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "\\begin{equation}\n",
219 | "p = \\frac{pos}{pos + neg}\n",
220 | "\\end{equation} "
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 8,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "data": {
230 | "text/plain": [
231 | "0.2"
232 | ]
233 | },
234 | "execution_count": 8,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "pos = df.coin_state.value_counts()[1] # количество положительных исходов\n",
241 | "neg = df.coin_state.value_counts()[0] # количество отрицательных исходов\n",
242 | "\n",
243 | "p = pos / (pos + neg)\n",
244 | "p"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 9,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "# Рассчитаем шансы:"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "\\begin{equation}\n",
261 | "odds = \\frac{pos}{neg}\n",
262 | "\\end{equation}"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 10,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "data": {
272 | "text/plain": [
273 | "0.25"
274 | ]
275 | },
276 | "execution_count": 10,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "odds = pos / neg\n",
283 | "odds"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 11,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "# Шансы можно посчитать, зная вероятность:"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "\\begin{equation}\n",
300 | "odds = \\frac{p}{1 - p}\n",
301 | "\\end{equation}"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 12,
307 | "metadata": {},
308 | "outputs": [
309 | {
310 | "data": {
311 | "text/plain": [
312 | "0.25"
313 | ]
314 | },
315 | "execution_count": 12,
316 | "metadata": {},
317 | "output_type": "execute_result"
318 | }
319 | ],
320 | "source": [
321 | "p / (1 - p)"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 13,
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "# Вероятность можно посчитать, зная шансы:"
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "\\begin{equation}\n",
338 | "p = \\frac{odds}{1 + odds}\n",
339 | "\\end{equation}"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 14,
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "data": {
349 | "text/plain": [
350 | "0.2"
351 | ]
352 | },
353 | "execution_count": 14,
354 | "metadata": {},
355 | "output_type": "execute_result"
356 | }
357 | ],
358 | "source": [
359 | "odds / (1 + odds)"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "metadata": {},
365 | "source": [
366 | "## При чем тут натуральный логарифм?"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 15,
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "# Проследим за различным соотношением положительных и отрицательных исходов\n",
376 | "# и рассчитанными на их основе вероятностью, шансами и натуральным логарифмом шансов (далее просто \"логарифм шансов\").\n",
377 | "\n",
378 | "# Вероятность - число от нуля до единицы\n",
379 | "# Шансы - число от нуля до плюс бесконечности\n",
380 | "# Логарифм шансов - число от минус бесконечности до плюс бесконечности"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 16,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "data": {
390 | "text/html": [
391 | "\n",
392 | "\n",
405 | "
\n",
406 | " \n",
407 | " \n",
408 | " | \n",
409 | " pos | \n",
410 | " neg | \n",
411 | " p | \n",
412 | " odds | \n",
413 | " log(odds) | \n",
414 | "
\n",
415 | " \n",
416 | " \n",
417 | " \n",
418 | " | 0 | \n",
419 | " 0 | \n",
420 | " 10 | \n",
421 | " 0.0 | \n",
422 | " 0.00 | \n",
423 | " NaN | \n",
424 | "
\n",
425 | " \n",
426 | " | 1 | \n",
427 | " 1 | \n",
428 | " 0 | \n",
429 | " 0.1 | \n",
430 | " 0.11 | \n",
431 | " -2.20 | \n",
432 | "
\n",
433 | " \n",
434 | " | 2 | \n",
435 | " 2 | \n",
436 | " 8 | \n",
437 | " 0.2 | \n",
438 | " 0.25 | \n",
439 | " -1.39 | \n",
440 | "
\n",
441 | " \n",
442 | " | 3 | \n",
443 | " 3 | \n",
444 | " 7 | \n",
445 | " 0.3 | \n",
446 | " 0.43 | \n",
447 | " -0.85 | \n",
448 | "
\n",
449 | " \n",
450 | " | 4 | \n",
451 | " 4 | \n",
452 | " 6 | \n",
453 | " 0.4 | \n",
454 | " 0.66 | \n",
455 | " -0.41 | \n",
456 | "
\n",
457 | " \n",
458 | " | 5 | \n",
459 | " 5 | \n",
460 | " 5 | \n",
461 | " 0.5 | \n",
462 | " 1.00 | \n",
463 | " 0.00 | \n",
464 | "
\n",
465 | " \n",
466 | " | 6 | \n",
467 | " 6 | \n",
468 | " 4 | \n",
469 | " 0.6 | \n",
470 | " 1.50 | \n",
471 | " 0.41 | \n",
472 | "
\n",
473 | " \n",
474 | " | 7 | \n",
475 | " 7 | \n",
476 | " 3 | \n",
477 | " 0.7 | \n",
478 | " 2.33 | \n",
479 | " 0.85 | \n",
480 | "
\n",
481 | " \n",
482 | " | 8 | \n",
483 | " 8 | \n",
484 | " 2 | \n",
485 | " 0.8 | \n",
486 | " 4.00 | \n",
487 | " 1.39 | \n",
488 | "
\n",
489 | " \n",
490 | " | 9 | \n",
491 | " 9 | \n",
492 | " 1 | \n",
493 | " 0.9 | \n",
494 | " 9.00 | \n",
495 | " 2.20 | \n",
496 | "
\n",
497 | " \n",
498 | " | 10 | \n",
499 | " 10 | \n",
500 | " 0 | \n",
501 | " 1.0 | \n",
502 | " NaN | \n",
503 | " NaN | \n",
504 | "
\n",
505 | " \n",
506 | "
\n",
507 | "
"
508 | ],
509 | "text/plain": [
510 | " pos neg p odds log(odds)\n",
511 | "0 0 10 0.0 0.00 NaN\n",
512 | "1 1 0 0.1 0.11 -2.20\n",
513 | "2 2 8 0.2 0.25 -1.39\n",
514 | "3 3 7 0.3 0.43 -0.85\n",
515 | "4 4 6 0.4 0.66 -0.41\n",
516 | "5 5 5 0.5 1.00 0.00\n",
517 | "6 6 4 0.6 1.50 0.41\n",
518 | "7 7 3 0.7 2.33 0.85\n",
519 | "8 8 2 0.8 4.00 1.39\n",
520 | "9 9 1 0.9 9.00 2.20\n",
521 | "10 10 0 1.0 NaN NaN"
522 | ]
523 | },
524 | "execution_count": 16,
525 | "metadata": {},
526 | "output_type": "execute_result"
527 | }
528 | ],
529 | "source": [
530 | "pd.DataFrame({\n",
531 | " 'pos': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
532 | " 'neg': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], # pos + neg == 10 in every row\n",
533 | " 'p': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], # pos / (pos + neg)\n",
534 | " 'odds': [0, 0.11, 0.25, 0.43, 0.67, 1, 1.5, 2.33, 4, 9, np.nan], # pos / neg, rounded to 2 places; NaN where neg == 0\n",
535 | " 'log(odds)': [np.nan, -2.2, -1.39, -0.85, -0.41, 0, 0.41, 0.85, 1.39, 2.2, np.nan], # natural log of odds; NaN at the two boundary rows\n",
536 | "})"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": 17,
542 | "metadata": {},
543 | "outputs": [
544 | {
545 | "data": {
546 | "image/png": "\n",
547 | "text/plain": [
548 | ""
549 | ]
550 | },
551 | "metadata": {
552 | "needs_background": "light"
553 | },
554 | "output_type": "display_data"
555 | }
556 | ],
557 | "source": [
558 | "# Вспомним уравнение логистической регрессии - в правой части число от минус бесконечности до плюс бесконечности,\n",
559 | "# в левой - логарифм шансов, зная который, мы можем рассчитать вероятность.\n",
560 | "\n",
561 | "# Визуализируем зависимость вероятности от логарифма шансов:\n",
562 | "\n",
563 | "log_odds = np.linspace(-6, 6, 121) # 0.1 spacing, includes the right endpoint 6 so the curve spans the whole x-axis\n",
564 | "p = np.exp(log_odds) / (1 + np.exp(log_odds)) # inverse logit: converts each log(odds) value into a probability\n",
565 | "\n",
566 | "plt.plot(log_odds, p)\n",
567 | "plt.grid()\n",
568 | "plt.xlim(-6, 6)\n",
569 | "plt.ylim(-0.1, 1.1)\n",
570 | "plt.title('Зависимость вероятности от логарифма шансов', fontsize=12)\n",
571 | "plt.xlabel('log(odds)', fontsize=12)\n",
572 | "plt.ylabel('p', fontsize=12)\n",
573 | "\n",
574 | "plt.show()\n",
575 | "plt.close()"
576 | ]
577 | },
578 | {
579 | "cell_type": "markdown",
580 | "metadata": {},
581 | "source": [
582 | "## Модель без предикторов"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 18,
588 | "metadata": {},
589 | "outputs": [
590 | {
591 | "data": {
592 | "text/html": [
593 | "\n",
594 | "\n",
607 | "
\n",
608 | " \n",
609 | " \n",
610 | " | \n",
611 | " PassengerId | \n",
612 | " Survived | \n",
613 | " Pclass | \n",
614 | " Name | \n",
615 | " Sex | \n",
616 | " Age | \n",
617 | " SibSp | \n",
618 | " Parch | \n",
619 | " Ticket | \n",
620 | " Fare | \n",
621 | " Cabin | \n",
622 | " Embarked | \n",
623 | "
\n",
624 | " \n",
625 | " \n",
626 | " \n",
627 | " | 0 | \n",
628 | " 1 | \n",
629 | " 0 | \n",
630 | " 3 | \n",
631 | " Braund, Mr. Owen Harris | \n",
632 | " male | \n",
633 | " 22.0 | \n",
634 | " 1 | \n",
635 | " 0 | \n",
636 | " A/5 21171 | \n",
637 | " 7.2500 | \n",
638 | " NaN | \n",
639 | " S | \n",
640 | "
\n",
641 | " \n",
642 | " | 1 | \n",
643 | " 2 | \n",
644 | " 1 | \n",
645 | " 1 | \n",
646 | " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
647 | " female | \n",
648 | " 38.0 | \n",
649 | " 1 | \n",
650 | " 0 | \n",
651 | " PC 17599 | \n",
652 | " 71.2833 | \n",
653 | " C85 | \n",
654 | " C | \n",
655 | "
\n",
656 | " \n",
657 | " | 2 | \n",
658 | " 3 | \n",
659 | " 1 | \n",
660 | " 3 | \n",
661 | " Heikkinen, Miss. Laina | \n",
662 | " female | \n",
663 | " 26.0 | \n",
664 | " 0 | \n",
665 | " 0 | \n",
666 | " STON/O2. 3101282 | \n",
667 | " 7.9250 | \n",
668 | " NaN | \n",
669 | " S | \n",
670 | "
\n",
671 | " \n",
672 | " | 3 | \n",
673 | " 4 | \n",
674 | " 1 | \n",
675 | " 1 | \n",
676 | " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
677 | " female | \n",
678 | " 35.0 | \n",
679 | " 1 | \n",
680 | " 0 | \n",
681 | " 113803 | \n",
682 | " 53.1000 | \n",
683 | " C123 | \n",
684 | " S | \n",
685 | "
\n",
686 | " \n",
687 | " | 4 | \n",
688 | " 5 | \n",
689 | " 0 | \n",
690 | " 3 | \n",
691 | " Allen, Mr. William Henry | \n",
692 | " male | \n",
693 | " 35.0 | \n",
694 | " 0 | \n",
695 | " 0 | \n",
696 | " 373450 | \n",
697 | " 8.0500 | \n",
698 | " NaN | \n",
699 | " S | \n",
700 | "
\n",
701 | " \n",
702 | "
\n",
703 | "
"
704 | ],
705 | "text/plain": [
706 | " PassengerId Survived Pclass \\\n",
707 | "0 1 0 3 \n",
708 | "1 2 1 1 \n",
709 | "2 3 1 3 \n",
710 | "3 4 1 1 \n",
711 | "4 5 0 3 \n",
712 | "\n",
713 | " Name Sex Age SibSp \\\n",
714 | "0 Braund, Mr. Owen Harris male 22.0 1 \n",
715 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
716 | "2 Heikkinen, Miss. Laina female 26.0 0 \n",
717 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
718 | "4 Allen, Mr. William Henry male 35.0 0 \n",
719 | "\n",
720 | " Parch Ticket Fare Cabin Embarked \n",
721 | "0 0 A/5 21171 7.2500 NaN S \n",
722 | "1 0 PC 17599 71.2833 C85 C \n",
723 | "2 0 STON/O2. 3101282 7.9250 NaN S \n",
724 | "3 0 113803 53.1000 C123 S \n",
725 | "4 0 373450 8.0500 NaN S "
726 | ]
727 | },
728 | "execution_count": 18,
729 | "metadata": {},
730 | "output_type": "execute_result"
731 | }
732 | ],
733 | "source": [
734 | "# Датасет Титаника\n",
735 | "\n",
736 | "df = pd.read_csv('https://stepic.org/media/attachments/course/524/train.csv')\n",
737 | "df = df[df.Age.notnull()]\n",
738 | "df.head()"
739 | ]
740 | },
741 | {
742 | "cell_type": "code",
743 | "execution_count": 19,
744 | "metadata": {},
745 | "outputs": [
746 | {
747 | "data": {
748 | "text/html": [
749 | "\n",
750 | "Generalized Linear Model Regression Results\n",
751 | "\n",
752 | " | Dep. Variable: | Survived | No. Observations: | 714 | \n",
753 | "
\n",
754 | "\n",
755 | " | Model: | GLM | Df Residuals: | 713 | \n",
756 | "
\n",
757 | "\n",
758 | " | Model Family: | Binomial | Df Model: | 0 | \n",
759 | "
\n",
760 | "\n",
761 | " | Link Function: | logit | Scale: | 1.0000 | \n",
762 | "
\n",
763 | "\n",
764 | " | Method: | IRLS | Log-Likelihood: | -482.26 | \n",
765 | "
\n",
766 | "\n",
767 | " | Date: | Tue, 08 Jun 2021 | Deviance: | 964.52 | \n",
768 | "
\n",
769 | "\n",
770 | " | Time: | 20:53:33 | Pearson chi2: | 714. | \n",
771 | "
\n",
772 | "\n",
773 | " | No. Iterations: | 4 | | | \n",
774 | "
\n",
775 | "\n",
776 | " | Covariance Type: | nonrobust | | | \n",
777 | "
\n",
778 | "
\n",
779 | "\n",
780 | "\n",
781 | " | coef | std err | z | P>|z| | [0.025 | 0.975] | \n",
782 | "
\n",
783 | "\n",
784 | " | Intercept | -0.3799 | 0.076 | -4.985 | 0.000 | -0.529 | -0.230 | \n",
785 | "
\n",
786 | "
"
787 | ],
788 | "text/plain": [
789 | "\n",
790 | "\"\"\"\n",
791 | " Generalized Linear Model Regression Results \n",
792 | "==============================================================================\n",
793 | "Dep. Variable: Survived No. Observations: 714\n",
794 | "Model: GLM Df Residuals: 713\n",
795 | "Model Family: Binomial Df Model: 0\n",
796 | "Link Function: logit Scale: 1.0000\n",
797 | "Method: IRLS Log-Likelihood: -482.26\n",
798 | "Date: Tue, 08 Jun 2021 Deviance: 964.52\n",
799 | "Time: 20:53:33 Pearson chi2: 714.\n",
800 | "No. Iterations: 4 \n",
801 | "Covariance Type: nonrobust \n",
802 | "==============================================================================\n",
803 | " coef std err z P>|z| [0.025 0.975]\n",
804 | "------------------------------------------------------------------------------\n",
805 | "Intercept -0.3799 0.076 -4.985 0.000 -0.529 -0.230\n",
806 | "==============================================================================\n",
807 | "\"\"\""
808 | ]
809 | },
810 | "execution_count": 19,
811 | "metadata": {},
812 | "output_type": "execute_result"
813 | }
814 | ],
815 | "source": [
816 | "glm_binomial = smf.glm(formula='Survived ~ 1', data=df, family=sm.families.Binomial()).fit()\n",
817 | "glm_binomial.summary()\n",
818 | "\n",
819 | "# Intercept - логарифм шансов (шансов выжить)"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": 20,
825 | "metadata": {},
826 | "outputs": [],
827 | "source": [
828 | "# Получаем модель:\n",
829 | "# log(odds) = -0.3799"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 21,
835 | "metadata": {},
836 | "outputs": [
837 | {
838 | "data": {
839 | "text/plain": [
840 | "0 424\n",
841 | "1 290\n",
842 | "Name: Survived, dtype: int64"
843 | ]
844 | },
845 | "execution_count": 21,
846 | "metadata": {},
847 | "output_type": "execute_result"
848 | }
849 | ],
850 | "source": [
851 | "# Распределение частот ЗП\n",
852 | "df.Survived.value_counts()"
853 | ]
854 | },
855 | {
856 | "cell_type": "code",
857 | "execution_count": 22,
858 | "metadata": {},
859 | "outputs": [
860 | {
861 | "data": {
862 | "text/plain": [
863 | ""
864 | ]
865 | },
866 | "execution_count": 22,
867 | "metadata": {},
868 | "output_type": "execute_result"
869 | },
870 | {
871 | "data": {
872 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEGCAYAAACKB4k+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAASuUlEQVR4nO3dfZBdd33f8fcHyQ9pTPyA1tRImspD1CQmEwRsjIv/KLUzjXEa5KSYmkJQqGcEMyaFSdrEpH/ElHqadAoukOBEqY1lJsVWINQq4ya4Ni4hBJsVEcayQ1HBQYsUa40fwKFxK/HtH/e3x4t0JV3bOveufd+vmTv3nO/5nbPf9Wj24/OcqkKSJIDnTboBSdLyYShIkjqGgiSpYyhIkjqGgiSps3LSDTwTq1atqnXr1k26DUl6VtmxY8dDVTUzbNmzOhTWrVvH3NzcpNuQpGeVJH91pGUePpIkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdZ7VdzQfD3P/8m2TbkHL0OwHfnfSLUgT4Z6CJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKnTeygkWZHkL5J8ss2fneSuJF9NcnOSE1v9pDa/uy1f13dvkqTvN449hXcA9y+Z/y3gmqpaDzwCXN7qlwOPVNUPA9e0cZKkMeo1FJKsAX4G+M9tPsAFwMfakK3AJW16Y5unLb+wjZckjUnfewr/CfhV4Htt/gXAo1V1oM3PA6vb9GpgD0Bb/lgb/32SbE4yl2RuYWGhz94laer0FgpJ/gmwv6p2LC0PGVojLHuyULWlqmaranZmZuY4dCpJWtTnA/HOB16b5GLgZOCHGOw5nJZkZdsbWAPsbePngbXAfJKVwKnAwz32J0k6RG97ClX1rqpaU1XrgMuAO6rqjcCngde1YZuAW9r09jZPW35HVR22pyBJ6s8k7lP4NeCXk+xmcM7gula/DnhBq/8ycOUEepOkqTaW9ylU1Z3AnW36a8C5Q8b8LXDpOPqRJA3nHc2SpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnq9BYKSU5OcneSLyXZleTdrX5Dkq8n2dk+G1o9ST6QZHeSe5K8vK/eJEnD9fnmtSeAC6rq8SQnAJ9N8t/bsn9dVR87ZPxrgPXt80rg2vYtSRqT3vYUauDxNntC+9RRVtkI3NjW+zxwWpKz+upPknS4Xs8pJFmRZCewH7itqu5qi65uh4iuSXJSq60G9ixZfb7VDt3m5iRzSeYWFhb6bF+Spk6voVBVB6tqA7AGODfJjwPvAn4U+EngDODX2vAM28SQbW6pqtmqmp2Zmempc0maTmO5+qiqHgXuBC6qqn3tENETwIeBc9uweWDtktXWAHvH0Z8kaaDPq49mkpzWpn8A+CngLxfPEyQJcAlwb1tlO/DmdhXSecBjVbWvr/4kSYfr8+qjs4CtSVYwCJ9tVfXJJHckmWFwuGgn8LY2/lbgYmA38F3gLT32JkkaordQqKp7gJcNqV9whPEFXNFXP5KkY/OOZklSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSp883r52c5O4kX0qyK8m7W/3sJHcl+WqSm5Oc2Oontfndbfm6vnqTJA3X557CE8AFVfVSYANwUXvN5m8B11TVeuAR4PI2/nLgkar6YeCaNk6SNEa9hUINPN5mT2ifAi4APtbqWxm8pxlgY5unLb+wvcdZkjQmvZ5TSLIiyU5gP3Ab8L+BR6vqQBsyD6xu06uBPQBt+WPAC4Zsc3OSuSRzCwsLfbYvSVOn11CoqoNVtQFYA5wL/NiwYe172F5BHVao2lJVs1U1OzMzc/yalSSN
5+qjqnoUuBM4Dzgtycq2aA2wt03PA2sB2vJTgYfH0Z8kaaDPq49mkpzWpn8A+CngfuDTwOvasE3ALW16e5unLb+jqg7bU5Ak9WflsYc8bWcBW5OsYBA+26rqk0nuA25K8u+AvwCua+OvAz6SZDeDPYTLeuxNkjREb6FQVfcALxtS/xqD8wuH1v8WuLSvfqRnm7d9bm7SLWgZ+t1Xzfa6fe9oliR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUqfP13GuTfLpJPcn2ZXkHa1+VZJvJtnZPhcvWeddSXYn+UqSn+6rN0nScH2+jvMA8CtV9cUkzwd2JLmtLbumqv7j0sFJzmHwCs6XAC8C/keSv19VB3vsUZK0RG97ClW1r6q+2Ka/A9wPrD7KKhuBm6rqiar6OrCbIa/tlCT1ZyznFJKsY/C+5rta6e1J7klyfZLTW201sGfJavMMCZEkm5PMJZlbWFjosWtJmj69h0KSU4CPA++sqm8D1wIvBjYA+4D3Lg4dsnodVqjaUlWzVTU7MzPTU9eSNJ1GCoUkt49SGzLmBAaB8AdV9UcAVfVgVR2squ8Bv8+Th4jmgbVLVl8D7B2lP0nS8XHUUEhycpIzgFVJTk9yRvusY3Ay+GjrBrgOuL+q3rekftaSYT8H3NumtwOXJTkpydnAeuDup/oLSZKevmNdffRW4J0MAmAHTx7i+TbwO8dY93zgF4AvJ9nZar8OvCHJBgaHhh5oP4Oq2pVkG3AfgyuXrvDKI0kar6OGQlW9H3h/kl+qqg8+lQ1X1WcZfp7g1qOsczVw9VP5OZKk42ek+xSq6oNJXgWsW7pOVd3YU1+SpAkYKRSSfITBFUM7gcVDOgUYCpL0HDLqHc2zwDlVddglopKk545R71O4F/i7fTYiSZq8UfcUVgH3JbkbeGKxWFWv7aUrSdJEjBoKV/XZhCRpeRj16qP/2XcjkqTJG/Xqo+/w5HOITgROAP6mqn6or8YkSeM36p7C85fOJ7kEH2stSc85T+spqVX1X4ELjnMvkqQJG/Xw0c8vmX0eg/sWvGdBkp5jRr366GeXTB9g8CC7jce9G0nSRI16TuEtfTciSZq8UV+ysybJJ5LsT/Jgko8nWdN3c5Kk8Rr1RPOHGbwE50UM3pv831pNkvQcMmoozFTVh6vqQPvcABz1BclJ1ib5dJL7k+xK8o5WPyPJbUm+2r5Pb/Uk+UCS3UnuSfLyZ/SbSZKeslFD4aEkb0qyon3eBHzrGOscAH6lqn4MOA+4Isk5wJXA7VW1Hri9zQO8hsErONcDm4Frn+LvIkl6hkYNhX8BvB74a2Af8DrgqCefq2pfVX2xTX8HuJ/BoaeNwNY2bCtwSZveCNxYA58HTjvkfc6SpJ6NGgrvATZV1UxVnckgJK4a9YckWQe8DLgLeGFV7YNBcABntmGrgT1LVptvtUO3tTnJXJK5hYWFUVuQJI1g1FD4iap6ZHGmqh5m8Ef+mJKcAnwceGdVfftoQ4fUDrtBrqq2VNVsVc3OzBz1tIYk6SkaNRSet3hCGAYnixnhHockJzAIhD+oqj9q5QcXDwu17/2tPg+sXbL6GmDviP1Jko6DUUPhvcDnkrwnyb8FPgf8h6OtkCTAdcD9VfW+JYu2A5va9CbgliX1N7erkM4DHls8zCRJGo9R72i+Mckcg4fgBfj5qrrvGKudD/wC8OUkO1vt14HfBLYluRz4BnBpW3YrcDGwG/guxziRLUk6/kZ99hEtBI4VBEvHf5bh5wkALhwyvoArRt2+JOn4e1qPzpYkPTcZCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSer0FgpJrk+yP8m9S2pXJflmkp3tc/GSZe9KsjvJV5L8dF99SZKOrM89hRuAi4bUr6mqDe1z
K0CSc4DLgJe0dT6UZEWPvUmShugtFKrqM8DDIw7fCNxUVU9U1dcZvJLz3L56kyQNN4lzCm9Pck87vHR6q60G9iwZM99qkqQxGncoXAu8GNgA7APe2+rD3uVcwzaQZHOSuSRzCwsL/XQpSVNqrKFQVQ9W1cGq+h7w+zx5iGgeWLtk6Bpg7xG2saWqZqtqdmZmpt+GJWnKjDUUkpy1ZPbngMUrk7YDlyU5KcnZwHrg7nH2JkmClX1tOMlHgVcDq5LMA78BvDrJBgaHhh4A3gpQVbuSbAPuAw4AV1TVwb56kyQN11soVNUbhpSvO8r4q4Gr++pHknRs3tEsSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkTm+hkOT6JPuT3LukdkaS25J8tX2f3upJ8oEku5Pck+TlffUlSTqyPvcUbgAuOqR2JXB7Va0Hbm/zAK9h8F7m9cBm4Noe+5IkHUFvoVBVnwEePqS8EdjaprcClyyp31gDnwdOS3JWX71JkoYb9zmFF1bVPoD2fWarrwb2LBk332qHSbI5yVySuYWFhV6blaRps1xONGdIrYYNrKotVTVbVbMzMzM9tyVJ02XcofDg4mGh9r2/1eeBtUvGrQH2jrk3SZp64w6F7cCmNr0JuGVJ/c3tKqTzgMcWDzNJksZnZV8bTvJR4NXAqiTzwG8AvwlsS3I58A3g0jb8VuBiYDfwXeAtffUlSTqy3kKhqt5whEUXDhlbwBV99SJJGs1yOdEsSVoGDAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1enufwtEkeQD4DnAQOFBVs0nOAG4G1gEPAK+vqkcm0Z8kTatJ7in8o6raUFWzbf5K4PaqWg/c3uYlSWO0nA4fbQS2tumtwCUT7EWSptKkQqGATyXZkWRzq72wqvYBtO8zh62YZHOSuSRzCwsLY2pXkqbDRM4pAOdX1d4kZwK3JfnLUVesqi3AFoDZ2dnqq0FJmkYT2VOoqr3tez/wCeBc4MEkZwG07/2T6E2SptnYQyHJDyZ5/uI08I+Be4HtwKY2bBNwy7h7k6RpN4nDRy8EPpFk8ef/l6r64yRfALYluRz4BnDpBHqTpKk29lCoqq8BLx1S/xZw4bj7kSQ9aTldkipJmjBDQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUMRQkSR1DQZLUWXahkOSiJF9JsjvJlZPuR5KmybIKhSQrgN8BXgOcA7whyTmT7UqSpseyCgXgXGB3VX2tqv4vcBOwccI9SdLUGPs7mo9hNbBnyfw88MqlA5JsBja32ceTfGVMvU2DVcBDk25iWfjg7026A30//202x+lf5t870oLlFgoZUqvvm6naAmwZTzvTJclcVc1Oug/pUP7bHJ/ldvhoHli7ZH4NsHdCvUjS1FluofAFYH2Ss5OcCFwGbJ9wT5I0NZbV4aOqOpDk7cCfACuA66tq14TbmiYeltNy5b/NMUlVHXuUJGkqLLfDR5KkCTIUJEkdQ0E+WkTLVpLrk+xPcu+ke5kWhsKU89EiWuZuAC6adBPTxFCQjxbRslVVnwEennQf08RQ0LBHi6yeUC+SJsxQ0DEfLSJpehgK8tEikjqGgny0iKSOoTDlquoAsPhokfuBbT5aRMtFko8Cfw78SJL5JJdPuqfnOh9zIUnquKcgSeoYCpKkjqEgSeoYCpKkjqEgSeoYChKQ5N8k2ZXkniQ7k7zyOGzztcfrqbNJHj8e25GOxUtSNfWS/APgfcCrq+qJJKuAE6vqmHd2J1nZ7vXou8fHq+qUvn+O5J6CBGcBD1XVEwBV9VBV7U3yQAsIkswmubNNX5VkS5JPATcmuSvJSxY3luTOJK9I
8otJfjvJqW1bz2vL/06SPUlOSPLiJH+cZEeSP03yo23M2Un+PMkXkrxnzP89NMUMBQk+BaxN8r+SfCjJPxxhnVcAG6vqnzN43PjrAZKcBbyoqnYsDqyqx4AvAYvb/VngT6rq/zF4If0vVdUrgH8FfKiNeT9wbVX9JPDXz/g3lEZkKGjqVdXjDP7IbwYWgJuT/OIxVtteVf+nTW8DLm3Trwf+cMj4m4F/1qYvaz/jFOBVwB8m2Qn8HoO9FoDzgY+26Y88pV9IegZWTroBaTmoqoPAncCdSb4MbAIO8OT/OJ18yCp/s2Tdbyb5VpKfYPCH/61DfsR24N8nOYNBAN0B/CDwaFVtOFJbT/PXkZ429xQ09ZL8SJL1S0obgL8CHmDwBxzgnx5jMzcBvwqcWlVfPnRh2xu5m8FhoU9W1cGq+jbw9SSXtj6S5KVtlT9jsEcB8Man/ltJT4+hIMEpwNYk9yW5h8G7qq8C3g28P8mfAgePsY2PMfgjvu0oY24G3tS+F70RuDzJl4BdPPkq1HcAVyT5AnDqU/t1pKfPS1IlSR33FCRJHUNBktQxFCRJHUNBktQxFCRJHUNBktQxFCRJnf8PgOj/RBhAWRoAAAAASUVORK5CYII=\n",
873 | "text/plain": [
874 | ""
875 | ]
876 | },
877 | "metadata": {
878 | "needs_background": "light"
879 | },
880 | "output_type": "display_data"
881 | }
882 | ],
883 | "source": [
884 | "sns.countplot(x='Survived', data=df, palette='hls')"
885 | ]
886 | },
887 | {
888 | "cell_type": "code",
889 | "execution_count": 23,
890 | "metadata": {},
891 | "outputs": [
892 | {
893 | "data": {
894 | "text/plain": [
895 | "0.6839622641509434"
896 | ]
897 | },
898 | "execution_count": 23,
899 | "metadata": {},
900 | "output_type": "execute_result"
901 | }
902 | ],
903 | "source": [
904 | "# df.Survived.value_counts()[1] - количество выживших 290\n",
905 | "# df.Survived.value_counts()[0] - количество погибших 424\n",
906 | "odds = df.Survived.value_counts()[1] / df.Survived.value_counts()[0]\n",
907 | "odds"
908 | ]
909 | },
910 | {
911 | "cell_type": "code",
912 | "execution_count": 24,
913 | "metadata": {},
914 | "outputs": [
915 | {
916 | "data": {
917 | "text/plain": [
918 | "-0.37985253225143806"
919 | ]
920 | },
921 | "execution_count": 24,
922 | "metadata": {},
923 | "output_type": "execute_result"
924 | }
925 | ],
926 | "source": [
927 | "# Логарифм шансов - это и есть Intercept.\n",
928 | "log_odds = np.log(odds)\n",
929 | "log_odds"
930 | ]
931 | },
932 | {
933 | "cell_type": "code",
934 | "execution_count": 25,
935 | "metadata": {},
936 | "outputs": [
937 | {
938 | "data": {
939 | "text/plain": [
940 | "0.6839622641509434"
941 | ]
942 | },
943 | "execution_count": 25,
944 | "metadata": {},
945 | "output_type": "execute_result"
946 | }
947 | ],
948 | "source": [
949 | "# Рассчитаем шансы:\n",
950 | "odds = np.exp(log_odds)\n",
951 | "odds"
952 | ]
953 | },
954 | {
955 | "cell_type": "code",
956 | "execution_count": 26,
957 | "metadata": {},
958 | "outputs": [
959 | {
960 | "data": {
961 | "text/plain": [
962 | "0.4061624649859944"
963 | ]
964 | },
965 | "execution_count": 26,
966 | "metadata": {},
967 | "output_type": "execute_result"
968 | }
969 | ],
970 | "source": [
971 | "# Рассчитаем вероятность:\n",
972 | "p = odds / (1 + odds)\n",
973 | "p"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 27,
979 | "metadata": {},
980 | "outputs": [
981 | {
982 | "data": {
983 | "text/plain": [
984 | "966.5159648555248"
985 | ]
986 | },
987 | "execution_count": 27,
988 | "metadata": {},
989 | "output_type": "execute_result"
990 | }
991 | ],
992 | "source": [
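993 | "# AIC (информационный критерий Акаике) = 2k - 2*ln(L), где k - число параметров модели, а L - максимум функции правдоподобия.\n",
994 | "# Чем он ниже, тем лучше модель (по аналогии с суммой квадратов остатков в линейной регрессии); сравнивать по AIC можно только модели, построенные на одних и тех же данных.\n",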
995 | "glm_binomial.aic"
996 | ]
997 | },
998 | {
999 | "cell_type": "markdown",
1000 | "metadata": {},
1001 | "source": [
1002 | "## Модель с одним номинативным предиктором"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "code",
1007 | "execution_count": 28,
1008 | "metadata": {},
1009 | "outputs": [
1010 | {
1011 | "data": {
1012 | "text/html": [
1013 | "\n",
1014 | "Generalized Linear Model Regression Results\n",
1015 | "\n",
1016 | " | Dep. Variable: | Survived | No. Observations: | 714 | \n",
1017 | "
\n",
1018 | "\n",
1019 | " | Model: | GLM | Df Residuals: | 712 | \n",
1020 | "
\n",
1021 | "\n",
1022 | " | Model Family: | Binomial | Df Model: | 1 | \n",
1023 | "
\n",
1024 | "\n",
1025 | " | Link Function: | logit | Scale: | 1.0000 | \n",
1026 | "
\n",
1027 | "\n",
1028 | " | Method: | IRLS | Log-Likelihood: | -375.35 | \n",
1029 | "
\n",
1030 | "\n",
1031 | " | Date: | Tue, 08 Jun 2021 | Deviance: | 750.70 | \n",
1032 | "
\n",
1033 | "\n",
1034 | " | Time: | 20:53:34 | Pearson chi2: | 714. | \n",
1035 | "
\n",
1036 | "\n",
1037 | " | No. Iterations: | 4 | | | \n",
1038 | "
\n",
1039 | "\n",
1040 | " | Covariance Type: | nonrobust | | | \n",
1041 | "
\n",
1042 | "
\n",
1043 | "\n",
1044 | "\n",
1045 | " | coef | std err | z | P>|z| | [0.025 | 0.975] | \n",
1046 | "
\n",
1047 | "\n",
1048 | " | Intercept | 1.1243 | 0.144 | 7.814 | 0.000 | 0.842 | 1.406 | \n",
1049 | "
\n",
1050 | "\n",
1051 | " | C(Sex)[T.male] | -2.4778 | 0.185 | -13.392 | 0.000 | -2.840 | -2.115 | \n",
1052 | "
\n",
1053 | "
"
1054 | ],
1055 | "text/plain": [
1056 | "\n",
1057 | "\"\"\"\n",
1058 | " Generalized Linear Model Regression Results \n",
1059 | "==============================================================================\n",
1060 | "Dep. Variable: Survived No. Observations: 714\n",
1061 | "Model: GLM Df Residuals: 712\n",
1062 | "Model Family: Binomial Df Model: 1\n",
1063 | "Link Function: logit Scale: 1.0000\n",
1064 | "Method: IRLS Log-Likelihood: -375.35\n",
1065 | "Date: Tue, 08 Jun 2021 Deviance: 750.70\n",
1066 | "Time: 20:53:34 Pearson chi2: 714.\n",
1067 | "No. Iterations: 4 \n",
1068 | "Covariance Type: nonrobust \n",
1069 | "==================================================================================\n",
1070 | " coef std err z P>|z| [0.025 0.975]\n",
1071 | "----------------------------------------------------------------------------------\n",
1072 | "Intercept 1.1243 0.144 7.814 0.000 0.842 1.406\n",
1073 | "C(Sex)[T.male] -2.4778 0.185 -13.392 0.000 -2.840 -2.115\n",
1074 | "==================================================================================\n",
1075 | "\"\"\""
1076 | ]
1077 | },
1078 | "execution_count": 28,
1079 | "metadata": {},
1080 | "output_type": "execute_result"
1081 | }
1082 | ],
1083 | "source": [
1084 | "glm_binomial = smf.glm(formula='Survived ~ C(Sex)', data=df, family=sm.families.Binomial()).fit()\n",
1085 | "glm_binomial.summary()\n",
1086 | "\n",
1087 | "# Intercept - логарифм шансов для женщин\n",
1088 | "# C(Sex)[T.male] - логарифм отношения шансов для мужчин к шансам для женщин"
1089 | ]
1090 | },
1091 | {
1092 | "cell_type": "code",
1093 | "execution_count": 29,
1094 | "metadata": {},
1095 | "outputs": [
1096 | {
1097 | "data": {
1098 | "text/html": [
1099 | "\n",
1100 | "\n",
1113 | "
\n",
1114 | " \n",
1115 | " \n",
1116 | " | Sex | \n",
1117 | " female | \n",
1118 | " male | \n",
1119 | "
\n",
1120 | " \n",
1121 | " | Survived | \n",
1122 | " | \n",
1123 | " | \n",
1124 | "
\n",
1125 | " \n",
1126 | " \n",
1127 | " \n",
1128 | " | 0 | \n",
1129 | " 64 | \n",
1130 | " 360 | \n",
1131 | "
\n",
1132 | " \n",
1133 | " | 1 | \n",
1134 | " 197 | \n",
1135 | " 93 | \n",
1136 | "
\n",
1137 | " \n",
1138 | "
\n",
1139 | "
"
1140 | ],
1141 | "text/plain": [
1142 | "Sex female male\n",
1143 | "Survived \n",
1144 | "0 64 360\n",
1145 | "1 197 93"
1146 | ]
1147 | },
1148 | "execution_count": 29,
1149 | "metadata": {},
1150 | "output_type": "execute_result"
1151 | }
1152 | ],
1153 | "source": [
1154 | "# Таблица сопряженности между переменными выжил/не выжил и пол\n",
1155 | "cross_table = pd.crosstab(df.Survived, df.Sex)\n",
1156 | "cross_table"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "code",
1161 | "execution_count": 30,
1162 | "metadata": {},
1163 | "outputs": [
1164 | {
1165 | "data": {
1166 | "text/plain": [
1167 | "Text(0, 0.5, 'Количество')"
1168 | ]
1169 | },
1170 | "execution_count": 30,
1171 | "metadata": {},
1172 | "output_type": "execute_result"
1173 | },
1174 | {
1175 | "data": {
1176 | "image/png": "\n",
1177 | "text/plain": [
1178 | ""
1179 | ]
1180 | },
1181 | "metadata": {
1182 | "needs_background": "light"
1183 | },
1184 | "output_type": "display_data"
1185 | }
1186 | ],
1187 | "source": [
1188 | "# Визуализировать таблицу сопряженности можно так:\n",
1189 | "pd.crosstab(df.Sex, df.Survived).plot(kind='bar')\n",
1190 | "plt.title('Количество мужчин и женщин на Титанике')\n",
1191 | "plt.xlabel('Пол')\n",
1192 | "plt.ylabel('Количество')"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": 31,
1198 | "metadata": {},
1199 | "outputs": [
1200 | {
1201 | "data": {
1202 | "text/plain": [
1203 | "3.078125"
1204 | ]
1205 | },
1206 | "execution_count": 31,
1207 | "metadata": {},
1208 | "output_type": "execute_result"
1209 | }
1210 | ],
1211 | "source": [
1212 | "# cross_table.female[0] - количество погибших женщин\n",
1213 | "# cross_table.female[1] - количество выживших женщин\n",
1214 | "odds_female = cross_table.female[1] / cross_table.female[0]\n",
1215 | "odds_female"
1216 | ]
1217 | },
1218 | {
1219 | "cell_type": "code",
1220 | "execution_count": 32,
1221 | "metadata": {},
1222 | "outputs": [
1223 | {
1224 | "data": {
1225 | "text/plain": [
1226 | "1.1243206453783166"
1227 | ]
1228 | },
1229 | "execution_count": 32,
1230 | "metadata": {},
1231 | "output_type": "execute_result"
1232 | }
1233 | ],
1234 | "source": [
1235 | "# Логарифм шансов для женщин - это и есть Intercept\n",
1236 | "np.log(odds_female)"
1237 | ]
1238 | },
1239 | {
1240 | "cell_type": "code",
1241 | "execution_count": 33,
1242 | "metadata": {},
1243 | "outputs": [
1244 | {
1245 | "data": {
1246 | "text/plain": [
1247 | "0.25833333333333336"
1248 | ]
1249 | },
1250 | "execution_count": 33,
1251 | "metadata": {},
1252 | "output_type": "execute_result"
1253 | }
1254 | ],
1255 | "source": [
1256 | "# cross_table.male[0] - количество погибших мужчин\n",
1257 | "# cross_table.male[1] - количество выживших мужчин\n",
1258 | "odds_male = cross_table.male[1] / cross_table.male[0]\n",
1259 | "odds_male"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 34,
1265 | "metadata": {},
1266 | "outputs": [
1267 | {
1268 | "data": {
1269 | "text/plain": [
1270 | "0.08392554991539763"
1271 | ]
1272 | },
1273 | "execution_count": 34,
1274 | "metadata": {},
1275 | "output_type": "execute_result"
1276 | }
1277 | ],
1278 | "source": [
1279 | "odds_ratio = odds_male / odds_female\n",
1280 | "odds_ratio"
1281 | ]
1282 | },
1283 | {
1284 | "cell_type": "code",
1285 | "execution_count": 35,
1286 | "metadata": {},
1287 | "outputs": [
1288 | {
1289 | "data": {
1290 | "text/plain": [
1291 | "-2.4778251836752165"
1292 | ]
1293 | },
1294 | "execution_count": 35,
1295 | "metadata": {},
1296 | "output_type": "execute_result"
1297 | }
1298 | ],
1299 | "source": [
1300 | "np.log(odds_ratio)\n",
1301 | "# Логарифм отношения шансов для мужчин к шансам для женщин - это и есть значение C(Sex)[T.male]"
1302 | ]
1303 | },
1304 | {
1305 | "cell_type": "code",
1306 | "execution_count": 36,
1307 | "metadata": {},
1308 | "outputs": [
1309 | {
1310 | "data": {
1311 | "text/plain": [
1312 | "-1.3534999999999997"
1313 | ]
1314 | },
1315 | "execution_count": 36,
1316 | "metadata": {},
1317 | "output_type": "execute_result"
1318 | }
1319 | ],
1320 | "source": [
1321 | "# Получаем модель:\n",
1322 | "# log(odds) = 1.1243 - (2.4778 * Sex_male)\n",
1323 | "\n",
1324 | "# Если предсказываем логарифм шансов для женщин (Sex_male = 0):\n",
1325 | "# log(odds) = 1.1243 - (2.4778 * 0)\n",
1326 | "# log(odds) = 1.1243\n",
1327 | "\n",
1328 | "# Если предсказываем логарифм шансов для мужчин (Sex_male = 1):\n",
1329 | "# log(odds) = 1.1243 - (2.4778 * 1)\n",
1330 | "# log(odds) = 1.1243 - 2.4778\n",
1331 | "# log(odds) = -1.3535\n",
1332 | "\n",
1333 | "log_odds_male = 1.1243 - 2.4778\n",
1334 | "log_odds_male"
1335 | ]
1336 | },
1337 | {
1338 | "cell_type": "code",
1339 | "execution_count": 37,
1340 | "metadata": {},
1341 | "outputs": [
1342 | {
1343 | "data": {
1344 | "text/plain": [
1345 | "754.7002061466517"
1346 | ]
1347 | },
1348 | "execution_count": 37,
1349 | "metadata": {},
1350 | "output_type": "execute_result"
1351 | }
1352 | ],
1353 | "source": [
1354 | "# aic у этой модели ниже, чем у предыдущей, значит эта модель лучше.\n",
1355 | "glm_binomial.aic"
1356 | ]
1357 | },
1358 | {
1359 | "cell_type": "markdown",
1360 | "metadata": {},
1361 | "source": [
1362 | "## Модель с двумя номинативными предикторами"
1363 | ]
1364 | },
1365 | {
1366 | "cell_type": "code",
1367 | "execution_count": 38,
1368 | "metadata": {},
1369 | "outputs": [
1370 | {
1371 | "data": {
1372 | "text/html": [
1373 | "\n",
1374 | "Generalized Linear Model Regression Results\n",
1375 | "\n",
1376 | " | Dep. Variable: | Survived | No. Observations: | 714 | \n",
1377 | "
\n",
1378 | "\n",
1379 | " | Model: | GLM | Df Residuals: | 708 | \n",
1380 | "
\n",
1381 | "\n",
1382 | " | Model Family: | Binomial | Df Model: | 5 | \n",
1383 | "
\n",
1384 | "\n",
1385 | " | Link Function: | logit | Scale: | 1.0000 | \n",
1386 | "
\n",
1387 | "\n",
1388 | " | Method: | IRLS | Log-Likelihood: | -321.14 | \n",
1389 | "
\n",
1390 | "\n",
1391 | " | Date: | Tue, 08 Jun 2021 | Deviance: | 642.28 | \n",
1392 | "
\n",
1393 | "\n",
1394 | " | Time: | 20:53:34 | Pearson chi2: | 714. | \n",
1395 | "
\n",
1396 | "\n",
1397 | " | No. Iterations: | 6 | | | \n",
1398 | "
\n",
1399 | "\n",
1400 | " | Covariance Type: | nonrobust | | | \n",
1401 | "
\n",
1402 | "
\n",
1403 | "\n",
1404 | "\n",
1405 | " | coef | std err | z | P>|z| | [0.025 | 0.975] | \n",
1406 | "
\n",
1407 | "\n",
1408 | " | Intercept | 3.3081 | 0.588 | 5.628 | 0.000 | 2.156 | 4.460 | \n",
1409 | "
\n",
1410 | "\n",
1411 | " | C(Sex)[T.male] | -3.7301 | 0.622 | -5.997 | 0.000 | -4.949 | -2.511 | \n",
1412 | "
\n",
1413 | "\n",
1414 | " | C(Pclass)[T.2] | -0.8804 | 0.726 | -1.213 | 0.225 | -2.303 | 0.542 | \n",
1415 | "
\n",
1416 | "\n",
1417 | " | C(Pclass)[T.3] | -3.4653 | 0.620 | -5.585 | 0.000 | -4.681 | -2.249 | \n",
1418 | "
\n",
1419 | "\n",
1420 | " | C(Sex)[T.male]:C(Pclass)[T.2] | -0.4204 | 0.804 | -0.523 | 0.601 | -1.997 | 1.156 | \n",
1421 | "
\n",
1422 | "\n",
1423 | " | C(Sex)[T.male]:C(Pclass)[T.3] | 2.1542 | 0.676 | 3.185 | 0.001 | 0.829 | 3.480 | \n",
1424 | "
\n",
1425 | "
"
1426 | ],
1427 | "text/plain": [
1428 | "\n",
1429 | "\"\"\"\n",
1430 | " Generalized Linear Model Regression Results \n",
1431 | "==============================================================================\n",
1432 | "Dep. Variable: Survived No. Observations: 714\n",
1433 | "Model: GLM Df Residuals: 708\n",
1434 | "Model Family: Binomial Df Model: 5\n",
1435 | "Link Function: logit Scale: 1.0000\n",
1436 | "Method: IRLS Log-Likelihood: -321.14\n",
1437 | "Date: Tue, 08 Jun 2021 Deviance: 642.28\n",
1438 | "Time: 20:53:34 Pearson chi2: 714.\n",
1439 | "No. Iterations: 6 \n",
1440 | "Covariance Type: nonrobust \n",
1441 | "=================================================================================================\n",
1442 | " coef std err z P>|z| [0.025 0.975]\n",
1443 | "-------------------------------------------------------------------------------------------------\n",
1444 | "Intercept 3.3081 0.588 5.628 0.000 2.156 4.460\n",
1445 | "C(Sex)[T.male] -3.7301 0.622 -5.997 0.000 -4.949 -2.511\n",
1446 | "C(Pclass)[T.2] -0.8804 0.726 -1.213 0.225 -2.303 0.542\n",
1447 | "C(Pclass)[T.3] -3.4653 0.620 -5.585 0.000 -4.681 -2.249\n",
1448 | "C(Sex)[T.male]:C(Pclass)[T.2] -0.4204 0.804 -0.523 0.601 -1.997 1.156\n",
1449 | "C(Sex)[T.male]:C(Pclass)[T.3] 2.1542 0.676 3.185 0.001 0.829 3.480\n",
1450 | "=================================================================================================\n",
1451 | "\"\"\""
1452 | ]
1453 | },
1454 | "execution_count": 38,
1455 | "metadata": {},
1456 | "output_type": "execute_result"
1457 | }
1458 | ],
1459 | "source": [
1460 | "glm_binomial = smf.glm(formula='Survived ~ C(Sex) * C(Pclass)', data=df, family=sm.families.Binomial()).fit()\n",
1461 | "glm_binomial.summary()\n",
1462 | "\n",
1463 | "# Intercept - логарифм шансов для Ж в 1кл\n",
1464 | "# C(Sex)[T.male] - логарифм отношения шансов для М в 1кл к шансам Ж в 1кл\n",
1465 | "# C(Pclass)[T.2] - логарифм отношения шансов для Ж во 2кл к шансам Ж в 1кл\n",
1466 | "# C(Pclass)[T.3] - логарифм отношения шансов для Ж в 3кл к шансам Ж в 1кл\n",
1467 | "# C(Sex)[T.male]:C(Pclass)[T.2] - разность логарифмов отношения шансов, рассчитанных для М/Ж во 2кл и М/Ж в 1кл\n",
1468 | "# C(Sex)[T.male]:C(Pclass)[T.3] - разность логарифмов отношения шансов, рассчитанных для М/Ж в 3кл и М/Ж в 1кл"
1469 | ]
1470 | },
1471 | {
1472 | "cell_type": "code",
1473 | "execution_count": 39,
1474 | "metadata": {},
1475 | "outputs": [
1476 | {
1477 | "data": {
1478 | "text/html": [
1479 | "\n",
1480 | "\n",
1493 | "
\n",
1494 | " \n",
1495 | " \n",
1496 | " | Pclass | \n",
1497 | " 1 | \n",
1498 | " 2 | \n",
1499 | " 3 | \n",
1500 | "
\n",
1501 | " \n",
1502 | " | Survived | \n",
1503 | " | \n",
1504 | " | \n",
1505 | " | \n",
1506 | "
\n",
1507 | " \n",
1508 | " \n",
1509 | " \n",
1510 | " | 0 | \n",
1511 | " 3 | \n",
1512 | " 6 | \n",
1513 | " 55 | \n",
1514 | "
\n",
1515 | " \n",
1516 | " | 1 | \n",
1517 | " 82 | \n",
1518 | " 68 | \n",
1519 | " 47 | \n",
1520 | "
\n",
1521 | " \n",
1522 | "
\n",
1523 | "
"
1524 | ],
1525 | "text/plain": [
1526 | "Pclass 1 2 3\n",
1527 | "Survived \n",
1528 | "0 3 6 55\n",
1529 | "1 82 68 47"
1530 | ]
1531 | },
1532 | "execution_count": 39,
1533 | "metadata": {},
1534 | "output_type": "execute_result"
1535 | }
1536 | ],
1537 | "source": [
1538 | "# Таблица сопряженности между переменными выжил/не выжил и классом пассажира для женщин.\n",
1539 | "cross_table_female = pd.crosstab(df[df.Sex == 'female'].Survived, df.Pclass)\n",
1540 | "cross_table_female"
1541 | ]
1542 | },
1543 | {
1544 | "cell_type": "code",
1545 | "execution_count": 40,
1546 | "metadata": {},
1547 | "outputs": [
1548 | {
1549 | "data": {
1550 | "text/plain": [
1551 | "Text(0, 0.5, 'Количество')"
1552 | ]
1553 | },
1554 | "execution_count": 40,
1555 | "metadata": {},
1556 | "output_type": "execute_result"
1557 | },
1558 | {
1559 | "data": {
1560 | "image/png": "\n",
1561 | "text/plain": [
1562 | ""
1563 | ]
1564 | },
1565 | "metadata": {
1566 | "needs_background": "light"
1567 | },
1568 | "output_type": "display_data"
1569 | }
1570 | ],
1571 | "source": [
1572 | "pd.crosstab(df.Pclass[df.Sex == 'female'], df.Survived).plot(kind='bar')\n",
1573 | "plt.title('Выживаемость женщин в разных классах')\n",
1574 | "plt.xlabel('Класс пассажира')\n",
1575 | "plt.ylabel('Количество')"
1576 | ]
1577 | },
1578 | {
1579 | "cell_type": "code",
1580 | "execution_count": 41,
1581 | "metadata": {},
1582 | "outputs": [
1583 | {
1584 | "data": {
1585 | "text/html": [
1586 | "\n",
1587 | "\n",
1600 | "
\n",
1601 | " \n",
1602 | " \n",
1603 | " | Pclass | \n",
1604 | " 1 | \n",
1605 | " 2 | \n",
1606 | " 3 | \n",
1607 | "
\n",
1608 | " \n",
1609 | " | Survived | \n",
1610 | " | \n",
1611 | " | \n",
1612 | " | \n",
1613 | "
\n",
1614 | " \n",
1615 | " \n",
1616 | " \n",
1617 | " | 0 | \n",
1618 | " 61 | \n",
1619 | " 84 | \n",
1620 | " 215 | \n",
1621 | "
\n",
1622 | " \n",
1623 | " | 1 | \n",
1624 | " 40 | \n",
1625 | " 15 | \n",
1626 | " 38 | \n",
1627 | "
\n",
1628 | " \n",
1629 | "
\n",
1630 | "
"
1631 | ],
1632 | "text/plain": [
1633 | "Pclass 1 2 3\n",
1634 | "Survived \n",
1635 | "0 61 84 215\n",
1636 | "1 40 15 38"
1637 | ]
1638 | },
1639 | "execution_count": 41,
1640 | "metadata": {},
1641 | "output_type": "execute_result"
1642 | }
1643 | ],
1644 | "source": [
1645 | "# Таблица сопряженности между переменными выжил/не выжил и классом пассажира для мужчин.\n",
1646 | "cross_table_male = pd.crosstab(df[df.Sex == 'male'].Survived, df.Pclass)\n",
1647 | "cross_table_male"
1648 | ]
1649 | },
1650 | {
1651 | "cell_type": "code",
1652 | "execution_count": 42,
1653 | "metadata": {},
1654 | "outputs": [
1655 | {
1656 | "data": {
1657 | "text/plain": [
1658 | "Text(0, 0.5, 'Количество')"
1659 | ]
1660 | },
1661 | "execution_count": 42,
1662 | "metadata": {},
1663 | "output_type": "execute_result"
1664 | },
1665 | {
1666 | "data": {
1667 | "image/png": "\n",
1668 | "text/plain": [
1669 | ""
1670 | ]
1671 | },
1672 | "metadata": {
1673 | "needs_background": "light"
1674 | },
1675 | "output_type": "display_data"
1676 | }
1677 | ],
1678 | "source": [
1679 | "pd.crosstab(df.Pclass[df.Sex == 'male'], df.Survived).plot(kind='bar')\n",
1680 | "plt.title('Выживаемость мужчин в разных классах')\n",
1681 | "plt.xlabel('Класс пассажира')\n",
1682 | "plt.ylabel('Количество')"
1683 | ]
1684 | },
1685 | {
1686 | "cell_type": "code",
1687 | "execution_count": 43,
1688 | "metadata": {},
1689 | "outputs": [
1690 | {
1691 | "data": {
1692 | "image/png": "\n",
1693 | "text/plain": [
1694 | ""
1695 | ]
1696 | },
1697 | "metadata": {
1698 | "needs_background": "light"
1699 | },
1700 | "output_type": "display_data"
1701 | }
1702 | ],
1703 | "source": [
1704 | "# График сопряженности между классом пассажира, полом и переменной выжил/не выжил.\n",
1705 | "\n",
1706 | "from statsmodels.graphics.mosaicplot import mosaic\n",
1707 | "\n",
1708 | "mosaic(df, ['Pclass', 'Sex', 'Survived'], title='Мозаичный график', horizontal=0, gap=0.03)\n",
1709 | "plt.show()"
1710 | ]
1711 | },
1712 | {
1713 | "cell_type": "code",
1714 | "execution_count": 44,
1715 | "metadata": {},
1716 | "outputs": [
1717 | {
1718 | "data": {
1719 | "text/plain": [
1720 | "27.333333333333332"
1721 | ]
1722 | },
1723 | "execution_count": 44,
1724 | "metadata": {},
1725 | "output_type": "execute_result"
1726 | }
1727 | ],
1728 | "source": [
1729 | "odds_female_pc1 = cross_table_female[1][1] / cross_table_female[1][0]\n",
1730 | "odds_female_pc1"
1731 | ]
1732 | },
1733 | {
1734 | "cell_type": "code",
1735 | "execution_count": 45,
1736 | "metadata": {},
1737 | "outputs": [
1738 | {
1739 | "data": {
1740 | "text/plain": [
1741 | "3.3081069585961433"
1742 | ]
1743 | },
1744 | "execution_count": 45,
1745 | "metadata": {},
1746 | "output_type": "execute_result"
1747 | }
1748 | ],
1749 | "source": [
1750 | "np.log(odds_female_pc1)\n",
1751 | "# Это значение Intercept"
1752 | ]
1753 | },
1754 | {
1755 | "cell_type": "code",
1756 | "execution_count": 46,
1757 | "metadata": {},
1758 | "outputs": [
1759 | {
1760 | "data": {
1761 | "text/plain": [
1762 | "0.6557377049180327"
1763 | ]
1764 | },
1765 | "execution_count": 46,
1766 | "metadata": {},
1767 | "output_type": "execute_result"
1768 | }
1769 | ],
1770 | "source": [
1771 | "odds_male_pc1 = cross_table_male[1][1] / cross_table_male[1][0]\n",
1772 | "odds_male_pc1"
1773 | ]
1774 | },
1775 | {
1776 | "cell_type": "code",
1777 | "execution_count": 47,
1778 | "metadata": {},
1779 | "outputs": [
1780 | {
1781 | "data": {
1782 | "text/plain": [
1783 | "-3.7301013686555184"
1784 | ]
1785 | },
1786 | "execution_count": 47,
1787 | "metadata": {},
1788 | "output_type": "execute_result"
1789 | }
1790 | ],
1791 | "source": [
1792 | "np.log(odds_male_pc1 / odds_female_pc1)\n",
1793 | "# Это значение C(Sex)[T.male]"
1794 | ]
1795 | },
1796 | {
1797 | "cell_type": "code",
1798 | "execution_count": 48,
1799 | "metadata": {},
1800 | "outputs": [
1801 | {
1802 | "data": {
1803 | "text/plain": [
1804 | "11.333333333333334"
1805 | ]
1806 | },
1807 | "execution_count": 48,
1808 | "metadata": {},
1809 | "output_type": "execute_result"
1810 | }
1811 | ],
1812 | "source": [
1813 | "odds_female_pc2 = cross_table_female[2][1] / cross_table_female[2][0]\n",
1814 | "odds_female_pc2"
1815 | ]
1816 | },
1817 | {
1818 | "cell_type": "code",
1819 | "execution_count": 49,
1820 | "metadata": {},
1821 | "outputs": [
1822 | {
1823 | "data": {
1824 | "text/plain": [
1825 | "-0.8803587226480917"
1826 | ]
1827 | },
1828 | "execution_count": 49,
1829 | "metadata": {},
1830 | "output_type": "execute_result"
1831 | }
1832 | ],
1833 | "source": [
1834 | "np.log(odds_female_pc2 / odds_female_pc1)\n",
1835 | "# Это значение C(Pclass)[T.2]"
1836 | ]
1837 | },
1838 | {
1839 | "cell_type": "code",
1840 | "execution_count": 50,
1841 | "metadata": {},
1842 | "outputs": [
1843 | {
1844 | "data": {
1845 | "text/plain": [
1846 | "0.8545454545454545"
1847 | ]
1848 | },
1849 | "execution_count": 50,
1850 | "metadata": {},
1851 | "output_type": "execute_result"
1852 | }
1853 | ],
1854 | "source": [
1855 | "odds_female_pc3 = cross_table_female[3][1] / cross_table_female[3][0]\n",
1856 | "odds_female_pc3"
1857 | ]
1858 | },
1859 | {
1860 | "cell_type": "code",
1861 | "execution_count": 51,
1862 | "metadata": {},
1863 | "outputs": [
1864 | {
1865 | "data": {
1866 | "text/plain": [
1867 | "-3.4652925421185556"
1868 | ]
1869 | },
1870 | "execution_count": 51,
1871 | "metadata": {},
1872 | "output_type": "execute_result"
1873 | }
1874 | ],
1875 | "source": [
1876 | "np.log(odds_female_pc3 / odds_female_pc1)\n",
1877 | "# Это значение C(Pclass)[T.3]"
1878 | ]
1879 | },
1880 | {
1881 | "cell_type": "code",
1882 | "execution_count": 52,
1883 | "metadata": {},
1884 | "outputs": [
1885 | {
1886 | "data": {
1887 | "text/plain": [
1888 | "0.17857142857142858"
1889 | ]
1890 | },
1891 | "execution_count": 52,
1892 | "metadata": {},
1893 | "output_type": "execute_result"
1894 | }
1895 | ],
1896 | "source": [
1897 | "odds_male_pc2 = cross_table_male[2][1] / cross_table_male[2][0]\n",
1898 | "odds_male_pc2"
1899 | ]
1900 | },
1901 | {
1902 | "cell_type": "code",
1903 | "execution_count": 53,
1904 | "metadata": {},
1905 | "outputs": [
1906 | {
1907 | "data": {
1908 | "text/plain": [
1909 | "-0.42041346503363686"
1910 | ]
1911 | },
1912 | "execution_count": 53,
1913 | "metadata": {},
1914 | "output_type": "execute_result"
1915 | }
1916 | ],
1917 | "source": [
1918 | "np.log(odds_male_pc2 / odds_female_pc2) - np.log(odds_male_pc1 / odds_female_pc1)\n",
1919 | "# Это значение C(Sex)[T.male]:C(Pclass)[T.2]"
1920 | ]
1921 | },
1922 | {
1923 | "cell_type": "code",
1924 | "execution_count": 54,
1925 | "metadata": {},
1926 | "outputs": [
1927 | {
1928 | "data": {
1929 | "text/plain": [
1930 | "0.17674418604651163"
1931 | ]
1932 | },
1933 | "execution_count": 54,
1934 | "metadata": {},
1935 | "output_type": "execute_result"
1936 | }
1937 | ],
1938 | "source": [
1939 | "odds_male_pc3 = cross_table_male[3][1] / cross_table_male[3][0]\n",
1940 | "odds_male_pc3"
1941 | ]
1942 | },
1943 | {
1944 | "cell_type": "code",
1945 | "execution_count": 55,
1946 | "metadata": {},
1947 | "outputs": [
1948 | {
1949 | "data": {
1950 | "text/plain": [
1951 | "2.154235083776654"
1952 | ]
1953 | },
1954 | "execution_count": 55,
1955 | "metadata": {},
1956 | "output_type": "execute_result"
1957 | }
1958 | ],
1959 | "source": [
1960 | "np.log(odds_male_pc3 / odds_female_pc3) - np.log(odds_male_pc1 / odds_female_pc1)\n",
1961 | "# Это значение C(Sex)[T.male]:C(Pclass)[T.3]"
1962 | ]
1963 | },
1964 | {
1965 | "cell_type": "code",
1966 | "execution_count": 56,
1967 | "metadata": {},
1968 | "outputs": [
1969 | {
1970 | "data": {
1971 | "text/plain": [
1972 | "-0.42200000000000015"
1973 | ]
1974 | },
1975 | "execution_count": 56,
1976 | "metadata": {},
1977 | "output_type": "execute_result"
1978 | }
1979 | ],
1980 | "source": [
1981 | "# Получаем модель:\n",
1982 | "# log(odds) = 3.3081 - (3.7301 * Sex_male) - (0.8804 * Pclass2) - (3.4653 * Pclass3) - (0.4204 * Sex_male * Pclass2) + (2.1542 * Sex_male * Pclass3)\n",
1983 | "\n",
1984 | "# Если предсказываем логарифм шансов для Ж в 1кл (Sex_male = 0, Pclass2 = 0, Pclass3 = 0):\n",
1985 | "# log(odds) = 3.3081 - (3.7301 * 0) - (0.8804 * 0) - (3.4653 * 0) - (0.4204 * 0 * 0) + (2.1542 * 0 * 0)\n",
1986 | "# log(odds) = 3.3081\n",
1987 | "\n",
1988 | "# Если предсказываем логарифм шансов для М в 1кл (Sex_male = 1, Pclass2 = 0, Pclass3 = 0):\n",
1989 | "# log(odds) = 3.3081 - (3.7301 * 1) - (0.8804 * 0) - (3.4653 * 0) - (0.4204 * 1 * 0) + (2.1542 * 1 * 0)\n",
1990 | "# log(odds) = 3.3081 - 3.7301\n",
1991 | "# log(odds) = -0.422\n",
1992 | "\n",
1993 | "log_odds_male = 3.3081 - 3.7301\n",
1994 | "log_odds_male"
1995 | ]
1996 | },
1997 | {
1998 | "cell_type": "code",
1999 | "execution_count": 57,
2000 | "metadata": {},
2001 | "outputs": [
2002 | {
2003 | "data": {
2004 | "text/plain": [
2005 | "2.4277"
2006 | ]
2007 | },
2008 | "execution_count": 57,
2009 | "metadata": {},
2010 | "output_type": "execute_result"
2011 | }
2012 | ],
2013 | "source": [
2014 | "# Если предсказываем логарифм шансов для Ж во 2кл (Sex_male = 0, Pclass2 = 1, Pclass3 = 0):\n",
2015 | "# log(odds) = 3.3081 - (3.7301 * 0) - (0.8804 * 1) - (3.4653 * 0) - (0.4204 * 0 * 1) + (2.1542 * 0 * 0)\n",
2016 | "# log(odds) = 3.3081 - 0.8804\n",
2017 | "# log(odds) = 2.4277\n",
2018 | "\n",
2019 | "log_odds_female_pc2 = 3.3081 - 0.8804\n",
2020 | "log_odds_female_pc2"
2021 | ]
2022 | },
2023 | {
2024 | "cell_type": "code",
2025 | "execution_count": 58,
2026 | "metadata": {},
2027 | "outputs": [
2028 | {
2029 | "data": {
2030 | "text/plain": [
2031 | "-0.1572"
2032 | ]
2033 | },
2034 | "execution_count": 58,
2035 | "metadata": {},
2036 | "output_type": "execute_result"
2037 | }
2038 | ],
2039 | "source": [
2040 | "# Если предсказываем логарифм шансов для Ж в 3кл (Sex_male = 0, Pclass2 = 0, Pclass3 = 1):\n",
2041 | "# log(odds) = 3.3081 - (3.7301 * 0) - (0.8804 * 0) - (3.4653 * 1) - (0.4204 * 0 * 0) + (2.1542 * 0 * 1)\n",
2042 | "# log(odds) = 3.3081 - 3.4653\n",
2043 | "# log(odds) = -0.1572\n",
2044 | "\n",
2045 | "log_odds_female_pc3 = 3.3081 - 3.4653\n",
2046 | "log_odds_female_pc3"
2047 | ]
2048 | },
2049 | {
2050 | "cell_type": "code",
2051 | "execution_count": 59,
2052 | "metadata": {},
2053 | "outputs": [
2054 | {
2055 | "data": {
2056 | "text/plain": [
2057 | "-1.7227999999999999"
2058 | ]
2059 | },
2060 | "execution_count": 59,
2061 | "metadata": {},
2062 | "output_type": "execute_result"
2063 | }
2064 | ],
2065 | "source": [
2066 | "# Если предсказываем логарифм шансов для М во 2кл (Sex_male = 1, Pclass2 = 1, Pclass3 = 0):\n",
2067 | "# log(odds) = 3.3081 - (3.7301 * 1) - (0.8804 * 1) - (3.4653 * 0) - (0.4204 * 1 * 1) + (2.1542 * 1 * 0)\n",
2068 | "# log(odds) = 3.3081 - 3.7301 - 0.8804 - 0.4204\n",
2069 | "# log(odds) = -1.7228\n",
2070 | "\n",
2071 | "log_odds_male_pc2 = 3.3081 - 3.7301 - 0.8804 - 0.4204\n",
2072 | "log_odds_male_pc2"
2073 | ]
2074 | },
2075 | {
2076 | "cell_type": "code",
2077 | "execution_count": 60,
2078 | "metadata": {},
2079 | "outputs": [
2080 | {
2081 | "data": {
2082 | "text/plain": [
2083 | "-1.7331000000000003"
2084 | ]
2085 | },
2086 | "execution_count": 60,
2087 | "metadata": {},
2088 | "output_type": "execute_result"
2089 | }
2090 | ],
2091 | "source": [
2092 | "# Если предсказываем логарифм шансов для М в 3кл (Sex_male = 1, Pclass2 = 0, Pclass3 = 1):\n",
2093 | "# log(odds) = 3.3081 - (3.7301 * 1) - (0.8804 * 0) - (3.4653 * 1) - (0.4204 * 1 * 0) + (2.1542 * 1 * 1)\n",
2094 | "# log(odds) = 3.3081 - 3.7301 - 3.4653 + 2.1542\n",
2095 | "# log(odds) = -1.7331\n",
2096 | "\n",
2097 | "log_odds_male_pc3 = 3.3081 - 3.7301 - 3.4653 + 2.1542\n",
2098 | "log_odds_male_pc3"
2099 | ]
2100 | },
2101 | {
2102 | "cell_type": "code",
2103 | "execution_count": 61,
2104 | "metadata": {},
2105 | "outputs": [
2106 | {
2107 | "data": {
2108 | "text/plain": [
2109 | "654.2750443468178"
2110 | ]
2111 | },
2112 | "execution_count": 61,
2113 | "metadata": {},
2114 | "output_type": "execute_result"
2115 | }
2116 | ],
2117 | "source": [
2118 | "# aic у этой модели ниже, чем у двух предыдущих, значит эта модель лучше.\n",
2119 | "glm_binomial.aic"
2120 | ]
2121 | },
2122 | {
2123 | "cell_type": "markdown",
2124 | "metadata": {},
2125 | "source": [
2126 | "## Комбинация предикторов разного типа"
2127 | ]
2128 | },
2129 | {
2130 | "cell_type": "code",
2131 | "execution_count": 62,
2132 | "metadata": {},
2133 | "outputs": [
2134 | {
2135 | "data": {
2136 | "text/html": [
2137 | "\n",
2138 | "Generalized Linear Model Regression Results\n",
2139 | "\n",
2140 | " | Dep. Variable: | Survived | No. Observations: | 714 | \n",
2141 | "
\n",
2142 | "\n",
2143 | " | Model: | GLM | Df Residuals: | 709 | \n",
2144 | "
\n",
2145 | "\n",
2146 | " | Model Family: | Binomial | Df Model: | 4 | \n",
2147 | "
\n",
2148 | "\n",
2149 | " | Link Function: | logit | Scale: | 1.0000 | \n",
2150 | "
\n",
2151 | "\n",
2152 | " | Method: | IRLS | Log-Likelihood: | -323.64 | \n",
2153 | "
\n",
2154 | "\n",
2155 | " | Date: | Tue, 08 Jun 2021 | Deviance: | 647.28 | \n",
2156 | "
\n",
2157 | "\n",
2158 | " | Time: | 20:53:35 | Pearson chi2: | 767. | \n",
2159 | "
\n",
2160 | "\n",
2161 | " | No. Iterations: | 5 | | | \n",
2162 | "
\n",
2163 | "\n",
2164 | " | Covariance Type: | nonrobust | | | \n",
2165 | "
\n",
2166 | "
\n",
2167 | "\n",
2168 | "\n",
2169 | " | coef | std err | z | P>|z| | [0.025 | 0.975] | \n",
2170 | "
\n",
2171 | "\n",
2172 | " | Intercept | 3.7770 | 0.401 | 9.416 | 0.000 | 2.991 | 4.563 | \n",
2173 | "
\n",
2174 | "\n",
2175 | " | C(Sex)[T.male] | -2.5228 | 0.207 | -12.164 | 0.000 | -2.929 | -2.116 | \n",
2176 | "
\n",
2177 | "\n",
2178 | " | C(Pclass)[T.2] | -1.3098 | 0.278 | -4.710 | 0.000 | -1.855 | -0.765 | \n",
2179 | "
\n",
2180 | "\n",
2181 | " | C(Pclass)[T.3] | -2.5806 | 0.281 | -9.169 | 0.000 | -3.132 | -2.029 | \n",
2182 | "
\n",
2183 | "\n",
2184 | " | Age | -0.0370 | 0.008 | -4.831 | 0.000 | -0.052 | -0.022 | \n",
2185 | "
\n",
2186 | "
"
2187 | ],
2188 | "text/plain": [
2189 | "\n",
2190 | "\"\"\"\n",
2191 | " Generalized Linear Model Regression Results \n",
2192 | "==============================================================================\n",
2193 | "Dep. Variable: Survived No. Observations: 714\n",
2194 | "Model: GLM Df Residuals: 709\n",
2195 | "Model Family: Binomial Df Model: 4\n",
2196 | "Link Function: logit Scale: 1.0000\n",
2197 | "Method: IRLS Log-Likelihood: -323.64\n",
2198 | "Date: Tue, 08 Jun 2021 Deviance: 647.28\n",
2199 | "Time: 20:53:35 Pearson chi2: 767.\n",
2200 | "No. Iterations: 5 \n",
2201 | "Covariance Type: nonrobust \n",
2202 | "==================================================================================\n",
2203 | " coef std err z P>|z| [0.025 0.975]\n",
2204 | "----------------------------------------------------------------------------------\n",
2205 | "Intercept 3.7770 0.401 9.416 0.000 2.991 4.563\n",
2206 | "C(Sex)[T.male] -2.5228 0.207 -12.164 0.000 -2.929 -2.116\n",
2207 | "C(Pclass)[T.2] -1.3098 0.278 -4.710 0.000 -1.855 -0.765\n",
2208 | "C(Pclass)[T.3] -2.5806 0.281 -9.169 0.000 -3.132 -2.029\n",
2209 | "Age -0.0370 0.008 -4.831 0.000 -0.052 -0.022\n",
2210 | "==================================================================================\n",
2211 | "\"\"\""
2212 | ]
2213 | },
2214 | "execution_count": 62,
2215 | "metadata": {},
2216 | "output_type": "execute_result"
2217 | }
2218 | ],
2219 | "source": [
2220 | "glm_binomial = smf.glm(formula='Survived ~ C(Sex) + C(Pclass) + Age', data=df, family=sm.families.Binomial()).fit()\n",
2221 | "glm_binomial.summary()\n",
2222 | "\n",
2223 | "# Intercept - логарифм шансов для Ж в 1кл при условии, что их возраст равен нулю.\n",
2224 | "# Age - коэффициент при возрасте показывает, на сколько изменяется логарифм шансов\n",
2225 | "# при единичном изменении возраста при фиксированных поле и классе (взаимодействий с Age в модели нет, поэтому эффект одинаков для всех групп)."
2226 | ]
2227 | },
2228 | {
2229 | "cell_type": "code",
2230 | "execution_count": 63,
2231 | "metadata": {},
2232 | "outputs": [
2233 | {
2234 | "data": {
2235 | "text/plain": [
2236 | "3.037"
2237 | ]
2238 | },
2239 | "execution_count": 63,
2240 | "metadata": {},
2241 | "output_type": "execute_result"
2242 | }
2243 | ],
2244 | "source": [
2245 | "# Получаем модель:\n",
2246 | "# log(odds) = 3.7770 - (2.5228 * Sex_male) - (1.3098 * Pclass2) - (2.5806 * Pclass3) - (0.0370 * Age)\n",
2247 | "\n",
2248 | "# Если предсказываем логарифм шансов для Ж в 1кл в возрасте 20 лет:\n",
2249 | "# log(odds) = 3.7770 - (2.5228 * 0) - (1.3098 * 0) - (2.5806 * 0) - (0.0370 * 20)\n",
2250 | "# log(odds) = 3.7770 - (0.0370 * 20)\n",
2251 | "# log(odds) = 3.037\n",
2252 | "\n",
2253 | "3.7770 - (0.0370 * 20)"
2254 | ]
2255 | },
2256 | {
2257 | "cell_type": "code",
2258 | "execution_count": 64,
2259 | "metadata": {},
2260 | "outputs": [
2261 | {
2262 | "data": {
2263 | "text/plain": [
2264 | "657.2831255018241"
2265 | ]
2266 | },
2267 | "execution_count": 64,
2268 | "metadata": {},
2269 | "output_type": "execute_result"
2270 | }
2271 | ],
2272 | "source": [
2273 | "# Кстати, у предыдущей модели (C(Sex) * C(Pclass), без возраста) AIC был ниже (654.28 против 657.28), то есть она была немного лучше.\n",
2274 | "glm_binomial.aic"
2275 | ]
2276 | },
2277 | {
2278 | "cell_type": "code",
2279 | "execution_count": null,
2280 | "metadata": {},
2281 | "outputs": [],
2282 | "source": []
2283 | }
2284 | ],
2285 | "metadata": {
2286 | "kernelspec": {
2287 | "display_name": "Python 3",
2288 | "language": "python",
2289 | "name": "python3"
2290 | },
2291 | "language_info": {
2292 | "codemirror_mode": {
2293 | "name": "ipython",
2294 | "version": 3
2295 | },
2296 | "file_extension": ".py",
2297 | "mimetype": "text/x-python",
2298 | "name": "python",
2299 | "nbconvert_exporter": "python",
2300 | "pygments_lexer": "ipython3",
2301 | "version": "3.7.4"
2302 | }
2303 | },
2304 | "nbformat": 4,
2305 | "nbformat_minor": 2
2306 | }
2307 |
--------------------------------------------------------------------------------