├── Analyzing CIA Factbook Data Using SQL ├── Analyzing NYC High School Data ├── Answering Business Questions using SQL.ipynb ├── Building a Spam Filter with Naive Bayes.ipynb ├── Clean And Analyze Employee Exit Surveys.ipynb ├── Exploring Ebay Car Sales Data.ipynb ├── Exploring Hacker News Posts.ipynb ├── Finding the Best Markets to Advertise In.ipynb ├── Investigating Fandango Movie Ratings.ipynb ├── Mobile App for Lottery Addiction.ipynb ├── Popular Data Science Questions.ipynb ├── Predicting Bike Rentals.ipynb ├── Predicting Car Prices.ipynb ├── Predicting House Sale Prices.ipynb ├── Predicting the stock market.py ├── Profitable App Profiles for the App Store and Google Play Markets.ipynb ├── Star Wars Survey ├── The Ultimate Halloween Candy Power Ranking.ipynb ├── Visualizing Earnings Based On College Majors.ipynb ├── Visualizing The Gender Gap In College Degrees.ipynb └── Workload forecast using linear regression - the case of a call center.ipynb /Analyzing CIA Factbook Data Using SQL: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Table exploration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "'Connected: None@factbook.db'" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "%%capture\n", 30 | "%load_ext sql\n", 31 | "%sql sqlite:///factbook.db" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 8, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Done.\n" 46 | ] 47 | }, 48 | { 49 | "data": { 50 | "text/html": [ 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 
57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
typenametbl_namerootpagesql
tablesqlite_sequencesqlite_sequence3CREATE TABLE sqlite_sequence(name,seq)
tablefactsfacts47CREATE TABLE "facts" ("id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, "code" varchar(255) NOT NULL, "name" varchar(255) NOT NULL, "area" integer, "area_land" integer, "area_water" integer, "population" integer, "population_growth" float, "birth_rate" float, "death_rate" float, "migration_rate" float)
" 74 | ], 75 | "text/plain": [ 76 | "[('table', 'sqlite_sequence', 'sqlite_sequence', 3, 'CREATE TABLE sqlite_sequence(name,seq)'),\n", 77 | " ('table', 'facts', 'facts', 47, 'CREATE TABLE \"facts\" (\"id\" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \"code\" varchar(255) NOT NULL, \"name\" varchar(255) NOT NULL, \"area\" integer, \"a ... (4 characters truncated) ... land\" integer, \"area_water\" integer, \"population\" integer, \"population_growth\" float, \"birth_rate\" float, \"death_rate\" float, \"migration_rate\" float)')]" 78 | ] 79 | }, 80 | "execution_count": 8, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "%%sql\n", 87 | "SELECT * FROM sqlite_master WHERE type='table'" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 10, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Done.\n" 102 | ] 103 | }, 104 | { 105 | "data": { 106 | "text/html": [ 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 
181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | "
idcodenameareaarea_landarea_waterpopulationpopulation_growthbirth_ratedeath_ratemigration_rate
1afAfghanistan6522306522300325643422.3238.5713.891.51
2alAlbania2874827398135030292780.312.926.583.3
3agAlgeria238174123817410395421661.8423.674.310.92
4anAndorra4684680855800.128.136.960.0
5aoAngola124670012467000196253532.7838.7811.490.46
" 187 | ], 188 | "text/plain": [ 189 | "[(1, 'af', 'Afghanistan', 652230, 652230, 0, 32564342, 2.32, 38.57, 13.89, 1.51),\n", 190 | " (2, 'al', 'Albania', 28748, 27398, 1350, 3029278, 0.3, 12.92, 6.58, 3.3),\n", 191 | " (3, 'ag', 'Algeria', 2381741, 2381741, 0, 39542166, 1.84, 23.67, 4.31, 0.92),\n", 192 | " (4, 'an', 'Andorra', 468, 468, 0, 85580, 0.12, 8.13, 6.96, 0.0),\n", 193 | " (5, 'ao', 'Angola', 1246700, 1246700, 0, 19625353, 2.78, 38.78, 11.49, 0.46)]" 194 | ] 195 | }, 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "%%sql\n", 203 | "SELECT * FROM facts LIMIT 5" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Minimum and maximum population and population growth" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 11, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Done.\n" 225 | ] 226 | }, 227 | { 228 | "data": { 229 | "text/html": [ 230 | "\n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | "
MIN(population)MAX(population)MIN(population_growth)MAX(population_growth)
072564900110.04.02
" 244 | ], 245 | "text/plain": [ 246 | "[(0, 7256490011, 0.0, 4.02)]" 247 | ] 248 | }, 249 | "execution_count": 11, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "%%sql\n", 256 | "SELECT MIN(population),MAX(population),MIN(population_growth),MAX(population_growth) FROM facts" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 15, 262 | "metadata": { 263 | "collapsed": false 264 | }, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "Done.\n" 271 | ] 272 | }, 273 | { 274 | "data": { 275 | "text/html": [ 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | "
idcodenameareaarea_landarea_waterpopulationpopulation_growthbirth_ratedeath_ratemigration_rate
261xxWorldNoneNoneNone72564900111.0818.67.8None
" 304 | ], 305 | "text/plain": [ 306 | "[(261, 'xx', 'World', None, None, None, 7256490011, 1.08, 18.6, 7.8, None)]" 307 | ] 308 | }, 309 | "execution_count": 15, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "%%sql\n", 316 | "SELECT * FROM facts\n", 317 | "WHERE population == (SELECT MAX(population) FROM facts)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 16, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Done.\n" 332 | ] 333 | }, 334 | { 335 | "data": { 336 | "text/html": [ 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | "
idcodenameareaarea_landarea_waterpopulationpopulation_growthbirth_ratedeath_ratemigration_rate
250ayAntarcticaNone280000None0NoneNoneNoneNone
" 365 | ], 366 | "text/plain": [ 367 | "[(250, 'ay', 'Antarctica', None, 280000, None, 0, None, None, None, None)]" 368 | ] 369 | }, 370 | "execution_count": 16, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "%%sql\n", 377 | "SELECT * FROM facts\n", 378 | "WHERE population == (SELECT MIN(population) FROM facts)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "# Countries with above average population and below average area" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 19, 391 | "metadata": { 392 | "collapsed": false 393 | }, 394 | "outputs": [ 395 | { 396 | "name": "stdout", 397 | "output_type": "stream", 398 | "text": [ 399 | "Done.\n" 400 | ] 401 | }, 402 | { 403 | "data": { 404 | "text/html": [ 405 | "\n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | "
avg_popavg_area
62094928.32231405555093.546184739
" 415 | ], 416 | "text/plain": [ 417 | "[(62094928.32231405, 555093.546184739)]" 418 | ] 419 | }, 420 | "execution_count": 19, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "%%sql\n", 427 | "SELECT AVG(population) avg_pop, AVG(area) avg_area FROM facts" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 21, 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "Done.\n" 442 | ] 443 | }, 444 | { 445 | "data": { 446 | "text/html": [ 447 | "\n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | "
namepopulationarea
Bangladesh168957745148460
Germany80854408357022
Japan126919659377915
Philippines100998376300000
Thailand67976405513120
United Kingdom64088222243610
Vietnam94348835331210
" 489 | ], 490 | "text/plain": [ 491 | "[('Bangladesh', 168957745, 148460),\n", 492 | " ('Germany', 80854408, 357022),\n", 493 | " ('Japan', 126919659, 377915),\n", 494 | " ('Philippines', 100998376, 300000),\n", 495 | " ('Thailand', 67976405, 513120),\n", 496 | " ('United Kingdom', 64088222, 243610),\n", 497 | " ('Vietnam', 94348835, 331210)]" 498 | ] 499 | }, 500 | "execution_count": 21, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "%%sql\n", 507 | "SELECT name,population,area FROM facts\n", 508 | "WHERE population > (SELECT AVG(population) FROM facts) AND area < (SELECT AVG(area) FROM facts)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "# Which countries have the highest ratios of water to land?" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 27, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "Done.\n" 530 | ] 531 | }, 532 | { 533 | "data": { 534 | "text/html": [ 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | "
namewater_to_land
British Indian Ocean Territory905.6666666666666
Virgin Islands4.520231213872832
Puerto Rico0.5547914317925592
Bahamas, The0.3866133866133866
Guinea-Bissau0.2846728307254623
" 561 | ], 562 | "text/plain": [ 563 | "[('British Indian Ocean Territory', 905.6666666666666),\n", 564 | " ('Virgin Islands', 4.520231213872832),\n", 565 | " ('Puerto Rico', 0.5547914317925592),\n", 566 | " ('Bahamas, The', 0.3866133866133866),\n", 567 | " ('Guinea-Bissau', 0.2846728307254623)]" 568 | ] 569 | }, 570 | "execution_count": 27, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | } 574 | ], 575 | "source": [ 576 | "%%sql\n", 577 | "SELECT name,CAST(area_water as Float)/CAST(area_land as Float) water_to_land FROM facts\n", 578 | "ORDER BY water_to_land DESC\n", 579 | "LIMIT 5" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "# Which countries have more water than land?" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 28, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [ 596 | { 597 | "name": "stdout", 598 | "output_type": "stream", 599 | "text": [ 600 | "Done.\n" 601 | ] 602 | }, 603 | { 604 | "data": { 605 | "text/html": [ 606 | "\n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | "
name
British Indian Ocean Territory
Virgin Islands
" 617 | ], 618 | "text/plain": [ 619 | "[('British Indian Ocean Territory',), ('Virgin Islands',)]" 620 | ] 621 | }, 622 | "execution_count": 28, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "%%sql\n", 629 | "SELECT name FROM facts\n", 630 | "WHERE area_water > area_land" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "# Which countries will add the most people to their population next year?" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 38, 643 | "metadata": { 644 | "collapsed": false 645 | }, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "Done.\n" 652 | ] 653 | }, 654 | { 655 | "data": { 656 | "text/html": [ 657 | "\n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | "
nametotal_growth
World78370092.1188
India15270686.1248
China6153684.246
Nigeria4448270.372
Pakistan2906653.3662
" 683 | ], 684 | "text/plain": [ 685 | "[('World', 78370092.1188),\n", 686 | " ('India', 15270686.1248),\n", 687 | " ('China', 6153684.246),\n", 688 | " ('Nigeria', 4448270.372),\n", 689 | " ('Pakistan', 2906653.3662)]" 690 | ] 691 | }, 692 | "execution_count": 38, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "%%sql\n", 699 | "SELECT name, (population * population_growth / 100) total_growth FROM facts\n", 700 | "ORDER BY total_growth DESC\n", 701 | "LIMIT 5" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "metadata": {}, 707 | "source": [ 708 | "# Which countries have a higher death rate than birth rate?" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 40, 714 | "metadata": { 715 | "collapsed": false 716 | }, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "Done.\n" 723 | ] 724 | }, 725 | { 726 | "data": { 727 | "text/html": [ 728 | "\n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | "
namebirth_ratedeath_rate
Austria9.419.42
Belarus10.713.36
Bosnia and Herzegovina8.879.75
Bulgaria8.9214.44
Croatia9.4512.18
" 760 | ], 761 | "text/plain": [ 762 | "[('Austria', 9.41, 9.42),\n", 763 | " ('Belarus', 10.7, 13.36),\n", 764 | " ('Bosnia and Herzegovina', 8.87, 9.75),\n", 765 | " ('Bulgaria', 8.92, 14.44),\n", 766 | " ('Croatia', 9.45, 12.18)]" 767 | ] 768 | }, 769 | "execution_count": 40, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "%%sql\n", 776 | "SELECT name,birth_rate,death_rate FROM facts\n", 777 | "WHERE birth_rate < death_rate\n", 778 | "LIMIT 5" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "metadata": { 785 | "collapsed": true 786 | }, 787 | "outputs": [], 788 | "source": [] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": { 794 | "collapsed": true 795 | }, 796 | "outputs": [], 797 | "source": [] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": null, 802 | "metadata": { 803 | "collapsed": true 804 | }, 805 | "outputs": [], 806 | "source": [] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": { 812 | "collapsed": true 813 | }, 814 | "outputs": [], 815 | "source": [] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": { 821 | "collapsed": true 822 | }, 823 | "outputs": [], 824 | "source": [] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": { 830 | "collapsed": true 831 | }, 832 | "outputs": [], 833 | "source": [] 834 | } 835 | ], 836 | "metadata": { 837 | "kernelspec": { 838 | "display_name": "Python 3", 839 | "language": "python", 840 | "name": "python3" 841 | }, 842 | "language_info": { 843 | "codemirror_mode": { 844 | "name": "ipython", 845 | "version": 3 846 | }, 847 | "file_extension": ".py", 848 | "mimetype": "text/x-python", 849 | "name": "python", 850 | "nbconvert_exporter": "python", 851 | "pygments_lexer": "ipython3", 852 | "version": "3.4.3" 853 | } 854 | }, 855 | 
"nbformat": 4, 856 | "nbformat_minor": 2 857 | } 858 | -------------------------------------------------------------------------------- /Exploring Hacker News Posts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Hacker News\n", 10 | "In this notebook we will answer two questions:\n", 11 | "- Do `Ask HN` or `Show HN` receive more comments on average?\n", 12 | "- Do posts created at a certain time receive more comments on average?" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'],\n", 26 | " ['12224879',\n", 27 | " 'Interactive Dynamic Video',\n", 28 | " 'http://www.interactivedynamicvideo.com/',\n", 29 | " '386',\n", 30 | " '52',\n", 31 | " 'ne0phyte',\n", 32 | " '8/4/2016 11:52'],\n", 33 | " ['10975351',\n", 34 | " 'How to Use Open Source and Shut the Fuck Up at the Same Time',\n", 35 | " 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/',\n", 36 | " '39',\n", 37 | " '10',\n", 38 | " 'josep2',\n", 39 | " '1/26/2016 19:30'],\n", 40 | " ['11964716',\n", 41 | " \"Florida DJs May Face Felony for April Fools' Water Joke\",\n", 42 | " 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/',\n", 43 | " '2',\n", 44 | " '1',\n", 45 | " 'vezycash',\n", 46 | " '6/23/2016 22:20'],\n", 47 | " ['11919867',\n", 48 | " 'Technology ventures: From Idea to Enterprise',\n", 49 | " 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',\n", 50 | " '3',\n", 51 | " '1',\n", 52 | " 'hswarna',\n", 53 | " '6/17/2016 0:01']]" 54 | ] 55 | }, 56 | "execution_count": 4, 57 | "metadata": {}, 58 | 
"output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from csv import reader\n", 63 | "hn = list(reader(open('hacker_news.csv')))\n", 64 | "hn[:5]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']" 78 | ] 79 | }, 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "headers = hn[0]\n", 87 | "hn = hn[1:]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']" 101 | ] 102 | }, 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "headers" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "[['12224879',\n", 123 | " 'Interactive Dynamic Video',\n", 124 | " 'http://www.interactivedynamicvideo.com/',\n", 125 | " '386',\n", 126 | " '52',\n", 127 | " 'ne0phyte',\n", 128 | " '8/4/2016 11:52'],\n", 129 | " ['10975351',\n", 130 | " 'How to Use Open Source and Shut the Fuck Up at the Same Time',\n", 131 | " 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/',\n", 132 | " '39',\n", 133 | " '10',\n", 134 | " 'josep2',\n", 135 | " '1/26/2016 19:30'],\n", 136 | " ['11964716',\n", 137 | " \"Florida DJs May Face Felony for April Fools' Water Joke\",\n", 138 | " 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/',\n", 139 | " '2',\n", 140 | " '1',\n", 141 | " 
'vezycash',\n", 142 | " '6/23/2016 22:20']]" 143 | ] 144 | }, 145 | "execution_count": 9, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "hn[:3]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 10, 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "ask_posts = []\n", 163 | "show_posts = []\n", 164 | "other_posts = []" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "for row in hn:\n", 176 | " title = row[1]\n", 177 | " if title.lower().startswith('ask hn'):\n", 178 | " ask_posts.append(row)\n", 179 | " elif title.lower().startswith('show hn'):\n", 180 | " show_posts.append(row)\n", 181 | " else:\n", 182 | " other_posts.append(row)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 12, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "1744\n", 197 | "1162\n", 198 | "17194\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "print(len(ask_posts))\n", 204 | "print(len(show_posts))\n", 205 | "print(len(other_posts))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 14, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "14.038417431192661" 219 | ] 220 | }, 221 | "execution_count": 14, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "total_ask_comments = 0\n", 228 | "for row in ask_posts:\n", 229 | " total_ask_comments += int(row[4])\n", 230 | "avg_ask_comments = total_ask_comments / len(ask_posts)\n", 231 | "avg_ask_comments" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 15, 237 | "metadata": { 
238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "10.31669535283993" 245 | ] 246 | }, 247 | "execution_count": 15, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "total_show_comments = 0\n", 254 | "for row in show_posts:\n", 255 | " total_show_comments += int(row[4])\n", 256 | "avg_show_comments = total_show_comments / len(show_posts)\n", 257 | "avg_show_comments" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "#### Do Ask HN or Show HN receive more comments on average?\n", 265 | "The answer is `Ask HN`, receiving about 4 more comments on average than `Show HN` posts (roughly 14 versus 10).\n", 266 | "\n", 267 | "From now on we will focus on `Ask HN` posts. Next, we look at whether the hour a post is created affects the number of comments it receives." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 16, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "import datetime as dt" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 27, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "results_list = []\n", 290 | "for row in ask_posts:\n", 291 | " results_list.append([row[6],row[4]])" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 28, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "[['8/16/2016 9:55', '6'], ['11/22/2015 13:43', '29'], ['5/2/2016 10:14', '1']]" 305 | ] 306 | }, 307 | "execution_count": 28, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "results_list[:3]" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 29, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "counts_by_hour = {}\n", 
325 | "comments_by_hour = {}\n", 326 | "\n", 327 | "for row in results_list:\n", 328 | " hour = dt.datetime.strptime(row[0],'%m/%d/%Y %H:%M').strftime('%H')\n", 329 | " if hour not in counts_by_hour:\n", 330 | " counts_by_hour[hour] = 1\n", 331 | " comments_by_hour[hour] = int(row[1])\n", 332 | " else:\n", 333 | " counts_by_hour[hour] += 1\n", 334 | " comments_by_hour[hour] += int(row[1])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 31, 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "{'00': 55,\n", 348 | " '01': 60,\n", 349 | " '02': 58,\n", 350 | " '03': 54,\n", 351 | " '04': 47,\n", 352 | " '05': 46,\n", 353 | " '06': 44,\n", 354 | " '07': 34,\n", 355 | " '08': 48,\n", 356 | " '09': 45,\n", 357 | " '10': 59,\n", 358 | " '11': 58,\n", 359 | " '12': 73,\n", 360 | " '13': 85,\n", 361 | " '14': 107,\n", 362 | " '15': 116,\n", 363 | " '16': 108,\n", 364 | " '17': 100,\n", 365 | " '18': 109,\n", 366 | " '19': 110,\n", 367 | " '20': 80,\n", 368 | " '21': 109,\n", 369 | " '22': 71,\n", 370 | " '23': 68}" 371 | ] 372 | }, 373 | "execution_count": 31, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "counts_by_hour" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 33, 385 | "metadata": { 386 | "collapsed": false 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "avg_by_hour = []\n", 391 | "for hour in counts_by_hour:\n", 392 | " avg_by_hour.append([hour,comments_by_hour[hour]/counts_by_hour[hour]])" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 34, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "[['04', 7.170212765957447],\n", 406 | " ['11', 11.051724137931034],\n", 407 | " ['23', 7.985294117647059],\n", 408 | " ['01', 11.383333333333333],\n", 409 | " ['18', 
13.20183486238532],\n", 410 | " ['05', 10.08695652173913],\n", 411 | " ['03', 7.796296296296297],\n", 412 | " ['07', 7.852941176470588],\n", 413 | " ['08', 10.25],\n", 414 | " ['10', 13.440677966101696],\n", 415 | " ['00', 8.127272727272727],\n", 416 | " ['14', 13.233644859813085],\n", 417 | " ['22', 6.746478873239437],\n", 418 | " ['06', 9.022727272727273],\n", 419 | " ['20', 21.525],\n", 420 | " ['12', 9.41095890410959],\n", 421 | " ['02', 23.810344827586206],\n", 422 | " ['16', 16.796296296296298],\n", 423 | " ['13', 14.741176470588234],\n", 424 | " ['15', 38.5948275862069],\n", 425 | " ['19', 10.8],\n", 426 | " ['09', 5.5777777777777775],\n", 427 | " ['21', 16.009174311926607],\n", 428 | " ['17', 11.46]]" 429 | ] 430 | }, 431 | "execution_count": 34, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "avg_by_hour" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 35, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "text/plain": [ 450 | "[[7.170212765957447, '04'],\n", 451 | " [11.051724137931034, '11'],\n", 452 | " [7.985294117647059, '23'],\n", 453 | " [11.383333333333333, '01'],\n", 454 | " [13.20183486238532, '18'],\n", 455 | " [10.08695652173913, '05'],\n", 456 | " [7.796296296296297, '03'],\n", 457 | " [7.852941176470588, '07'],\n", 458 | " [10.25, '08'],\n", 459 | " [13.440677966101696, '10'],\n", 460 | " [8.127272727272727, '00'],\n", 461 | " [13.233644859813085, '14'],\n", 462 | " [6.746478873239437, '22'],\n", 463 | " [9.022727272727273, '06'],\n", 464 | " [21.525, '20'],\n", 465 | " [9.41095890410959, '12'],\n", 466 | " [23.810344827586206, '02'],\n", 467 | " [16.796296296296298, '16'],\n", 468 | " [14.741176470588234, '13'],\n", 469 | " [38.5948275862069, '15'],\n", 470 | " [10.8, '19'],\n", 471 | " [5.5777777777777775, '09'],\n", 472 | " [16.009174311926607, '21'],\n", 473 | " [11.46, '17']]" 474 | ] 475 | }, 476 | 
"execution_count": 35, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "swap_avg_by_hour = []\n", 483 | "for row in avg_by_hour:\n", 484 | " swap_avg_by_hour.append([row[1],row[0]])\n", 485 | "swap_avg_by_hour" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 36, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "text/plain": [ 498 | "[[38.5948275862069, '15'],\n", 499 | " [23.810344827586206, '02'],\n", 500 | " [21.525, '20'],\n", 501 | " [16.796296296296298, '16'],\n", 502 | " [16.009174311926607, '21'],\n", 503 | " [14.741176470588234, '13'],\n", 504 | " [13.440677966101696, '10'],\n", 505 | " [13.233644859813085, '14'],\n", 506 | " [13.20183486238532, '18'],\n", 507 | " [11.46, '17'],\n", 508 | " [11.383333333333333, '01'],\n", 509 | " [11.051724137931034, '11'],\n", 510 | " [10.8, '19'],\n", 511 | " [10.25, '08'],\n", 512 | " [10.08695652173913, '05'],\n", 513 | " [9.41095890410959, '12'],\n", 514 | " [9.022727272727273, '06'],\n", 515 | " [8.127272727272727, '00'],\n", 516 | " [7.985294117647059, '23'],\n", 517 | " [7.852941176470588, '07'],\n", 518 | " [7.796296296296297, '03'],\n", 519 | " [7.170212765957447, '04'],\n", 520 | " [6.746478873239437, '22'],\n", 521 | " [5.5777777777777775, '09']]" 522 | ] 523 | }, 524 | "execution_count": 36, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "sorted_swap = sorted(swap_avg_by_hour,reverse=True)\n", 531 | "sorted_swap" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 41, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "Top 5 Hours for Ask Posts Comments\n", 546 | "15:00 38.59 average comments per post\n", 547 | "02:00 23.81 average comments per post\n", 548 | "20:00 21.52 average comments per 
post\n", 549 | "16:00 16.80 average comments per post\n", 550 | "21:00 16.01 average comments per post\n", 551 | "13:00 14.74 average comments per post\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "print('Top 5 Hours for Ask Posts Comments')\n", 557 | "for row in sorted_swap[:6]:\n", 558 | " print('{} {:.2f} average comments per post'.format(dt.datetime.strptime(row[1],'%H').strftime('%H:%M'),row[0]))" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "If you want to increase your chance of receiving comments, create your ask post around 15:00, the hour with the highest average number of comments per post (38.59)." 566 | ] 567 | } 568 | ], 569 | "metadata": { 570 | "kernelspec": { 571 | "display_name": "Python 3", 572 | "language": "python", 573 | "name": "python3" 574 | }, 575 | "language_info": { 576 | "codemirror_mode": { 577 | "name": "ipython", 578 | "version": 3 579 | }, 580 | "file_extension": ".py", 581 | "mimetype": "text/x-python", 582 | "name": "python", 583 | "nbconvert_exporter": "python", 584 | "pygments_lexer": "ipython3", 585 | "version": "3.4.3" 586 | } 587 | }, 588 | "nbformat": 4, 589 | "nbformat_minor": 2 590 | } 591 | -------------------------------------------------------------------------------- /Mobile App for Lottery Addiction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Mobile App for Lottery Addiction\n", 10 | "\n", 11 | "A medical institute that aims to prevent and treat gambling addictions wants to build a dedicated mobile app to help lottery addicts better estimate their chances of winning. 
The institute has a team of engineers that will build the app, but they need us to create the logical core of the app and calculate probabilities.\n", 12 | "\n", 13 | "For the first version of the app, they want us to focus on the 6/49 lottery and build functions that enable users to answer questions like:\n", 14 | "\n", 15 | "- What is the probability of winning the big prize with a single ticket?\n", 16 | "- What is the probability of winning the big prize if we play 40 different tickets (or any other number)?\n", 17 | "- What is the probability of having at least five (or four, or three, or two) winning numbers on a single ticket?\n", 18 | "\n", 19 | "The institute also wants us to consider historical data coming from the national 6/49 lottery game in Canada. The data set has data for 3,665 drawings, dating from 1982 to 2018." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns\n", 31 | "import numpy as np\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "#### Auxiliary functions\n", 40 | "\n", 41 | "We will start by defining two functions that will help us throughout the probability calculations: `factorial` and `combinations`." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "def factorial(n):\n", 51 | " if n == 0:\n", 52 | " return 1\n", 53 | " else:\n", 54 | " return n * factorial(n-1)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 7, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "120" 66 | ] 67 | }, 68 | "execution_count": 7, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "factorial(5)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 14, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def combinations(n,k):\n", 84 | " return (factorial(n)) / (factorial(k) * factorial(n-k))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 16, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "10.0" 96 | ] 97 | }, 98 | "execution_count": 16, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "combinations(5,2)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## First app version:\n", 112 | "\n", 113 | "For the first version of the app, we want players to be able to calculate the probability of winning the big prize with the various numbers they play on a single ticket (for each ticket a player chooses six numbers out of 49). 
So, we'll start by building a function that calculates the probability of winning the big prize for any given ticket.\n", 114 | "\n", 115 | "We discussed with the engineering team of the medical institute, and they told us we need to be aware of the following details when we write the function:\n", 116 | "\n", 117 | "- Inside the app, the user inputs six different numbers from 1 to 49.\n", 118 | "- Under the hood, the six numbers will come as a Python list, which will serve as the single input to our function.\n", 119 | "- The engineering team wants the function to print the probability value in a friendly way — in a way that people without any probability training are able to understand." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 54, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "def one_ticket_probability(numbers):\n", 129 | " total_outcomes = combinations(49,6)\n", 130 | " successful_outcomes = 1\n", 131 | " probability = successful_outcomes / total_outcomes\n", 132 | " print('Option 1: Betting on these numbers you would win the big prize {} out of 100,000,000 (100 million) times!\\nOption 2: Betting on these numbers you would win the big prize 1 out of {:,} times!'.format(int(round(probability*100000000,0)),int(1/probability)))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 55, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "Option 1: Betting on these numbers you would win the big prize 7 out of 100,000,000 (100 million) times!\n", 145 | "Option 2: Betting on these numbers you would win the big prize 1 out of 13,983,816 times!\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "one_ticket_probability([1,2,3,4,5,6])" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "#### Choosing the format\n", 158 | "\n", 159 | "Raw probability would have been 
printed as:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 39, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "7.151123842018516e-08\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "total_outcomes = combinations(49,6)\n", 177 | "successful_outcomes = 1\n", 178 | "probability = successful_outcomes / total_outcomes\n", 179 | "print(probability)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "which is not a very user-friendly way to present the result. Therefore, we opted to have the function express the probability as the number of times someone would win the big prize out of a given number of plays. We will keep option 2." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 105, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "def one_ticket_probability(numbers):\n", 196 | " total_outcomes = combinations(49,6)\n", 197 | " successful_outcomes = 1\n", 198 | " probability = successful_outcomes / total_outcomes\n", 199 | " print('Betting on these numbers you would win the big prize 1 out of {:,} times!'.format(int(1/probability)))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Checking historical data\n", 207 | "\n", 208 | "We'll focus on exploring the historical data coming from the Canada 6/49 lottery. The data set can be downloaded from [Kaggle](https://www.kaggle.com/datascienceai/lottery-dataset).\n", 209 | "\n", 210 | "The data set contains historical data for 3,665 drawings (each row shows data for a single drawing), dating from 1982 to 2018. 
For each drawing, we can find the six numbers drawn in the following six columns:\n", 211 | "\n", 212 | "- NUMBER DRAWN 1\n", 213 | "- NUMBER DRAWN 2\n", 214 | "- NUMBER DRAWN 3\n", 215 | "- NUMBER DRAWN 4\n", 216 | "- NUMBER DRAWN 5\n", 217 | "- NUMBER DRAWN 6\n", 218 | "\n", 219 | "We will start by exploring the dataframe." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 56, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "(3665, 11)" 231 | ] 232 | }, 233 | "execution_count": 56, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "hist = pd.read_csv('649.csv')\n", 240 | "hist.shape" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 59, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/html": [ 251 | "
\n", 252 | "\n", 265 | "\n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | "
PRODUCTDRAW NUMBERSEQUENCE NUMBERDRAW DATENUMBER DRAWN 1NUMBER DRAWN 2NUMBER DRAWN 3NUMBER DRAWN 4NUMBER DRAWN 5NUMBER DRAWN 6BONUS NUMBER
0649106/12/19823111214414313
3664649359106/20/201814243135374817
\n", 313 | "
" 314 | ], 315 | "text/plain": [ 316 | " PRODUCT DRAW NUMBER SEQUENCE NUMBER DRAW DATE NUMBER DRAWN 1 \\\n", 317 | "0 649 1 0 6/12/1982 3 \n", 318 | "3664 649 3591 0 6/20/2018 14 \n", 319 | "\n", 320 | " NUMBER DRAWN 2 NUMBER DRAWN 3 NUMBER DRAWN 4 NUMBER DRAWN 5 \\\n", 321 | "0 11 12 14 41 \n", 322 | "3664 24 31 35 37 \n", 323 | "\n", 324 | " NUMBER DRAWN 6 BONUS NUMBER \n", 325 | "0 43 13 \n", 326 | "3664 48 17 " 327 | ] 328 | }, 329 | "execution_count": 59, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "hist.loc[[0,3664]]" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "#### Creating an extraction function\n", 343 | "\n", 344 | "This function will receive a row from the dataset and will return a Python set with the six winning numbers for that extraction." 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 65, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "3" 356 | ] 357 | }, 358 | "execution_count": 65, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "hist.iloc[0,4]" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 89, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "numbers_columns = range(4,10)\n", 374 | "def extract_numbers(row):\n", 375 | " numbers_drawn = []\n", 376 | " for i in numbers_columns:\n", 377 | " numbers_drawn.append(row[i])\n", 378 | " return set(numbers_drawn)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "We will create a new column on the DataFrame with all the winning numbers as a Python set." 
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 90, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "hist['ALL NUMBERS'] = hist.apply(extract_numbers,axis=1)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 91, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/html": [ 405 | "
\n", 406 | "\n", 419 | "\n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " 
\n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | 
" \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 
| " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " 
\n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | 
" \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 
| " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | "
PRODUCTDRAW NUMBERSEQUENCE NUMBERDRAW DATENUMBER DRAWN 1NUMBER DRAWN 2NUMBER DRAWN 3NUMBER DRAWN 4NUMBER DRAWN 5NUMBER DRAWN 6BONUS NUMBERALL NUMBERS
0649106/12/19823111214414313{3, 41, 11, 12, 43, 14}
1649206/19/1982833363739419{33, 36, 37, 39, 8, 41}
2649306/26/1982162324273934{1, 6, 39, 23, 24, 27}
3649407/3/1982391013204334{3, 9, 10, 43, 13, 20}
4649507/10/19825142131344745{34, 5, 14, 47, 21, 31}
5649607/17/19828202125314133{8, 41, 20, 21, 25, 31}
6649707/24/19821825283336427{33, 36, 42, 18, 25, 28}
7649807/31/19827161731404826{7, 40, 16, 17, 48, 31}
8649908/7/19825102327373833{37, 5, 38, 10, 23, 27}
96491008/14/1982415303746483{4, 37, 46, 15, 48, 30}
106491108/21/1982792133384245{33, 38, 7, 9, 42, 21}
116491208/28/19821117192036439{36, 11, 43, 17, 19, 20}
126491309/4/19827141720374734{37, 7, 14, 47, 17, 20}
136491409/11/19822528293035443{35, 44, 25, 28, 29, 30}
146491509/18/19828183639414731{36, 39, 8, 41, 47, 18}
156491609/25/19829121314444818{9, 12, 13, 14, 44, 48}
1664917010/2/1982414184043445{4, 40, 43, 44, 14, 18}
1764918010/9/198213161834353626{34, 35, 36, 13, 16, 18}
1864919010/16/198211232528293627{36, 11, 23, 25, 28, 29}
1964920010/23/19827182325374539{37, 7, 45, 18, 23, 25}
2064921010/30/198211181931374541{37, 11, 45, 18, 19, 31}
2164922011/6/19828141618314845{8, 14, 16, 48, 18, 31}
2264923011/13/19824112324254541{4, 11, 45, 23, 24, 25}
2364924011/20/1982341933344839{33, 34, 3, 4, 48, 19}
2464925011/27/19825172128304336{5, 43, 17, 21, 28, 30}
2564926012/4/19826172436384629{36, 6, 38, 46, 17, 24}
2664927012/11/198249101143463{4, 9, 10, 11, 43, 46}
2764928012/18/19827132332334536{32, 33, 7, 13, 45, 23}
2864929012/25/198211182228353742{35, 37, 11, 18, 22, 28}
296493001/1/198325263135454812{35, 45, 48, 25, 26, 31}
.......................................
3635649356203/10/20182102843444839{2, 10, 43, 44, 48, 28}
3636649356303/14/2018191019334321{1, 33, 9, 10, 43, 19}
3637649356403/17/201816173543444649{35, 43, 44, 46, 16, 17}
3638649356503/21/201815912394128{1, 5, 39, 9, 41, 12}
3639649356603/24/2018911283045495{9, 11, 45, 49, 28, 30}
3640649356703/28/201811121924284639{11, 12, 46, 19, 24, 28}
3641649356803/31/2018372226434748{3, 7, 43, 47, 22, 26}
3642649356904/4/20186891334404{34, 6, 8, 9, 40, 13}
3643649357004/7/201818223640474932{36, 40, 47, 49, 18, 22}
3644649357104/11/2018371032414333{32, 3, 7, 41, 10, 43}
3645649357204/14/2018111193648499{1, 36, 11, 48, 49, 19}
3646649357304/18/201812171922273938{39, 12, 17, 19, 22, 27}
3647649357404/21/20181219203041499{41, 12, 49, 19, 20, 30}
3648649357504/25/20181014151640466{40, 10, 14, 15, 16, 46}
3649649357604/28/201824313334444610{33, 34, 44, 46, 24, 31}
3650649357705/2/2018472836374921{4, 37, 36, 7, 49, 28}
3651649357805/5/20184182426334834{33, 4, 48, 18, 24, 26}
3652649357905/9/201815172124363732{36, 37, 15, 17, 21, 24}
3653649358005/12/20186172134374413{34, 37, 6, 44, 17, 21}
3654649358105/16/20181921283039467{39, 46, 19, 21, 28, 30}
3655649358205/19/201814152834384430{34, 38, 44, 14, 15, 28}
3656649358305/23/2018491520424431{4, 9, 42, 44, 15, 20}
3657649358405/26/20181213162339442{39, 12, 13, 44, 16, 23}
3658649358505/30/20189141721383922{38, 39, 9, 14, 17, 21}
3659649358606/2/20182263240454929{32, 2, 40, 45, 49, 26}
3660649358706/6/201810152338404135{38, 40, 41, 10, 15, 23}
3661649358806/9/201819253136464726{36, 46, 47, 19, 25, 31}
3662649358906/13/20186222431323416{32, 34, 6, 22, 24, 31}
3663649359006/16/2018215213138498{2, 38, 15, 49, 21, 31}
3664649359106/20/201814243135374817{35, 37, 14, 48, 24, 31}
\n", 1355 | "

3665 rows × 12 columns

\n", 1356 | "
" 1357 | ], 1358 | "text/plain": [ 1359 | " PRODUCT DRAW NUMBER SEQUENCE NUMBER DRAW DATE NUMBER DRAWN 1 \\\n", 1360 | "0 649 1 0 6/12/1982 3 \n", 1361 | "1 649 2 0 6/19/1982 8 \n", 1362 | "2 649 3 0 6/26/1982 1 \n", 1363 | "3 649 4 0 7/3/1982 3 \n", 1364 | "4 649 5 0 7/10/1982 5 \n", 1365 | "5 649 6 0 7/17/1982 8 \n", 1366 | "6 649 7 0 7/24/1982 18 \n", 1367 | "7 649 8 0 7/31/1982 7 \n", 1368 | "8 649 9 0 8/7/1982 5 \n", 1369 | "9 649 10 0 8/14/1982 4 \n", 1370 | "10 649 11 0 8/21/1982 7 \n", 1371 | "11 649 12 0 8/28/1982 11 \n", 1372 | "12 649 13 0 9/4/1982 7 \n", 1373 | "13 649 14 0 9/11/1982 25 \n", 1374 | "14 649 15 0 9/18/1982 8 \n", 1375 | "15 649 16 0 9/25/1982 9 \n", 1376 | "16 649 17 0 10/2/1982 4 \n", 1377 | "17 649 18 0 10/9/1982 13 \n", 1378 | "18 649 19 0 10/16/1982 11 \n", 1379 | "19 649 20 0 10/23/1982 7 \n", 1380 | "20 649 21 0 10/30/1982 11 \n", 1381 | "21 649 22 0 11/6/1982 8 \n", 1382 | "22 649 23 0 11/13/1982 4 \n", 1383 | "23 649 24 0 11/20/1982 3 \n", 1384 | "24 649 25 0 11/27/1982 5 \n", 1385 | "25 649 26 0 12/4/1982 6 \n", 1386 | "26 649 27 0 12/11/1982 4 \n", 1387 | "27 649 28 0 12/18/1982 7 \n", 1388 | "28 649 29 0 12/25/1982 11 \n", 1389 | "29 649 30 0 1/1/1983 25 \n", 1390 | "... ... ... ... ... ... 
\n", 1391 | "3635 649 3562 0 3/10/2018 2 \n", 1392 | "3636 649 3563 0 3/14/2018 1 \n", 1393 | "3637 649 3564 0 3/17/2018 16 \n", 1394 | "3638 649 3565 0 3/21/2018 1 \n", 1395 | "3639 649 3566 0 3/24/2018 9 \n", 1396 | "3640 649 3567 0 3/28/2018 11 \n", 1397 | "3641 649 3568 0 3/31/2018 3 \n", 1398 | "3642 649 3569 0 4/4/2018 6 \n", 1399 | "3643 649 3570 0 4/7/2018 18 \n", 1400 | "3644 649 3571 0 4/11/2018 3 \n", 1401 | "3645 649 3572 0 4/14/2018 1 \n", 1402 | "3646 649 3573 0 4/18/2018 12 \n", 1403 | "3647 649 3574 0 4/21/2018 12 \n", 1404 | "3648 649 3575 0 4/25/2018 10 \n", 1405 | "3649 649 3576 0 4/28/2018 24 \n", 1406 | "3650 649 3577 0 5/2/2018 4 \n", 1407 | "3651 649 3578 0 5/5/2018 4 \n", 1408 | "3652 649 3579 0 5/9/2018 15 \n", 1409 | "3653 649 3580 0 5/12/2018 6 \n", 1410 | "3654 649 3581 0 5/16/2018 19 \n", 1411 | "3655 649 3582 0 5/19/2018 14 \n", 1412 | "3656 649 3583 0 5/23/2018 4 \n", 1413 | "3657 649 3584 0 5/26/2018 12 \n", 1414 | "3658 649 3585 0 5/30/2018 9 \n", 1415 | "3659 649 3586 0 6/2/2018 2 \n", 1416 | "3660 649 3587 0 6/6/2018 10 \n", 1417 | "3661 649 3588 0 6/9/2018 19 \n", 1418 | "3662 649 3589 0 6/13/2018 6 \n", 1419 | "3663 649 3590 0 6/16/2018 2 \n", 1420 | "3664 649 3591 0 6/20/2018 14 \n", 1421 | "\n", 1422 | " NUMBER DRAWN 2 NUMBER DRAWN 3 NUMBER DRAWN 4 NUMBER DRAWN 5 \\\n", 1423 | "0 11 12 14 41 \n", 1424 | "1 33 36 37 39 \n", 1425 | "2 6 23 24 27 \n", 1426 | "3 9 10 13 20 \n", 1427 | "4 14 21 31 34 \n", 1428 | "5 20 21 25 31 \n", 1429 | "6 25 28 33 36 \n", 1430 | "7 16 17 31 40 \n", 1431 | "8 10 23 27 37 \n", 1432 | "9 15 30 37 46 \n", 1433 | "10 9 21 33 38 \n", 1434 | "11 17 19 20 36 \n", 1435 | "12 14 17 20 37 \n", 1436 | "13 28 29 30 35 \n", 1437 | "14 18 36 39 41 \n", 1438 | "15 12 13 14 44 \n", 1439 | "16 14 18 40 43 \n", 1440 | "17 16 18 34 35 \n", 1441 | "18 23 25 28 29 \n", 1442 | "19 18 23 25 37 \n", 1443 | "20 18 19 31 37 \n", 1444 | "21 14 16 18 31 \n", 1445 | "22 11 23 24 25 \n", 1446 | "23 4 19 33 34 \n", 1447 | "24 
17 21 28 30 \n", 1448 | "25 17 24 36 38 \n", 1449 | "26 9 10 11 43 \n", 1450 | "27 13 23 32 33 \n", 1451 | "28 18 22 28 35 \n", 1452 | "29 26 31 35 45 \n", 1453 | "... ... ... ... ... \n", 1454 | "3635 10 28 43 44 \n", 1455 | "3636 9 10 19 33 \n", 1456 | "3637 17 35 43 44 \n", 1457 | "3638 5 9 12 39 \n", 1458 | "3639 11 28 30 45 \n", 1459 | "3640 12 19 24 28 \n", 1460 | "3641 7 22 26 43 \n", 1461 | "3642 8 9 13 34 \n", 1462 | "3643 22 36 40 47 \n", 1463 | "3644 7 10 32 41 \n", 1464 | "3645 11 19 36 48 \n", 1465 | "3646 17 19 22 27 \n", 1466 | "3647 19 20 30 41 \n", 1467 | "3648 14 15 16 40 \n", 1468 | "3649 31 33 34 44 \n", 1469 | "3650 7 28 36 37 \n", 1470 | "3651 18 24 26 33 \n", 1471 | "3652 17 21 24 36 \n", 1472 | "3653 17 21 34 37 \n", 1473 | "3654 21 28 30 39 \n", 1474 | "3655 15 28 34 38 \n", 1475 | "3656 9 15 20 42 \n", 1476 | "3657 13 16 23 39 \n", 1477 | "3658 14 17 21 38 \n", 1478 | "3659 26 32 40 45 \n", 1479 | "3660 15 23 38 40 \n", 1480 | "3661 25 31 36 46 \n", 1481 | "3662 22 24 31 32 \n", 1482 | "3663 15 21 31 38 \n", 1483 | "3664 24 31 35 37 \n", 1484 | "\n", 1485 | " NUMBER DRAWN 6 BONUS NUMBER ALL NUMBERS \n", 1486 | "0 43 13 {3, 41, 11, 12, 43, 14} \n", 1487 | "1 41 9 {33, 36, 37, 39, 8, 41} \n", 1488 | "2 39 34 {1, 6, 39, 23, 24, 27} \n", 1489 | "3 43 34 {3, 9, 10, 43, 13, 20} \n", 1490 | "4 47 45 {34, 5, 14, 47, 21, 31} \n", 1491 | "5 41 33 {8, 41, 20, 21, 25, 31} \n", 1492 | "6 42 7 {33, 36, 42, 18, 25, 28} \n", 1493 | "7 48 26 {7, 40, 16, 17, 48, 31} \n", 1494 | "8 38 33 {37, 5, 38, 10, 23, 27} \n", 1495 | "9 48 3 {4, 37, 46, 15, 48, 30} \n", 1496 | "10 42 45 {33, 38, 7, 9, 42, 21} \n", 1497 | "11 43 9 {36, 11, 43, 17, 19, 20} \n", 1498 | "12 47 34 {37, 7, 14, 47, 17, 20} \n", 1499 | "13 44 3 {35, 44, 25, 28, 29, 30} \n", 1500 | "14 47 31 {36, 39, 8, 41, 47, 18} \n", 1501 | "15 48 18 {9, 12, 13, 14, 44, 48} \n", 1502 | "16 44 5 {4, 40, 43, 44, 14, 18} \n", 1503 | "17 36 26 {34, 35, 36, 13, 16, 18} \n", 1504 | "18 36 27 {36, 11, 23, 25, 28, 
29} \n", 1505 | "19 45 39 {37, 7, 45, 18, 23, 25} \n", 1506 | "20 45 41 {37, 11, 45, 18, 19, 31} \n", 1507 | "21 48 45 {8, 14, 16, 48, 18, 31} \n", 1508 | "22 45 41 {4, 11, 45, 23, 24, 25} \n", 1509 | "23 48 39 {33, 34, 3, 4, 48, 19} \n", 1510 | "24 43 36 {5, 43, 17, 21, 28, 30} \n", 1511 | "25 46 29 {36, 6, 38, 46, 17, 24} \n", 1512 | "26 46 3 {4, 9, 10, 11, 43, 46} \n", 1513 | "27 45 36 {32, 33, 7, 13, 45, 23} \n", 1514 | "28 37 42 {35, 37, 11, 18, 22, 28} \n", 1515 | "29 48 12 {35, 45, 48, 25, 26, 31} \n", 1516 | "... ... ... ... \n", 1517 | "3635 48 39 {2, 10, 43, 44, 48, 28} \n", 1518 | "3636 43 21 {1, 33, 9, 10, 43, 19} \n", 1519 | "3637 46 49 {35, 43, 44, 46, 16, 17} \n", 1520 | "3638 41 28 {1, 5, 39, 9, 41, 12} \n", 1521 | "3639 49 5 {9, 11, 45, 49, 28, 30} \n", 1522 | "3640 46 39 {11, 12, 46, 19, 24, 28} \n", 1523 | "3641 47 48 {3, 7, 43, 47, 22, 26} \n", 1524 | "3642 40 4 {34, 6, 8, 9, 40, 13} \n", 1525 | "3643 49 32 {36, 40, 47, 49, 18, 22} \n", 1526 | "3644 43 33 {32, 3, 7, 41, 10, 43} \n", 1527 | "3645 49 9 {1, 36, 11, 48, 49, 19} \n", 1528 | "3646 39 38 {39, 12, 17, 19, 22, 27} \n", 1529 | "3647 49 9 {41, 12, 49, 19, 20, 30} \n", 1530 | "3648 46 6 {40, 10, 14, 15, 16, 46} \n", 1531 | "3649 46 10 {33, 34, 44, 46, 24, 31} \n", 1532 | "3650 49 21 {4, 37, 36, 7, 49, 28} \n", 1533 | "3651 48 34 {33, 4, 48, 18, 24, 26} \n", 1534 | "3652 37 32 {36, 37, 15, 17, 21, 24} \n", 1535 | "3653 44 13 {34, 37, 6, 44, 17, 21} \n", 1536 | "3654 46 7 {39, 46, 19, 21, 28, 30} \n", 1537 | "3655 44 30 {34, 38, 44, 14, 15, 28} \n", 1538 | "3656 44 31 {4, 9, 42, 44, 15, 20} \n", 1539 | "3657 44 2 {39, 12, 13, 44, 16, 23} \n", 1540 | "3658 39 22 {38, 39, 9, 14, 17, 21} \n", 1541 | "3659 49 29 {32, 2, 40, 45, 49, 26} \n", 1542 | "3660 41 35 {38, 40, 41, 10, 15, 23} \n", 1543 | "3661 47 26 {36, 46, 47, 19, 25, 31} \n", 1544 | "3662 34 16 {32, 34, 6, 22, 24, 31} \n", 1545 | "3663 49 8 {2, 38, 15, 49, 21, 31} \n", 1546 | "3664 48 17 {35, 37, 14, 48, 24, 31} \n", 1547 | "\n", 1548 
| "[3665 rows x 12 columns]" 1549 | ] 1550 | }, 1551 | "execution_count": 91, 1552 | "metadata": {}, 1553 | "output_type": "execute_result" 1554 | } 1555 | ], 1556 | "source": [ 1557 | "hist" 1558 | ] 1559 | }, 1560 | { 1561 | "cell_type": "markdown", 1562 | "metadata": {}, 1563 | "source": [ 1564 | "#### Checking historical occurrence\n", 1565 | "\n", 1566 | "We will create a function to check whether a specific combination has ever occurred in history." 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "code", 1571 | "execution_count": 93, 1572 | "metadata": { 1573 | "scrolled": true 1574 | }, 1575 | "outputs": [ 1576 | { 1577 | "data": { 1578 | "text/plain": [ 1579 | "0 True\n", 1580 | "1 False\n", 1581 | "2 False\n", 1582 | "3 False\n", 1583 | "4 False\n", 1584 | "5 False\n", 1585 | "6 False\n", 1586 | "7 False\n", 1587 | "8 False\n", 1588 | "9 False\n", 1589 | "10 False\n", 1590 | "11 False\n", 1591 | "12 False\n", 1592 | "13 False\n", 1593 | "14 False\n", 1594 | "15 False\n", 1595 | "16 False\n", 1596 | "17 False\n", 1597 | "18 False\n", 1598 | "19 False\n", 1599 | "20 False\n", 1600 | "21 False\n", 1601 | "22 False\n", 1602 | "23 False\n", 1603 | "24 False\n", 1604 | "25 False\n", 1605 | "26 False\n", 1606 | "27 False\n", 1607 | "28 False\n", 1608 | "29 False\n", 1609 | " ... 
\n", 1610 | "3635 False\n", 1611 | "3636 False\n", 1612 | "3637 False\n", 1613 | "3638 False\n", 1614 | "3639 False\n", 1615 | "3640 False\n", 1616 | "3641 False\n", 1617 | "3642 False\n", 1618 | "3643 False\n", 1619 | "3644 False\n", 1620 | "3645 False\n", 1621 | "3646 False\n", 1622 | "3647 False\n", 1623 | "3648 False\n", 1624 | "3649 False\n", 1625 | "3650 False\n", 1626 | "3651 False\n", 1627 | "3652 False\n", 1628 | "3653 False\n", 1629 | "3654 False\n", 1630 | "3655 False\n", 1631 | "3656 False\n", 1632 | "3657 False\n", 1633 | "3658 False\n", 1634 | "3659 False\n", 1635 | "3660 False\n", 1636 | "3661 False\n", 1637 | "3662 False\n", 1638 | "3663 False\n", 1639 | "3664 False\n", 1640 | "Name: ALL NUMBERS, Length: 3665, dtype: bool" 1641 | ] 1642 | }, 1643 | "execution_count": 93, 1644 | "metadata": {}, 1645 | "output_type": "execute_result" 1646 | } 1647 | ], 1648 | "source": [ 1649 | "hist['ALL NUMBERS'] == {3, 41, 11, 12, 43, 14}" 1650 | ] 1651 | }, 1652 | { 1653 | "cell_type": "code", 1654 | "execution_count": 108, 1655 | "metadata": {}, 1656 | "outputs": [], 1657 | "source": [ 1658 | "def check_historical_occurence(numbers,history):\n", 1659 | " numbers_set = set(numbers)\n", 1660 | " num_of_occurences = (history == numbers_set).sum()\n", 1661 | " total_extractions = len(history)\n", 1662 | " print('The combination {} has occurred {} time(s) in history ({} extractions).\\n'.format(numbers_set,num_of_occurences,total_extractions))\n", 1663 | " one_ticket_probability(numbers)" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "code", 1668 | "execution_count": 109, 1669 | "metadata": { 1670 | "scrolled": true 1671 | }, 1672 | "outputs": [ 1673 | { 1674 | "name": "stdout", 1675 | "output_type": "stream", 1676 | "text": [ 1677 | "The combination {3, 41, 11, 12, 43, 14} has occurred 1 time(s) in history (3665 extractions).\n", 1678 | "\n", 1679 | "Betting on these numbers you would win the big prize 1 out of 13,983,816 times!\n" 1680 | ] 1681 | } 1682 | ], 1683 
| "source": [ 1684 | "check_historical_occurence([3, 41, 11, 12, 43, 14],hist['ALL NUMBERS'])" 1685 | ] 1686 | }, 1687 | { 1688 | "cell_type": "markdown", 1689 | "metadata": {}, 1690 | "source": [ 1691 | "## Multi-ticket probability\n", 1692 | "\n", 1693 | "Lottery addicts usually play more than one ticket on a single drawing, thinking that this might increase their chances of winning significantly. Our purpose is to help them better estimate their chances of winning — on this screen, we're going to write a function that will allow the users to calculate the chances of winning for any number of different tickets.\n", 1694 | "\n", 1695 | "We've talked with the engineering team and they gave us the following information:\n", 1696 | "\n", 1697 | "- The user will input the number of different tickets they want to play (without inputting the specific combinations they intend to play).\n", 1698 | "- Our function will see an integer between 1 and 13,983,816 (the maximum number of different tickets).\n", 1699 | "- The function should print information about the probability of winning the big prize depending on the number of different tickets played." 
1700 | ] 1701 | }, 1702 | { 1703 | "cell_type": "code", 1704 | "execution_count": 162, 1705 | "metadata": {}, 1706 | "outputs": [], 1707 | "source": [ 1708 | "def multi_ticket_probability(successful_outcomes):\n", 1709 | " possible_outcomes = combinations(49,6)\n", 1710 | " probability = successful_outcomes / possible_outcomes\n", 1711 | " print('Betting on {:,} different tickets, you would win the big prize 1 out of {:,} times (a {:.2f}% chance)!'.format(successful_outcomes,int(1/probability),probability*100))" 1712 | ] 1713 | }, 1714 | { 1715 | "cell_type": "code", 1716 | "execution_count": 163, 1717 | "metadata": { 1718 | "scrolled": true 1719 | }, 1720 | "outputs": [ 1721 | { 1722 | "name": "stdout", 1723 | "output_type": "stream", 1724 | "text": [ 1725 | "Betting on 1 different tickets, you would win the big prize 1 out of 13,983,816 times (a 0.00% chance)!\n", 1726 | "Betting on 10 different tickets, you would win the big prize 1 out of 1,398,381 times (a 0.00% chance)!\n", 1727 | "Betting on 100 different tickets, you would win the big prize 1 out of 139,838 times (a 0.00% chance)!\n", 1728 | "Betting on 10,000 different tickets, you would win the big prize 1 out of 1,398 times (a 0.07% chance)!\n", 1729 | "Betting on 1,000,000 different tickets, you would win the big prize 1 out of 13 times (a 7.15% chance)!\n", 1730 | "Betting on 6,991,908 different tickets, you would win the big prize 1 out of 2 times (a 50.00% chance)!\n", 1731 | "Betting on 13,983,816 different tickets, you would win the big prize 1 out of 1 times (a 100.00% chance)!\n" 1732 | ] 1733 | } 1734 | ], 1735 | "source": [ 1736 | "multi_ticket_probability(1)\n", 1737 | "multi_ticket_probability(10)\n", 1738 | "multi_ticket_probability(100)\n", 1739 | "multi_ticket_probability(10000)\n", 1740 | "multi_ticket_probability(1000000)\n", 1741 | "multi_ticket_probability(6991908)\n", 1742 | "multi_ticket_probability(13983816)" 1743 | ] 1744 | }, 1745 | { 1746 | "cell_type": "markdown", 1747 | 
"metadata": {}, 1748 | "source": [ 1749 | "## Fewer winning numbers\n", 1750 | "\n", 1751 | "We're going to write one more function to allow the users to calculate probabilities for two, three, four, or five winning numbers.\n", 1752 | "\n", 1753 | "For extra context, in most 6/49 lotteries there are smaller prizes if a player's ticket matches two, three, four, or five of the six numbers drawn. As a consequence, the users might be interested in knowing the probability of having two, three, four, or five winning numbers.\n", 1754 | "\n", 1755 | "These are the engineering details we'll need to be aware of:\n", 1756 | "\n", 1757 | "- Inside the app, the user inputs:\n", 1758 | " - six different numbers from 1 to 49; and\n", 1759 | " - an integer between 2 and 5 that represents the number of winning numbers expected\n", 1760 | "- Our function prints information about the probability of having the inputted number of winning numbers.\n", 1761 | "\n", 1762 | "As the numbers inputted by the user are irrelevant, we will only code the function with one input: the number of correct lottery numbers." 
1763 | ] 1764 | }, 1765 | { 1766 | "cell_type": "code", 1767 | "execution_count": 164, 1768 | "metadata": {}, 1769 | "outputs": [], 1770 | "source": [ 1771 | "def probability_less_6(n):\n", 1772 | " total_outcomes = combinations(49,6)\n", 1773 | " successful_outcomes = combinations(6,n) * combinations(43,6-n)\n", 1774 | " probability = successful_outcomes / total_outcomes\n", 1775 | " print('You would get {} correct numbers 1 out of {:,} times (a {:.2f}% chance)!'.format(n,int(round(1/probability)),probability*100))" 1776 | ] 1777 | }, 1778 | { 1779 | "cell_type": "code", 1780 | "execution_count": 165, 1781 | "metadata": {}, 1782 | "outputs": [ 1783 | { 1784 | "name": "stdout", 1785 | "output_type": "stream", 1786 | "text": [ 1787 | "You would get 5 correct numbers 1 out of 54,201 times (a 0.00% chance)!\n", 1788 | "You would get 4 correct numbers 1 out of 1,032 times (a 0.10% chance)!\n", 1789 | "You would get 3 correct numbers 1 out of 57 times (a 1.77% chance)!\n", 1790 | "You would get 2 correct numbers 1 out of 8 times (a 13.24% chance)!\n", 1791 | "You would get 1 correct numbers 1 out of 2 times (a 41.30% chance)!\n" 1792 | ] 1793 | } 1794 | ], 1795 | "source": [ 1796 | "probability_less_6(5)\n", 1797 | "probability_less_6(4)\n", 1798 | "probability_less_6(3)\n", 1799 | "probability_less_6(2)\n", 1800 | "probability_less_6(1)" 1801 | ] 1802 | } 1803 | ], 1804 | "metadata": { 1805 | "kernelspec": { 1806 | "display_name": "Python 3", 1807 | "language": "python", 1808 | "name": "python3" 1809 | }, 1810 | "language_info": { 1811 | "codemirror_mode": { 1812 | "name": "ipython", 1813 | "version": 3 1814 | }, 1815 | "file_extension": ".py", 1816 | "mimetype": "text/x-python", 1817 | "name": "python", 1818 | "nbconvert_exporter": "python", 1819 | "pygments_lexer": "ipython3", 1820 | "version": "3.6.5" 1821 | } 1822 | }, 1823 | "nbformat": 4, 1824 | "nbformat_minor": 2 1825 | } 1826 | -------------------------------------------------------------------------------- 
/Predicting the stock market.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from datetime import datetime 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_absolute_error 6 | df = pd.read_csv('sphist.csv') 7 | df['Date'] = pd.to_datetime(df['Date']) 8 | df = df.sort_values('Date') 9 | df = df.reset_index(drop=True) 10 | df = df.reset_index() 11 | 12 | df.rename({'index':'day'},inplace=True,axis=1) 13 | 14 | # average price from the past X days 15 | def average_price(row,days): 16 | if row['day'] < days: 17 | return 0 18 | else: 19 | day_start = row['day'] - days 20 | day_end = row['day'] - 1 21 | return np.mean(df.loc[day_start:day_end,'Close']) 22 | 23 | df['mean_5_days'] = df.apply(average_price,axis=1,days=5) 24 | df['mean_30_days'] = df.apply(average_price,axis=1,days=30) 25 | df['mean_365_days'] = df.apply(average_price,axis=1,days=365) 26 | 27 | df = df.iloc[365:] 28 | df = df.dropna(axis=0) 29 | 30 | train = df[df['Date'] < datetime(year=2013,month=1,day=1)] 31 | test = df[df['Date'] >= datetime(year=2013,month=1,day=1)] 32 | 33 | features = ['mean_5_days','mean_30_days','mean_365_days'] 34 | lr = LinearRegression() 35 | lr.fit(train[features],train['Close']) 36 | predictions = lr.predict(test[features]) 37 | 38 | mae = mean_absolute_error(predictions,test['Close']) 39 | 40 | print(mae) -------------------------------------------------------------------------------- /Profitable App Profiles for the App Store and Google Play Markets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Finding what type of apps attract more users\n", 10 | "\n", 11 | "As a company developing free to download apps, our main source of revenue are the in-app ads. 
To maximize our revenue we need to attract as many users as possible so they watch our ads.\n", 12 | "\n", 13 | "With this project we intend to understand which apps are currently attracting more users." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def explore_data(dataset, start, end, rows_and_columns=False):\n", 25 | " dataset_slice = dataset[start:end] \n", 26 | " for row in dataset_slice:\n", 27 | " print(row)\n", 28 | " print('\\n') # adds a new (empty) line after each row\n", 29 | "\n", 30 | " if rows_and_columns:\n", 31 | " print('Number of rows:', len(dataset))\n", 32 | " print('Number of columns:', len(dataset[0]))" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "file_apple = open('AppleStore.csv')\n", 44 | "file_google = open('googleplaystore.csv')\n", 45 | "from csv import reader\n", 46 | "appstore = list(reader(file_apple))\n", 47 | "googleplay = list(reader(file_google))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "appstore_header = appstore[0]\n", 59 | "appstore = appstore[1:]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "googleplay_header = googleplay[0]\n", 71 | "googleplay = googleplay[1:]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "['id',\n", 85 | " 'track_name',\n", 86 | " 'size_bytes',\n", 87 | " 'currency',\n", 88 | " 'price',\n", 89 | " 'rating_count_tot',\n", 90 | " 'rating_count_ver',\n", 91 | " 
'user_rating',\n", 92 | " 'user_rating_ver',\n", 93 | " 'ver',\n", 94 | " 'cont_rating',\n", 95 | " 'prime_genre',\n", 96 | " 'sup_devices.num',\n", 97 | " 'ipadSc_urls.num',\n", 98 | " 'lang.num',\n", 99 | " 'vpp_lic']" 100 | ] 101 | }, 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "appstore_header" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "['App',\n", 122 | " 'Category',\n", 123 | " 'Rating',\n", 124 | " 'Reviews',\n", 125 | " 'Size',\n", 126 | " 'Installs',\n", 127 | " 'Type',\n", 128 | " 'Price',\n", 129 | " 'Content Rating',\n", 130 | " 'Genres',\n", 131 | " 'Last Updated',\n", 132 | " 'Current Ver',\n", 133 | " 'Android Ver']" 134 | ] 135 | }, 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "googleplay_header" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "metadata": { 149 | "collapsed": false, 150 | "scrolled": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']\n", 158 | "\n", 159 | "\n", 160 | "['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']\n", 161 | "\n", 162 | "\n", 163 | "['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']\n", 164 | "\n", 165 | "\n", 166 | "Number of rows: 7197\n", 167 | "Number of columns: 16\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "explore_data(appstore,0,3,True)" 173 | ] 174 | 
}, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 8, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']\n", 187 | "\n", 188 | "\n", 189 | "['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']\n", 190 | "\n", 191 | "\n", 192 | "['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']\n", 193 | "\n", 194 | "\n", 195 | "Number of rows: 10841\n", 196 | "Number of columns: 13\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "explore_data(googleplay,0,3,True)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 9, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "del googleplay[10472]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Information that can be relevant to our study:\n", 220 | "### AppStore\n", 221 | "(Check all the documentation [here](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps))\n", 222 | "- Price `[price]`\n", 223 | "- Number of user ratings `[rating_count_tot]`\n", 224 | "- Average user rating `[user_rating]`\n", 225 | "- Content rating `[cont_rating]`\n", 226 | "- Primary genre `[prime_genre]`\n", 227 | "\n", 228 | "### GooglePlay\n", 229 | "(Check all the documentation [here](https://www.kaggle.com/lava18/google-play-store-apps))\n", 230 | "- Rating `[Ratings]`\n", 231 | "- Reviews `[Reviews]`\n", 232 | "- Installs `[Installs]`\n", 233 | "- 
Type (paid or free) `[Type]`\n", 234 | "- Content rating `[Content Rating]`\n", 235 | "- Genre `[Genre]`\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 10, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "def get_dict(store,app_name_index):\n", 247 | " final_dict = {}\n", 248 | " for app in store:\n", 249 | " if app[app_name_index] in final_dict:\n", 250 | " final_dict[app[app_name_index]] += 1\n", 251 | " else:\n", 252 | " final_dict[app[app_name_index]] = 1\n", 253 | " return final_dict" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 11, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "def get_duplicates(dictionary):\n", 265 | " duplicates = []\n", 266 | " for app in dictionary:\n", 267 | " if dictionary[app] > 1:\n", 268 | " duplicates.append(app)\n", 269 | " return duplicates" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "['VR Roller Coaster', 'Mannequin Challenge']" 283 | ] 284 | }, 285 | "execution_count": 12, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "apple_duplicates = get_duplicates(get_dict(appstore,1))\n", 292 | "apple_duplicates" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Google Play duplicates\n", 300 | "We found out there are some duplicates at the Google Play database, as seen below:" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 13, 306 | "metadata": { 307 | "collapsed": false, 308 | "scrolled": true 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "Number of duplicate apps: 1181\n", 316 | "\n", 317 | "\n", 318 | 
"Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "duplicate_apps = []\n", 324 | "unique_apps = []\n", 325 | "\n", 326 | "for app in googleplay:\n", 327 | " if app[0] in unique_apps:\n", 328 | " duplicate_apps.append(app[0])\n", 329 | " else:\n", 330 | " unique_apps.append(app[0])\n", 331 | " \n", 332 | "print('Number of duplicate apps:',len(duplicate_apps))\n", 333 | "print('\\n')\n", 334 | "print('Examples of duplicate apps:',duplicate_apps[:10])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "We will remove duplicates, keeping only the version with more reviews." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 14, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "reviews_max = {}\n", 353 | "\n", 354 | "for app in googleplay:\n", 355 | " name = app[0]\n", 356 | " n_reviews = float(app[3])\n", 357 | " if name in reviews_max and reviews_max[name] < n_reviews:\n", 358 | " reviews_max[name] = n_reviews\n", 359 | " if name not in reviews_max:\n", 360 | " reviews_max[name] = n_reviews" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 15, 366 | "metadata": { 367 | "collapsed": false 368 | }, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "9659" 374 | ] 375 | }, 376 | "execution_count": 15, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "len(reviews_max)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "We will loop through the dataset to get a new list with unique apps." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 16, 395 | "metadata": { 396 | "collapsed": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "android_clean = []\n", 401 | "already_added = []\n", 402 | "\n", 403 | "for app in googleplay:\n", 404 | " name = app[0]\n", 405 | " n_reviews = float(app[3])\n", 406 | " if reviews_max[name] == n_reviews and name not in already_added:\n", 407 | " android_clean.append(app)\n", 408 | " already_added.append(name) " 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 17, 414 | "metadata": { 415 | "collapsed": false 416 | }, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "9659" 422 | ] 423 | }, 424 | "execution_count": 17, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "len(android_clean)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "We will check for apps with non-English characters and remove them." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 18, 443 | "metadata": { 444 | "collapsed": true 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "def english(string):\n", 449 | " count = 0\n", 450 | " for letter in string:\n", 451 | " if ord(letter) > 127:\n", 452 | " count += 1\n", 453 | " if count > 3:\n", 454 | " return False\n", 455 | " return True" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 19, 461 | "metadata": { 462 | "collapsed": false 463 | }, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "True\n", 470 | "False\n", 471 | "True\n", 472 | "True\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "print(english('Instagram'))\n", 478 | "print(english('爱奇艺PPS -《欢乐颂2》电视剧热播'))\n", 479 | "print(english('Docs To Go™ Free Office Suite'))\n", 480 | "print(english('Instachat 😜'))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 20, 486 | "metadata": { 487 | "collapsed": false 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "android_english = []\n", 492 | "appstore_english = []\n", 493 | "\n", 494 | "for app in android_clean:\n", 495 | " if english(app[0]):\n", 496 | " android_english.append(app)\n", 497 | " \n", 498 | "for app in appstore:\n", 499 | " if english(app[1]):\n", 500 | " appstore_english.append(app)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 21, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "9614\n", 515 | "6183\n" 516 | ] 517 | } 518 | ], 519 | "source": [ 520 | "print(len(android_english))\n", 521 | "print(len(appstore_english))" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": {}, 527 | "source": [ 528 | "We will isolate free apps, as they are the study objective" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | 
"execution_count": 22, 534 | "metadata": { 535 | "collapsed": false, 536 | "scrolled": false 537 | }, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']\n", 544 | "\n", 545 | "\n", 546 | "['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']\n", 547 | "\n", 548 | "\n", 549 | "Number of rows: 8863\n", 550 | "Number of columns: 13\n", 551 | "['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']\n", 552 | "\n", 553 | "\n", 554 | "['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']\n", 555 | "\n", 556 | "\n", 557 | "Number of rows: 3222\n", 558 | "Number of columns: 16\n" 559 | ] 560 | } 561 | ], 562 | "source": [ 563 | "android_free = []\n", 564 | "ios_free = []\n", 565 | "\n", 566 | "for app in android_english:\n", 567 | " if app[6] == 'Free':\n", 568 | " android_free.append(app)\n", 569 | " \n", 570 | "for app in appstore_english:\n", 571 | " if app[4] == '0.0':\n", 572 | " ios_free.append(app)\n", 573 | " \n", 574 | "explore_data(android_free,0,2,True)\n", 575 | "explore_data(ios_free,0,2,True)" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "metadata": {}, 581 | "source": [ 582 | "We found an app with Price '0' but not Type 'Free', so we corrected it" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 23, 588 | "metadata": { 589 | "collapsed": false 590 | }, 591 | "outputs": [ 592 | { 593 | "name": "stdout", 594 | "output_type": "stream", 595 | "text": [ 596 | "['App', 
'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']\n", 597 | "['Command & Conquer: Rivals', 'FAMILY', 'NaN', '0', 'Varies with device', '0', 'NaN', '0', 'Everyone 10+', 'Strategy', 'June 28, 2018', 'Varies with device', 'Varies with device']\n" 598 | ] 599 | } 600 | ], 601 | "source": [ 602 | "count = 0\n", 603 | "for app in android_english:\n", 604 | " if app[6] != 'Free' and app[7] == '0':\n", 605 | " mistake = app\n", 606 | " break\n", 607 | " count += 1\n", 608 | " \n", 609 | "print(googleplay_header)\n", 610 | "print(mistake)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 24, 616 | "metadata": { 617 | "collapsed": false, 618 | "scrolled": false 619 | }, 620 | "outputs": [], 621 | "source": [ 622 | "android_english[7939][6] = 'Free'" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 25, 628 | "metadata": { 629 | "collapsed": false, 630 | "scrolled": false 631 | }, 632 | "outputs": [ 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']\n", 638 | "\n", 639 | "\n", 640 | "['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']\n", 641 | "\n", 642 | "\n", 643 | "['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']\n", 644 | "\n", 645 | "\n", 646 | "Number of rows: 3222\n", 647 | "Number of columns: 16\n", 648 | "\n", 649 | "\n", 650 | "\n", 651 | "\n", 652 | "['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']\n", 653 | "\n", 654 | "\n", 655 | 
"['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']\n", 656 | "\n", 657 | "\n", 658 | "['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']\n", 659 | "\n", 660 | "\n", 661 | "Number of rows: 8863\n", 662 | "Number of columns: 13\n" 663 | ] 664 | } 665 | ], 666 | "source": [ 667 | "explore_data(ios_free,0,3,True)\n", 668 | "print('\\n\\n\\n')\n", 669 | "explore_data(android_free,0,3,True)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 26, 675 | "metadata": { 676 | "collapsed": true 677 | }, 678 | "outputs": [], 679 | "source": [ 680 | "android_header = googleplay_header\n", 681 | "ios_header = appstore_header" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.\n", 689 | "\n", 690 | "To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:\n", 691 | "\n", 692 | "1. Build a minimal Android version of the app, and add it to Google Play.\n", 693 | "2. If the app has a good response from users, we develop it further.\n", 694 | "3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.\n", 695 | "\n", 696 | "Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets. 
For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.\n", 697 | "\n", 698 | "Let's inspect the datasets' headers and find out what data can help us reach our goal:" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 27, 704 | "metadata": { 705 | "collapsed": false, 706 | "scrolled": true 707 | }, 708 | "outputs": [ 709 | { 710 | "name": "stdout", 711 | "output_type": "stream", 712 | "text": [ 713 | "['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']\n", 714 | "['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']\n" 715 | ] 716 | } 717 | ], 718 | "source": [ 719 | "print(android_header)\n", 720 | "print(ios_header)" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "The columns that describe the kind of app are `'Category'` and `'Genres'` for Android and `'prime_genre'` for iOS. 
See one example below for each store:" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 28, 733 | "metadata": { 734 | "collapsed": false, 735 | "scrolled": true 736 | }, 737 | "outputs": [ 738 | { 739 | "name": "stdout", 740 | "output_type": "stream", 741 | "text": [ 742 | "['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']\n", 743 | "\n", 744 | "\n", 745 | "['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']\n" 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "print(android_free[0])\n", 751 | "print('\\n')\n", 752 | "print(ios_free[0])" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 29, 758 | "metadata": { 759 | "collapsed": true 760 | }, 761 | "outputs": [], 762 | "source": [ 763 | "def freq_table(dataset,index):\n", 764 | " dictionary = {}\n", 765 | " for app in dataset:\n", 766 | " if app[index] in dictionary:\n", 767 | " dictionary[app[index]] += 100/len(dataset)\n", 768 | " else:\n", 769 | " dictionary[app[index]] = 100/len(dataset)\n", 770 | " return dictionary" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 30, 776 | "metadata": { 777 | "collapsed": true 778 | }, 779 | "outputs": [], 780 | "source": [ 781 | "def display_table(dataset, index):\n", 782 | " table = freq_table(dataset, index)\n", 783 | " table_display = []\n", 784 | " for key in table:\n", 785 | " key_val_as_tuple = (table[key], key)\n", 786 | " table_display.append(key_val_as_tuple)\n", 787 | "\n", 788 | " table_sorted = sorted(table_display, reverse = True)\n", 789 | " for entry in table_sorted:\n", 790 | " print(entry[1], ':', entry[0])" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 31, 796 | "metadata": { 797 | "collapsed": false, 798 | "scrolled": 
false 799 | }, 800 | "outputs": [ 801 | { 802 | "name": "stdout", 803 | "output_type": "stream", 804 | "text": [ 805 | "Games : 58.1626319056464\n", 806 | "Entertainment : 7.883302296710134\n", 807 | "Photo & Video : 4.965859714463075\n", 808 | "Education : 3.6623215394165176\n", 809 | "Social Networking : 3.2898820608317867\n", 810 | "Shopping : 2.6070763500931133\n", 811 | "Utilities : 2.5139664804469306\n", 812 | "Sports : 2.1415270018621997\n", 813 | "Music : 2.048417132216017\n", 814 | "Health & Fitness : 2.0173805090006227\n", 815 | "Productivity : 1.7380509000620747\n", 816 | "Lifestyle : 1.5828677839851035\n", 817 | "News : 1.3345747982619496\n", 818 | "Travel : 1.2414649286157668\n", 819 | "Finance : 1.1173184357541899\n", 820 | "Weather : 0.8690254500310364\n", 821 | "Food & Drink : 0.8069522036002481\n", 822 | "Reference : 0.558659217877095\n", 823 | "Business : 0.5276225946617009\n", 824 | "Book : 0.4345127250155184\n", 825 | "Navigation : 0.186219739292365\n", 826 | "Medical : 0.186219739292365\n", 827 | "Catalogs : 0.12414649286157665\n" 828 | ] 829 | } 830 | ], 831 | "source": [ 832 | "prime_genre_freq = display_table(ios_free,11)" 833 | ] 834 | }, 835 | { 836 | "cell_type": "markdown", 837 | "metadata": {}, 838 | "source": [ 839 | "AppStore is clearly dominated by entertainment apps (Games and Entertainment account for top 2)" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 32, 845 | "metadata": { 846 | "collapsed": false, 847 | "scrolled": false 848 | }, 849 | "outputs": [ 850 | { 851 | "name": "stdout", 852 | "output_type": "stream", 853 | "text": [ 854 | "Tools : 8.450863138892059\n", 855 | "Entertainment : 6.070179397495205\n", 856 | "Education : 5.348076272142616\n", 857 | "Business : 4.592124562789124\n", 858 | "Productivity : 3.8925871601038025\n", 859 | "Lifestyle : 3.8925871601038025\n", 860 | "Finance : 3.700778517432021\n", 861 | "Medical : 3.5315355974275078\n", 862 | "Sports : 3.4638384294257025\n", 863 | 
"Personalization : 3.317161232088458\n", 864 | "Communication : 3.2381812027530184\n", 865 | "Action : 3.102786866749408\n", 866 | "Health & Fitness : 3.0802211440821394\n", 867 | "Photography : 2.944826808078529\n", 868 | "News & Magazines : 2.798149610741284\n", 869 | "Social : 2.6627552747376737\n", 870 | "Travel & Local : 2.3242694347286474\n", 871 | "Shopping : 2.245289405393208\n", 872 | "Books & Reference : 2.1437436533905\n", 873 | "Simulation : 2.0421979013877922\n", 874 | "Dating : 1.8616721200496447\n", 875 | "Arcade : 1.8503892587160105\n", 876 | "Video Players & Editors : 1.771409229380571\n", 877 | "Casual : 1.7601263680469368\n", 878 | "Maps & Navigation : 1.399074805370642\n", 879 | "Food & Drink : 1.2411147466997632\n", 880 | "Puzzle : 1.128286133363421\n", 881 | "Racing : 0.9928917973598105\n", 882 | "Role Playing : 0.9364774906916394\n", 883 | "Libraries & Demo : 0.9364774906916394\n", 884 | "Auto & Vehicles : 0.9251946293580052\n", 885 | "Strategy : 0.9026289066907368\n", 886 | "House & Home : 0.8236488773552973\n", 887 | "Weather : 0.8010831546880289\n", 888 | "Events : 0.7108202640189553\n", 889 | "Adventure : 0.6769716800180526\n", 890 | "Comics : 0.6092745120162474\n", 891 | "Beauty : 0.5979916506826132\n", 892 | "Art & Design : 0.5979916506826132\n", 893 | "Parenting : 0.49644589867990524\n", 894 | "Card : 0.4513144533453684\n", 895 | "Casino : 0.4287487306781\n", 896 | "Trivia : 0.4174658693444658\n", 897 | "Educational;Education : 0.39490014667719736\n", 898 | "Board : 0.38361728534356315\n", 899 | "Educational : 0.37233442400992894\n", 900 | "Education;Education : 0.3384858400090263\n", 901 | "Word : 0.25950581067358686\n", 902 | "Casual;Pretend Play : 0.2369400880063184\n", 903 | "Music : 0.20309150400541578\n", 904 | "Racing;Action & Adventure : 0.16924292000451316\n", 905 | "Puzzle;Brain Games : 0.16924292000451316\n", 906 | "Entertainment;Music & Video : 0.16924292000451316\n", 907 | "Casual;Brain Games : 0.13539433600361053\n", 908 
| "Casual;Action & Adventure : 0.13539433600361053\n", 909 | "Arcade;Action & Adventure : 0.12411147466997631\n", 910 | "Action;Action & Adventure : 0.10154575200270789\n", 911 | "Educational;Pretend Play : 0.09026289066907368\n", 912 | "Simulation;Action & Adventure : 0.07898002933543948\n", 913 | "Parenting;Education : 0.07898002933543948\n", 914 | "Entertainment;Brain Games : 0.07898002933543948\n", 915 | "Board;Brain Games : 0.07898002933543948\n", 916 | "Parenting;Music & Video : 0.06769716800180527\n", 917 | "Educational;Brain Games : 0.06769716800180527\n", 918 | "Casual;Creativity : 0.06769716800180527\n", 919 | "Art & Design;Creativity : 0.06769716800180527\n", 920 | "Education;Pretend Play : 0.05641430666817105\n", 921 | "Role Playing;Pretend Play : 0.04513144533453684\n", 922 | "Education;Creativity : 0.04513144533453684\n", 923 | "Role Playing;Action & Adventure : 0.03384858400090263\n", 924 | "Puzzle;Action & Adventure : 0.03384858400090263\n", 925 | "Entertainment;Creativity : 0.03384858400090263\n", 926 | "Entertainment;Action & Adventure : 0.03384858400090263\n", 927 | "Educational;Creativity : 0.03384858400090263\n", 928 | "Educational;Action & Adventure : 0.03384858400090263\n", 929 | "Education;Music & Video : 0.03384858400090263\n", 930 | "Education;Brain Games : 0.03384858400090263\n", 931 | "Education;Action & Adventure : 0.03384858400090263\n", 932 | "Adventure;Action & Adventure : 0.03384858400090263\n", 933 | "Video Players & Editors;Music & Video : 0.02256572266726842\n", 934 | "Sports;Action & Adventure : 0.02256572266726842\n", 935 | "Simulation;Pretend Play : 0.02256572266726842\n", 936 | "Puzzle;Creativity : 0.02256572266726842\n", 937 | "Music;Music & Video : 0.02256572266726842\n", 938 | "Entertainment;Pretend Play : 0.02256572266726842\n", 939 | "Casual;Education : 0.02256572266726842\n", 940 | "Board;Action & Adventure : 0.02256572266726842\n", 941 | "Video Players & Editors;Creativity : 0.01128286133363421\n", 942 | 
"Trivia;Education : 0.01128286133363421\n", 943 | "Travel & Local;Action & Adventure : 0.01128286133363421\n", 944 | "Tools;Education : 0.01128286133363421\n", 945 | "Strategy;Education : 0.01128286133363421\n", 946 | "Strategy;Creativity : 0.01128286133363421\n", 947 | "Strategy;Action & Adventure : 0.01128286133363421\n", 948 | "Simulation;Education : 0.01128286133363421\n", 949 | "Role Playing;Brain Games : 0.01128286133363421\n", 950 | "Racing;Pretend Play : 0.01128286133363421\n", 951 | "Puzzle;Education : 0.01128286133363421\n", 952 | "Parenting;Brain Games : 0.01128286133363421\n", 953 | "Music & Audio;Music & Video : 0.01128286133363421\n", 954 | "Lifestyle;Pretend Play : 0.01128286133363421\n", 955 | "Lifestyle;Education : 0.01128286133363421\n", 956 | "Health & Fitness;Education : 0.01128286133363421\n", 957 | "Health & Fitness;Action & Adventure : 0.01128286133363421\n", 958 | "Entertainment;Education : 0.01128286133363421\n", 959 | "Communication;Creativity : 0.01128286133363421\n", 960 | "Comics;Creativity : 0.01128286133363421\n", 961 | "Casual;Music & Video : 0.01128286133363421\n", 962 | "Card;Action & Adventure : 0.01128286133363421\n", 963 | "Books & Reference;Education : 0.01128286133363421\n", 964 | "Art & Design;Pretend Play : 0.01128286133363421\n", 965 | "Art & Design;Action & Adventure : 0.01128286133363421\n", 966 | "Arcade;Pretend Play : 0.01128286133363421\n", 967 | "Adventure;Education : 0.01128286133363421\n" 968 | ] 969 | } 970 | ], 971 | "source": [ 972 | "genres_freq = display_table(android_free,9)" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": 33, 978 | "metadata": { 979 | "collapsed": false, 980 | "scrolled": true 981 | }, 982 | "outputs": [ 983 | { 984 | "name": "stdout", 985 | "output_type": "stream", 986 | "text": [ 987 | "FAMILY : 18.898792733837702\n", 988 | "GAME : 9.725826469592825\n", 989 | "TOOLS : 8.462146000225694\n", 990 | "BUSINESS : 4.592124562789124\n", 991 | "LIFESTYLE : 
3.9038700214374367\n", 992 | "PRODUCTIVITY : 3.8925871601038025\n", 993 | "FINANCE : 3.700778517432021\n", 994 | "MEDICAL : 3.5315355974275078\n", 995 | "SPORTS : 3.3961412614238973\n", 996 | "PERSONALIZATION : 3.317161232088458\n", 997 | "COMMUNICATION : 3.2381812027530184\n", 998 | "HEALTH_AND_FITNESS : 3.0802211440821394\n", 999 | "PHOTOGRAPHY : 2.944826808078529\n", 1000 | "NEWS_AND_MAGAZINES : 2.798149610741284\n", 1001 | "SOCIAL : 2.6627552747376737\n", 1002 | "TRAVEL_AND_LOCAL : 2.3355522960622817\n", 1003 | "SHOPPING : 2.245289405393208\n", 1004 | "BOOKS_AND_REFERENCE : 2.1437436533905\n", 1005 | "DATING : 1.8616721200496447\n", 1006 | "VIDEO_PLAYERS : 1.7939749520478394\n", 1007 | "MAPS_AND_NAVIGATION : 1.399074805370642\n", 1008 | "FOOD_AND_DRINK : 1.2411147466997632\n", 1009 | "EDUCATION : 1.1621347173643237\n", 1010 | "ENTERTAINMENT : 0.9590432133589079\n", 1011 | "LIBRARIES_AND_DEMO : 0.9364774906916394\n", 1012 | "AUTO_AND_VEHICLES : 0.9251946293580052\n", 1013 | "HOUSE_AND_HOME : 0.8236488773552973\n", 1014 | "WEATHER : 0.8010831546880289\n", 1015 | "EVENTS : 0.7108202640189553\n", 1016 | "PARENTING : 0.6544059573507842\n", 1017 | "ART_AND_DESIGN : 0.64312309601715\n", 1018 | "COMICS : 0.6205573733498816\n", 1019 | "BEAUTY : 0.5979916506826132\n" 1020 | ] 1021 | } 1022 | ], 1023 | "source": [ 1024 | "category_freq = display_table(android_free,1)" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "markdown", 1029 | "metadata": {}, 1030 | "source": [ 1031 | "It is harder to navigate through GooglePlay numbers as the Genres table is very extensive, so we will look towards the Category frequency table. 
Here the distribution is more diverse, but Family and Games still account for the top 2 categories, followed by productivity apps like Tools and Business.\n", 1032 | "\n", 1033 | "So, for now:\n", 1034 | "- AppStore is more directed towards Entertainment\n", 1035 | "- GooglePlay has a more diversified collection of apps\n", 1036 | "\n", 1037 | "We will now look at the number of users per genre. For GooglePlay, the `Installs` column gives us the number of downloads. The AppStore data has no install count, so we will use the total number of user ratings (`rating_count_tot`) as a proxy for the number of users." 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": 34, 1043 | "metadata": { 1044 | "collapsed": false, 1045 | "scrolled": true 1046 | }, 1047 | "outputs": [ 1048 | { 1049 | "name": "stdout", 1050 | "output_type": "stream", 1051 | "text": [ 1052 | "News: 21248.023255813954\n", 1053 | "Education: 7003.983050847458\n", 1054 | "Music: 57326.530303030304\n", 1055 | "Book: 39758.5\n", 1056 | "Food & Drink: 33333.92307692308\n", 1057 | "Entertainment: 14029.830708661417\n", 1058 | "Navigation: 86090.33333333333\n", 1059 | "Weather: 52279.892857142855\n", 1060 | "Business: 7491.117647058823\n", 1061 | "Catalogs: 4004.0\n", 1062 | "Health & Fitness: 23298.015384615384\n", 1063 | "Sports: 23008.898550724636\n", 1064 | "Travel: 28243.8\n", 1065 | "Games: 22788.6696905016\n", 1066 | "Shopping: 26919.690476190477\n", 1067 | "Lifestyle: 16485.764705882353\n", 1068 | "Social Networking: 71548.34905660378\n", 1069 | "Utilities: 18684.456790123455\n", 1070 | "Finance: 31467.944444444445\n", 1071 | "Photo & Video: 28441.54375\n", 1072 | "Medical: 612.0\n", 1073 | "Productivity: 21028.410714285714\n", 1074 | "Reference: 74942.11111111111\n" 1075 | ] 1076 | } 1077 | ], 1078 | "source": [ 1079 | "for genre in freq_table(ios_free,11):\n", 1080 | " total = 0\n", 1081 | " len_genre = 0\n", 1082 | " for app in ios_free:\n", 1083 | " genre_app = app[11]\n", 1084 | " if genre_app == 
genre:\n", 1085 | " total += float(app[5])\n", 1086 | " len_genre += 1\n", 1087 | " print(genre + ':',total/len_genre)" 1088 | ] 1089 | }, 1090 | { 1091 | "cell_type": "markdown", 1092 | "metadata": {}, 1093 | "source": [ 1094 | "Top 3:\n", 1095 | "- Navigation: 86090\n", 1096 | "- Reference: 74942\n", 1097 | "- Social Networking: 71548\n", 1098 | "\n", 1099 | "Regarding GooglePlay, the number of installs is given as open-ended ranges (0+, 1+, 5+, ... , 1,000,000,000+) rather than exact counts. As we do not need exact precision, we will remove the plus sign (+) and the thousands separators (,) and treat the result as the final number of installs." 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 36, 1105 | "metadata": { 1106 | "collapsed": false 1107 | }, 1108 | "outputs": [ 1109 | { 1110 | "data": { 1111 | "text/plain": [ 1112 | "['Photo Editor & Candy Camera & Grid & ScrapBook',\n", 1113 | " 'ART_AND_DESIGN',\n", 1114 | " '4.1',\n", 1115 | " '159',\n", 1116 | " '19M',\n", 1117 | " '10,000+',\n", 1118 | " 'Free',\n", 1119 | " '0',\n", 1120 | " 'Everyone',\n", 1121 | " 'Art & Design',\n", 1122 | " 'January 7, 2018',\n", 1123 | " '1.0.0',\n", 1124 | " '4.0.3 and up']" 1125 | ] 1126 | }, 1127 | "execution_count": 36, 1128 | "metadata": {}, 1129 | "output_type": "execute_result" 1130 | } 1131 | ], 1132 | "source": [ 1133 | "android_free[0]" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": 35, 1139 | "metadata": { 1140 | "collapsed": true 1141 | }, 1142 | "outputs": [], 1143 | "source": [ 1144 | "android_category_table = freq_table(android_free,1)" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": 40, 1150 | "metadata": { 1151 | "collapsed": false 1152 | }, 1153 | "outputs": [ 1154 | { 1155 | "name": "stdout", 1156 | "output_type": "stream", 1157 | "text": [ 1158 | "AUTO_AND_VEHICLES : 647318\n", 1159 | "EVENTS : 253542\n", 1160 | "FOOD_AND_DRINK : 1924898\n", 1161 | "HEALTH_AND_FITNESS : 4188822\n", 1162 | "SHOPPING : 7036877\n", 
1163 | "BOOKS_AND_REFERENCE : 8767812\n", 1164 | "ENTERTAINMENT : 11640706\n", 1165 | "TRAVEL_AND_LOCAL : 13984078\n", 1166 | "BEAUTY : 513152\n", 1167 | "PHOTOGRAPHY : 17840110\n", 1168 | "VIDEO_PLAYERS : 24727872\n", 1169 | "PRODUCTIVITY : 16787331\n", 1170 | "TOOLS : 10801391\n", 1171 | "NEWS_AND_MAGAZINES : 9549178\n", 1172 | "PERSONALIZATION : 5201483\n", 1173 | "SOCIAL : 23253652\n", 1174 | "HOUSE_AND_HOME : 1331541\n", 1175 | "LIBRARIES_AND_DEMO : 638504\n", 1176 | "LIFESTYLE : 1437816\n", 1177 | "BUSINESS : 1712290\n", 1178 | "MEDICAL : 120551\n", 1179 | "SPORTS : 3638640\n", 1180 | "COMMUNICATION : 38456119\n", 1181 | "EDUCATION : 1833495\n", 1182 | "FINANCE : 1387692\n", 1183 | "DATING : 854029\n", 1184 | "PARENTING : 542604\n", 1185 | "FAMILY : 3697848\n", 1186 | "MAPS_AND_NAVIGATION : 4056942\n", 1187 | "WEATHER : 5074486\n", 1188 | "COMICS : 817657\n", 1189 | "GAME : 15588016\n", 1190 | "ART_AND_DESIGN : 1986335\n" 1191 | ] 1192 | } 1193 | ], 1194 | "source": [ 1195 | "for category in android_category_table:\n", 1196 | " total = 0\n", 1197 | " len_category = 0\n", 1198 | " for app in android_free:\n", 1199 | " if app[1] == category:\n", 1200 | " num = app[5].replace('+','')\n", 1201 | " num = num.replace(',','')\n", 1202 | " total += float(num)\n", 1203 | " len_category += 1\n", 1204 | " print(category,':',round(total/len_category))" 1205 | ] 1206 | }, 1207 | { 1208 | "cell_type": "markdown", 1209 | "metadata": {}, 1210 | "source": [ 1211 | "Top 3:\n", 1212 | "- COMMUNICATION : 38456119\n", 1213 | "- VIDEO_PLAYERS : 24727872\n", 1214 | "- SOCIAL : 23253652" 1215 | ] 1216 | } 1217 | ], 1218 | "metadata": { 1219 | "kernelspec": { 1220 | "display_name": "Python 3", 1221 | "language": "python", 1222 | "name": "python3" 1223 | }, 1224 | "language_info": { 1225 | "codemirror_mode": { 1226 | "name": "ipython", 1227 | "version": 3 1228 | }, 1229 | "file_extension": ".py", 1230 | "mimetype": "text/x-python", 1231 | "name": "python", 1232 | "nbconvert_exporter": 
"python", 1233 | "pygments_lexer": "ipython3", 1234 | "version": "3.4.3" 1235 | } 1236 | }, 1237 | "nbformat": 4, 1238 | "nbformat_minor": 2 1239 | } 1240 | --------------------------------------------------------------------------------