├── .gitignore ├── Intro to Machine Learning.pdf ├── README.rst ├── data ├── Dynamic events table.csv ├── Dynamic subscription table.csv ├── accounts.csv ├── monday_datalearn.csv └── users.csv ├── datalearn19intro ├── LICENSE ├── README.rst ├── datalearn19intro │ ├── __init__.py │ └── dataloader.py ├── mit_license_badge.svg └── setup.py ├── part_1.introducing_jupyter.ipynb ├── part_2.numpy.ipynb ├── part_3.pandas.ipynb ├── part_4.EDA.ipynb ├── part_5.Preprocessing.ipynb ├── part_6.modeling.ipynb └── util_0.reading_the_data.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # vim swap files 127 | *.swp 128 | 129 | .DS_Store 130 | -------------------------------------------------------------------------------- /Intro to Machine Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHackIL/DataLearn-ML-Intro-2019/614ed306726f2b5f073b7da2d621069ecbd26023/Intro to Machine Learning.pdf -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DataLearn Supervised ML Intro 2019 2 | ################################## 3 | 4 | The repository of the hands-on introduction to machine learning workshop of the DataLearn 2019 track at DataHack 2019. 5 | 6 | Video link: https://youtu.be/Su8YcXgkDsk?t=1701 7 | 8 | `Meetup event link `_ 9 | 10 | Notebooks by Shay Palachy. Presentation by Shay Palachy and Dana Kaner. *(Thank you Dana <3)* 11 | 12 | Resources 13 | ========= 14 | 15 | * Presentation: 16 | 17 | * `Intro to Machine Learning presentation `_ 18 | * Credits: Shay Palachy **and Dana Kaner** 19 | 20 | * Video recording: https://youtu.be/Su8YcXgkDsk?t=1701 21 | 22 | * Notebooks: 23 | 24 | 1. `Introducing Jupyter notebooks `_ 25 | 2. `Introduction to numpy `_ 26 | 3. `Introduction to pandas `_ 27 | 4. `Exploratory Data Analysis `_ 28 | 5. `Preprocessing `_ 29 | 6. `Modeling `_ 30 | 7. 
`Utility: Reading the data `_ 31 | 32 | 33 | Outline 34 | ======= 35 | 36 | * Tools of the trade 37 | 38 | * Jupyter notebooks 39 | * numpy 40 | * pandas 41 | 42 | * Data exploration 43 | * Preprocessing 44 | 45 | * Imputation 46 | * Scaling and normalization 47 | * Handling outliers 48 | * Feature extraction/generation 49 | * Feature selection 50 | * Dimensionality reduction 51 | 52 | * Modeling 53 | 54 | * Model fit & loss functions 55 | * Splitting your data 56 | * Model evaluation 57 | * Hyperparameter Optimization 58 | -------------------------------------------------------------------------------- /data/Dynamic subscription table.csv: -------------------------------------------------------------------------------- 1 | "event_date","account_id","plan_id","event_type","invoice_charge_amount","prev_plan_id","status","status_reason","currency","invoice_charge_amount_usd","mrr_gain","subscription_id","next_charge_date","payment_type","transaction_date" 2 | "2019-01-07",2793955,199,"CHARGE",64.00,NULL,"","","AUD",44.67,46.00,65984302,"2019-02-07 21:05:00","CC","2019-01-07 21:05:00" 3 | "2019-05-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.94,0.00,65984302,"2019-06-07 20:05:00","CC","2019-05-07 20:34:00" 4 | "2019-03-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.20,0.00,65984302,"2019-04-07 20:05:00","CC","2019-03-07 21:33:00" 5 | "2019-06-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.62,0.00,65984302,"2019-07-07 20:05:00","CC","2019-06-07 20:34:00" 6 | "2019-04-10",2793955,199,"PAYMENT_METHOD_UPDATED",0.00,199,"","","",0.00,NULL,65984302,NULL,"CC","2019-04-10 02:32:00" 7 | "2019-02-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.52,-1.00,65984302,"2019-03-07 21:05:00","CC","2019-02-07 21:33:00" 8 | "2019-04-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.54,0.00,65984302,"2019-05-07 20:05:00","CC","2019-04-07 20:23:00" 9 | "2019-07-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.91,0.00,65984302,"2019-08-07 20:05:00","CC","2019-07-07 20:01:00" 10 | "2019-01-07",2794172,198,"CHARGE",29.00,NULL,"","","USD",29.00,29.00,65925498,"2019-02-07 07:09:00","CC","2019-01-07 07:09:00" 11 | "2019-01-28",2794172,198,"CANCEL_ON_RENEWAL",0.00,198,"","","USD",0.00,NULL,65925498,"2019-02-07 07:09:00","CC","2019-01-28 07:27:00" 12 | "2019-02-07",2794172,198,"CANCELLATION",0.00,198,"","","USD",0.00,-29.00,65925498,"2019-02-07 07:09:00","CC","2019-02-07 07:32:00" 13 | "2019-01-28",2794489,231,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","USD",0.00,-20.00,66852410,"2020-01-28 15:02:00","CC","2019-01-28 15:02:00" 14 | "2019-01-28",2794489,232,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-03-28 07:00:00","",NULL 15 | "2019-01-28",2794489,231,"RECURRING",0.00,231,"","","USD",0.00,NULL,66852410,"2020-01-28 15:02:00","BALANCE","2019-01-28 15:02:00" 16 | "2019-01-29",2794489,232,"REFUND",-258.72,232,"","","USD",-258.72,NULL,66852410,"2020-01-28 15:02:00","CC","2019-01-29 09:51:00" 17 | "2019-01-29",2794489,231,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-01-17 08:00:00","",NULL 18 | "2019-01-28",2794489,232,"RECURRING",258.72,232,"","","USD",258.72,NULL,66852410,"2020-01-28 14:40:00","CC","2019-01-28 14:40:00" 19 | "2019-01-17",2794489,231,"CHARGE",468.00,NULL,"","","USD",468.00,39.00,66852410,"2020-01-17 08:51:00","CC","2019-01-17 08:51:00" 20 | "2019-01-28",2794489,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","USD",0.00,20.00,66852410,"2020-01-28 14:40:00","CC","2019-01-28 14:40:00" 21 | 
"2019-01-14",2793704,231,"CHARGE",624.00,NULL,"","","CAD",460.64,39.00,66569204,"2020-01-14 15:16:00","CC","2019-01-14 15:16:00" 22 | "2019-01-15",2793906,239,"CHARGE",1428.00,NULL,"","","USD",1428.00,119.00,66641482,"2020-01-15 04:57:00","CC","2019-01-15 04:57:00" 23 | "2019-01-15",2793906,239,"CC_CHARGE_FAILED",0.00,239,"","This transaction has been declined. Please check card details and try again, or contact your bank for assistance","USD",0.00,NULL,NULL,NULL,"CC","2019-01-15 04:56:00" 24 | "2019-01-03",2794458,232,"RECURRING",291.01,232,"","","USD",291.01,NULL,65496398,"2020-01-03 16:20:00","CC","2019-01-03 16:20:00" 25 | "2019-01-03",2794458,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","USD",0.00,24.00,65496398,"2020-01-03 16:20:00","CC","2019-01-03 16:20:00" 26 | "2019-01-02",2794458,231,"CHARGE",421.20,NULL,"","","USD",421.20,35.00,65496398,"2020-01-02 15:46:00","CC","2019-01-02 15:46:00" 27 | "2019-07-22",2794064,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","AUD",0.00,19.00,66092630,"2020-07-22 07:33:00","CC","2019-07-22 07:33:00" 28 | "2019-01-08",2794064,231,"CHARGE",624.00,NULL,"","","AUD",436.18,37.00,66092630,"2020-01-08 22:31:00","CC","2019-01-08 22:31:00" 29 | "2019-07-22",2794064,232,"RECURRING",660.96,232,"","","AUD",455.81,NULL,66092630,"2020-07-22 07:33:00","CC","2019-07-22 07:33:00" 30 | "2019-01-19",2794278,199,"CANCEL_ON_RENEWAL",0.00,199,"","","USD",0.00,NULL,65670660,"2019-02-04 10:17:00","CC","2019-01-19 14:45:00" 31 | "2019-02-04",2794278,199,"CANCELLATION",0.00,199,"","","USD",0.00,-48.00,65670660,"2019-02-04 10:17:00","CC","2019-02-04 10:33:00" 32 | "2019-01-04",2794278,199,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,65670660,"2019-02-04 10:17:00","CC","2019-01-04 10:17:00" 33 | "2019-01-21",2793789,231,"CHARGE",468.00,NULL,"","","USD",468.00,39.00,67146528,"2020-01-21 06:06:00","CC","2019-01-21 06:06:00" 34 | "2019-01-18",2793567,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","CAD",0.00,20.00,66577666,"2020-01-18 15:09:00","CC","2019-01-18 15:09:00" 35 | "2019-03-20",2793567,264,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","CAD",0.00,-10.00,66577666,"2021-03-20 20:38:00","CC","2019-03-20 20:38:00" 36 | "2019-01-18",2793567,232,"RECURRING",336.48,232,"","","CAD",248.73,NULL,66577666,"2020-01-18 15:09:00","CC","2019-01-18 15:09:00" 37 | "2019-01-18",2793567,232,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-03-18 07:00:00","",NULL 38 | "2019-03-20",2793567,264,"RECURRING",797.16,264,"","","CAD",585.82,NULL,66577666,"2021-03-20 20:38:00","CC","2019-03-20 20:38:00" 39 | "2019-01-14",2793567,231,"CHARGE",624.00,NULL,"","","CAD",460.64,39.00,66577666,"2020-01-14 16:33:00","CC","2019-01-14 16:33:00" 40 | "2019-03-20",2793567,264,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2021-05-20 07:00:00","",NULL 41 | "2019-01-11",2793828,230,"CHARGE",276.00,NULL,"","","USD",276.00,23.00,66303986,"2020-01-11 03:35:00","CC","2019-01-11 03:35:00" 42 | "2019-01-15",2794328,198,"CHARGE",25.00,NULL,"","","GBP",31.48,32.00,66658322,"2019-02-15 12:38:00","CC","2019-01-15 12:38:00" 43 | "2019-04-30",2794328,202,"REFUND",-51.00,202,"","","GBP",-64.38,NULL,66658322,"2019-05-28 09:13:00","CC","2019-04-30 10:08:00" 44 | "2019-01-30",2794328,202,"CONTRACT_CHANGE",0.00,198,"SUCCESS","","GBP",0.00,35.00,66658322,"2019-02-28 10:13:00","CC","2019-01-30 10:13:00" 45 | "2019-04-30",2794328,202,"CANCEL_ON_RENEWAL",0.00,202,"","","GBP",0.00,NULL,66658322,"2019-05-28 09:13:00","CC","2019-04-30 09:49:00" 46 | 
"2019-04-28",2794328,202,"RECURRING",51.00,202,"","","GBP",64.38,-1.00,66658322,"2019-05-28 09:13:00","CC","2019-04-28 11:17:00" 47 | "2019-02-28",2794328,202,"RECURRING",51.00,202,"","","GBP",66.56,1.00,66658322,"2019-03-28 09:13:00","CC","2019-02-28 10:37:00" 48 | "2019-01-30",2794328,202,"RECURRING",38.25,202,"","","GBP",49.28,NULL,66658322,"2019-02-28 10:13:00","CC","2019-01-30 10:13:00" 49 | "2019-04-30",2794328,202,"CANCELLATION",0.00,202,"","","GBP",0.00,-66.00,66658322,"2019-05-28 09:13:00","CC","2019-04-30 10:08:00" 50 | "2019-03-28",2794328,202,"RECURRING",51.00,202,"","","GBP",66.12,-1.00,66658322,"2019-04-28 09:13:00","CC","2019-03-28 09:24:00" 51 | "2019-01-17",2793924,264,"CONTRACT_CHANGE",0.00,263,"SUCCESS","","USD",0.00,17.00,66839172,"2021-01-17 13:06:00","CC","2019-01-17 13:06:00" 52 | "2019-01-17",2793924,263,"CHARGE",768.00,NULL,"","","USD",768.00,32.00,66839172,"2021-01-17 03:06:00","CC","2019-01-17 03:06:00" 53 | "2019-01-17",2793924,264,"RECURRING",408.00,264,"","","USD",408.00,NULL,66839172,"2021-01-17 13:06:00","CC","2019-01-17 13:06:00" 54 | "2019-06-29",2793924,268,"CONTRACT_CHANGE",0.00,264,"SUCCESS","","USD",0.00,50.00,66839172,"2021-06-29 10:08:00","CC","2019-06-29 10:08:00" 55 | "2019-06-29",2793924,268,"RECURRING",1470.48,268,"","","USD",1470.48,NULL,66839172,"2021-06-29 10:08:00","CC","2019-06-29 10:08:00" 56 | "2019-01-12",2794423,232,"CHARGE",708.00,NULL,"","","USD",708.00,59.00,66415970,"2020-01-12 13:22:00","CC","2019-01-12 13:22:00" 57 | "2019-02-28",2794463,818,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,70971926,"2019-03-28 21:07:00","CC","2019-02-28 22:07:00" 58 | "2019-04-03",2794463,200,"RECURRING",33.60,200,"","","USD",33.60,NULL,70971926,"2019-05-03 10:11:00","CC","2019-04-03 10:11:00" 59 | "2019-03-28",2794463,818,"RECURRING",48.00,818,"","","USD",48.00,0.00,70971926,"2019-04-28 21:07:00","CC","2019-03-28 21:23:00" 60 | "2019-05-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-06-03 10:11:00","CC","2019-05-03 10:34:00" 61 | "2019-07-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-08-03 10:11:00","CC","2019-07-03 10:01:00" 62 | "2019-06-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-07-03 10:11:00","CC","2019-06-03 10:33:00" 63 | "2019-04-03",2794463,200,"CONTRACT_CHANGE",0.00,818,"SUCCESS","","USD",0.00,24.00,70971926,"2019-05-03 10:11:00","CC","2019-04-03 10:11:00" 64 | "2019-01-09",2794383,232,"RECURRING",240.39,232,"","","GBP",300.52,NULL,65941602,"2020-01-09 09:04:00","CC","2019-01-09 09:04:00" 65 | "2019-01-30",2794383,236,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","GBP",0.00,69.00,65941602,"2020-01-30 10:29:00","CC","2019-01-30 10:29:00" 66 | "2019-01-30",2794383,236,"RECURRING",648.72,236,"","","GBP",835.92,NULL,65941602,"2020-01-30 10:29:00","CC","2019-01-30 10:29:00" 67 | "2019-01-07",2794383,231,"CHARGE",375.36,NULL,"","","GBP",468.67,40.00,65941602,"2020-01-07 14:01:00","CC","2019-01-07 14:01:00" 68 | "2019-01-09",2794383,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","GBP",0.00,25.00,65941602,"2020-01-09 09:04:00","CC","2019-01-09 09:04:00" 69 | "2019-07-12",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-12 16:03:00" 70 | "2019-07-07",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. 
Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-07 16:02:00" 71 | "2019-04-17",2794060,200,"PAYMENT_METHOD_UPDATED",0.00,200,"","","",0.00,NULL,66570782,NULL,"CC","2019-04-17 19:40:00" 72 | "2019-04-17",2794060,200,"PAYMENT_METHOD_UPDATED",0.00,200,"","","",0.00,NULL,66570782,NULL,"CC","2019-04-17 18:38:00" 73 | "2019-04-17",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-05-02 15:41:00","CC","2019-04-17 19:40:00" 74 | "2019-07-16",2794060,199,"RECURRING",48.00,199,"","","USD",48.00,NULL,66570782,"2019-08-16 20:15:00","CC","2019-07-16 20:15:00" 75 | "2019-02-02",2794060,200,"RECURRING",53.76,200,"","","USD",53.76,NULL,66570782,"2019-03-02 16:41:00","PAYPAL","2019-02-02 16:41:00" 76 | "2019-01-14",2794060,199,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,66570782,"2019-02-14 15:33:00","PAYPAL","2019-01-14 15:34:00" 77 | "2019-02-02",2794060,200,"CONTRACT_CHANGE",0.00,199,"SUCCESS","","USD",0.00,24.00,66570782,"2019-03-02 16:41:00","PAYPAL","2019-02-02 16:41:00" 78 | "2019-07-03",2794060,232,"CONTRACT_CHANGE",0.00,200,"FAILED","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","PAYPAL","2019-07-03 17:08:00" 79 | "2019-06-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-07-02 15:41:00","CC","2019-06-02 15:34:00" 80 | "2019-04-17",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","This transaction has been declined. Please try a different card or contact the credit card provider for assistance:[Do not Honour] (PV-05)","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","CC","2019-04-17 18:38:00" 81 | "2019-07-16",2794060,199,"CONTRACT_CHANGE",0.00,200,"SUCCESS","","USD",0.00,-24.00,66570782,"2019-08-16 20:15:00","CC","2019-07-16 20:15:00" 82 | "2019-07-16",2794060,200,"RECURRING",24.00,200,"","","USD",24.00,NULL,66570782,"2019-08-16 20:17:00","CC","2019-07-16 20:17:00" 83 | "2019-04-09",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","10417: Transaction cannot complete.","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","PAYPAL","2019-04-09 16:24:00" 84 | "2019-04-17",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","This transaction has been declined. Please try a different card or contact the credit card provider for assistance:[Do not Honour] (PV-05)","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","CC","2019-04-17 19:23:00" 85 | "2019-07-16",2794060,200,"CONTRACT_CHANGE",0.00,199,"SUCCESS","","USD",0.00,24.00,66570782,"2019-08-16 20:17:00","CC","2019-07-16 20:17:00" 86 | "2019-05-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-06-02 15:41:00","CC","2019-05-02 15:34:00" 87 | "2019-07-02",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. 
Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-02 15:03:00" 88 | "2019-04-02",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","10417: Transaction cannot complete.","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","PAYPAL","2019-04-02 15:24:00" 89 | "2019-03-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-04-02 15:41:00","PAYPAL","2019-03-02 16:34:00" 90 | "2019-01-10",2793508,200,"CHARGE",72.00,NULL,"","","USD",72.00,72.00,66280160,"2019-02-10 22:16:00","PAYPAL","2019-01-10 22:16:00" 91 | "2019-02-10",2793508,200,"CANCELLATION",0.00,200,"","","USD",0.00,-72.00,66280160,"2019-02-10 22:16:00","PAYPAL","2019-02-10 22:33:00" 92 | "2019-02-06",2793508,200,"CANCEL_ON_RENEWAL",0.00,200,"","","USD",0.00,NULL,66280160,"2019-02-10 22:16:00","PAYPAL","2019-02-06 11:07:00" 93 | "2019-06-21",2793730,234,"RECURRING",420.00,234,"","","USD",420.00,NULL,66630878,"2020-06-21 10:03:00","CC","2019-06-21 10:03:00" 94 | "2019-06-21",2793730,234,"CONTRACT_CHANGE",0.00,230,"SUCCESS","","USD",0.00,24.00,66630878,"2020-06-21 10:03:00","CC","2019-06-21 10:03:00" 95 | "2019-01-15",2793730,230,"CHARGE",300.00,NULL,"","","USD",300.00,25.00,66630878,"2020-01-15 01:34:00","CC","2019-01-15 01:35:00" 96 | -------------------------------------------------------------------------------- /datalearn19intro/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Shay Palachy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /datalearn19intro/README.rst: -------------------------------------------------------------------------------- 1 | datalearn19intro 2 | ################ 3 | |PyPI-Status| |PyPI-Versions| |LICENCE| 4 | 5 | Helper code for DataLearn 2019 ML Intro Workshop. 6 | 7 | .. code-block:: python 8 | 9 | from datalearn19intro import get_accounts 10 | accounts = get_accounts() 11 | 12 | .. contents:: 13 | 14 | .. section-numbering:: 15 | 16 | Installation 17 | ============ 18 | 19 | Install ``datalearn19intro`` with: 20 | 21 | .. code-block:: bash 22 | 23 | pip install datalearn19intro 24 | 25 | 26 | Credits 27 | ======= 28 | Created by Shay Palachy (shay.palachy@gmail.com). 29 | 30 | .. alternative: 31 | .. https://badge.fury.io/py/yellowbrick.svg 32 | 33 | .. 
|PyPI-Status| image:: https://img.shields.io/pypi/v/datalearn19intro.svg 34 | :target: https://pypi.org/project/datalearn19intro 35 | 36 | .. |PyPI-Versions| image:: https://img.shields.io/pypi/pyversions/datalearn19intro.svg 37 | :target: https://pypi.org/project/datalearn19intro 38 | 39 | .. |LICENCE| image:: https://img.shields.io/badge/License-MIT-yellow.svg 40 | :target: https://pypi.python.org/pypi/datalearn19intro 41 | -------------------------------------------------------------------------------- /datalearn19intro/datalearn19intro/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataloader import ( # noqa: F401 2 | get_accounts, 3 | get_users, 4 | get_events, 5 | get_subscriptions, 6 | get_processed_intro_dataset, 7 | ) 8 | -------------------------------------------------------------------------------- /datalearn19intro/datalearn19intro/dataloader.py: -------------------------------------------------------------------------------- 1 | """Data loading code for DataLearn prep night workshop.""" 2 | 3 | import pip 4 | import subprocess 5 | 6 | import pandas as pd 7 | 8 | try: 9 | import google.colab # noqa: F401 10 | 11 | IN_COLAB = True 12 | except ImportError: 13 | IN_COLAB = False 14 | 15 | 16 | def in_notebook(): 17 | try: 18 | from IPython import get_ipython 19 | 20 | if 'IPKernelApp' not in get_ipython().config: # pragma: no cover 21 | return False 22 | except ImportError: 23 | return False 24 | return True 25 | 26 | 27 | def pipinstall(package): 28 | if hasattr(pip, 'main'): 29 | pip.main(['install', package]) 30 | else: 31 | pip._internal.main(['install', package]) 32 | 33 | 34 | GDRIVE = None 35 | 36 | 37 | def gdrive_authenticate(): 38 | global GDRIVE 39 | if GDRIVE is not None: 40 | return 41 | print('Installing PyDrive...') 42 | subprocess.run(["pip", "install", "-U", "-q", "PyDrive"]) 43 | # pipinstall('PyDrive') 44 | # !pip install -U -q PyDrive 45 | from pydrive.auth import GoogleAuth 46 | from pydrive.drive import GoogleDrive 47 | from google.colab import auth 48 | from oauth2client.client import GoogleCredentials 49 | 50 | # Authenticate and create the PyDrive client.GDRIVE_AUTHENICATED# This only 51 | # needs to be done once per notebook. 
52 | print('Authenticating with Google Drive...') 53 | auth.authenticate_user() 54 | gauth = GoogleAuth() 55 | gauth.credentials = GoogleCredentials.get_application_default() 56 | GDRIVE = GoogleDrive(gauth) 57 | 58 | 59 | def _get_file(fname, id): 60 | if IN_COLAB: 61 | gdrive_authenticate() 62 | # you can see it with "get sherable link" 63 | print("Downloading {} from Google Drive...".format(fname)) 64 | downloaded = GDRIVE.CreateFile({'id': id}) 65 | downloaded.GetContentFile(fname) 66 | print("Done.") 67 | return pd.read_csv(fname) 68 | else: 69 | return pd.read_csv('data/{}'.format(fname)) 70 | 71 | 72 | def get_accounts(): 73 | return _get_file('accounts.csv', '1SFFGL_FIq3-l6CP9MTe9ueuLRMz_tvrw') 74 | 75 | 76 | def get_users(): 77 | return _get_file('users.csv', '1fG6ebyTaWWOVRFHw9svNjgJLYdUcu5th') 78 | 79 | 80 | def get_events(): 81 | return _get_file( 82 | 'Dynamic events table.csv', '1Gv0Z_IJ1kBwuUnPDkpgFM8mK1dGeTNi4') 83 | 84 | 85 | def get_subscriptions(): 86 | return _get_file( 87 | 'Dynamic subscription table.csv', '1qC0VOpUkZo4O4lggzp45YcNxC7NXY4VV') 88 | 89 | 90 | def get_processed_intro_dataset(): 91 | return _get_file( 92 | 'monday_datalearn.csv', '1W2D192QF_LIixPws1mj57C6OBNSxILFI') 93 | -------------------------------------------------------------------------------- /datalearn19intro/mit_license_badge.svg: -------------------------------------------------------------------------------- 1 | licenselicenseMITMIT -------------------------------------------------------------------------------- /datalearn19intro/setup.py: -------------------------------------------------------------------------------- 1 | """Setup for the datalearn19intro package.""" 2 | 3 | # !/usr/bin/env python 4 | # -*- coding: utf-8 -*- 5 | 6 | import setuptools 7 | 8 | 9 | INSTALL_REQUIRES = [ 10 | 'numpy', 11 | 'pandas' 12 | ] 13 | 14 | with open('README.rst') as f: 15 | README = f.read() 16 | 17 | setuptools.setup( 18 | author="Shay Palachy", 19 | author_email="shay.palachy@gmail.com", 20 | name='datalearn19intro', 21 | license="MIT", 22 | description='Helper code for DataLearn 2019 ML Intro Workshop.', 23 | version='v0.0.5', 24 | # cmdclass=versioneer.get_cmdclass(), 25 | long_description=README, 26 | url='https://github.com/DataHackIL/DataLearn-ML-Intro-2019', 27 | packages=setuptools.find_packages(), 28 | include_package_data=True, 29 | python_requires=">=3.5", 30 | install_requires=INSTALL_REQUIRES, 31 | # extras_require={ 32 | # 'test': TEST_REQUIRES + INSTALL_REQUIRES, 33 | # }, 34 | classifiers=[ 35 | # Trove classifiers 36 | # (https://pypi.python.org/pypi?%3Aaction=list_classifiers) 37 | 'Development Status :: 4 - Beta', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Programming Language :: Python', 40 | 'Programming Language :: Python :: 3.5', 41 | 'Programming Language :: Python :: 3.6', 42 | 'Programming Language :: Python :: 3.7', 43 | 'Topic :: Software Development :: Libraries', 44 | 'Topic :: Software Development :: Libraries :: Python Modules', 45 | 'Intended Audience :: Developers', 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /part_1.introducing_jupyter.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.5"},"colab":{"name":"part_1.introducing_jupyter.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"markdown","metadata":{"id":"dXQRvMNhxdND","colab_type":"text"},"source":["# Part 1: Introducing the Jupyter Notebook"]},{"cell_type":"markdown","metadata":{"id":"Vdp7sdXHxdNF","colab_type":"text"},"source":["## What is the Jupyter Notebook?"]},{"cell_type":"markdown","metadata":{"id":"XxajIhHfxdNG","colab_type":"text"},"source":["The Jupyter Notebook is an interactive computing environment that enables users to author notebook documents that include:\n","\n","* Live code\n","* Interactive widgets\n","* Plots\n","* Narrative text\n","* Equations\n","* Images\n","* Video\n","\n","These documents provide a complete and self-contained record of a computation that can be converted to various formats and shared with others using email, Dropbox, version control systems (like git/GitHub) or nbviewer.jupyter.org."]},{"cell_type":"markdown","metadata":{"id":"FraUpL-JxdNH","colab_type":"text"},"source":["## Components\n","The Jupyter Notebook combines three components:\n","\n","* **The notebook web application**: An interactive web application for writing and running code interactively and authoring notebook documents.\n","* **Kernels**: Separate processes started by the notebook web application that runs users' code in a given language and returns output back to the notebook web application. The kernel also handles things like computations for interactive widgets, tab completion and introspection.\n","* **Notebook documents**: Self-contained documents that contain a representation of all content visible in the notebook web application, including inputs and outputs of the computations, narrative text, equations, images, and rich media representations of objects. Each notebook document has its own kernel.\n","\n","This enables the user to both **edit** and **run** code in the browser."]},{"cell_type":"markdown","metadata":{"id":"8cfyGFfPxdNI","colab_type":"text"},"source":["## Kernels\n","Through Jupyter's kernel and messaging architecture, the Notebook allows code to be run in a range of different programming languages. For each notebook document that a user opens, the web application starts a kernel that runs the code for that notebook. Each kernel is capable of running code in a single programming language and there are kernels available in the following languages:\n","\n","* Python (https://github.com/ipython/ipython)\n","* Julia (https://github.com/JuliaLang/IJulia.jl)\n","* R (https://github.com/takluyver/IRkernel)\n","* Ruby (https://github.com/minrk/iruby)\n","* Haskell (https://github.com/gibiansky/IHaskell)\n","* Scala (https://github.com/Bridgewater/scala-notebook)\n","* node.js (https://gist.github.com/Carreau/4279371)\n","* Go (https://github.com/takluyver/igo)\n","\n","The default kernel runs Python code. The notebook provides a simple way for users to pick which of these kernels is used for a given notebook."]},{"cell_type":"markdown","metadata":{"id":"D5Q2fTa5xdNJ","colab_type":"text"},"source":["## Notebook cells"]},{"cell_type":"markdown","metadata":{"id":"s-9D3V3hxdNK","colab_type":"text"},"source":["A Jupyter notebook is made up of consecutive cells. 
There are two basic types of cells:\n","* Markdown cells\n","* Code cells"]},{"cell_type":"markdown","metadata":{"id":"xF9L_V12xdNK","colab_type":"text"},"source":["You can turn a cell into a markdown cell by pressing `m` when it is selected (but the cursor is **not** inside it). To turn it to a code cell use `y`."]},{"cell_type":"markdown","metadata":{"id":"2a6sSfdkxdNL","colab_type":"text"},"source":["### More commands:\n","* 'Escape' outside of a cell using `Esc`.\n","* Edit the current cell using `Enter`.\n","* Execute a cell using `shift`+`Enter`.\n","\n","In Jupyter Notebooks (but not on *colab*):\n","* Add a cell above the current cell using `a`.\n","* Add a cell below the current cell using `b`.\n","* Copy the current cell using `c`.\n","* Delete the current cell using `dd`."]},{"cell_type":"markdown","metadata":{"id":"FIx3pfbtxdNL","colab_type":"text"},"source":["## Markdown Cells\n","\n","This cell is a markdown cell. This means you can use markdown to write *italic text* by surrounding your text with *asterisks* or _underscores_.\n","Strong emphasis, aka bold, with **asterisks** or __underscores__.\n","Combined emphasis with **asterisks and _underscores_**.\n","Strikethrough uses two tildes. ~~Scratch this.~~\n","\n","Here be headers:\n","# H1\n","## H2\n","### etc...\n","\n","You can create bulleted lists using\n","* Asterisks\n","- Or minuses\n","+ Or pluses\n","\n","### Ordered lists\n","1. First ordered list item\n","2. Another item\n"," * Unordered sub-list. \n","1. Actual numbers don't matter, just that it's a number\n"," 1. Ordered sub-list.\n"," \n"," An un-numbered indented paragraph.\n","4. And another item.\n","\n","[And of course add links](https://www.google.com)\n","\n","You can find a nice full markdown cheatsheet here: https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet"]},{"cell_type":"markdown","metadata":{"id":"XIcvwtmyxdNM","colab_type":"text"},"source":["

**NOTICE**\n","\n","HTML tags can also be used inside markdown cells.
\n",""]},{"cell_type":"markdown","metadata":{"id":"nGFegaYLxdNN","colab_type":"text"},"source":["## Code Cells"]},{"cell_type":"code","metadata":{"id":"c3DJV5VrxdNN","colab_type":"code","colab":{}},"source":["# this is a code cell\n","a = 5"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"11R4NJYYxdNR","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"9c4c7195-ac75-4e6e-9649-b7c622decabe","executionInfo":{"status":"ok","timestamp":1565864335301,"user_tz":-180,"elapsed":531,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a line containing only an expression will print it\n","a"],"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":4}]},{"cell_type":"code","metadata":{"id":"Z0EcqEDvxdNU","colab_type":"code","colab":{}},"source":["# unless it ends with a semicolon\n","a;"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"2G5MWI9MxdNW","colab_type":"code","colab":{}},"source":["# you can also define functions\n","def foo(a, b):\n"," \"\"\"Foo documentation.\"\"\"\n"," return a*5 + 2/b"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"0bhzFsNFxdNY","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"5efcad27-e813-4c75-c1b3-3f6fbbb3782f","executionInfo":{"status":"ok","timestamp":1565864343265,"user_tz":-180,"elapsed":745,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# and then use them!\n","foo(5,8)"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["25.25"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"nbpbD7pxxdNb","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"a3da4249-5f4c-4990-d70a-837ad0b2f406","executionInfo":{"status":"ok","timestamp":1565864346647,"user_tz":-180,"elapsed":535,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# you can also use IPython magic functions!\n","%time numbers = [x for x in range(1000000)]"],"execution_count":8,"outputs":[{"output_type":"stream","text":["CPU times: user 47.6 ms, sys: 45.9 ms, total: 93.5 ms\n","Wall time: 102 ms\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"ppBPkqUFxdNd","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"1e3ba24e-5252-4abd-85e4-965918cd6b42","executionInfo":{"status":"ok","timestamp":1565864352232,"user_tz":-180,"elapsed":3315,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["%timeit numbers = [x for x in range(100000)]"],"execution_count":9,"outputs":[{"output_type":"stream","text":["100 loops, best of 3: 6.29 ms per loop\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"QCf-fiySxdNf","colab_type":"text"},"source":["It is also easy to display plots inside a Jupyter notebook. 
We'll see this later."]},{"cell_type":"code","metadata":{"id":"uCjbABubxdNg","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /part_2.numpy.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.5"},"colab":{"name":"part_2.numpy.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":["RwPgHgFP4H1l","CKd7UT3F4H20","Z4ZVTQa84H3p","6sPYGkYt4H3z","0onFRxLN4H4B","NU_ZudOj4H4e","8qWuhJ7E4H4v","WFkuwnxB4H5z","CNZcnWVi4H6R","R4QtrGfi4H6h","mgBbOZ2H4H6k","dPDTaPLn4H6m","hdcCMF5B4H6u"]}},"cells":[{"cell_type":"markdown","metadata":{"id":"Yai-UYMg4H1a","colab_type":"text"},"source":["# Numpy - multidimensional data arrays"]},{"cell_type":"markdown","metadata":{"id":"zhlhZMYL4H1f","colab_type":"text"},"source":["Based on J.R. Johansson's notebook (jrjohansson at gmail.com)"]},{"cell_type":"markdown","metadata":{"id":"RwPgHgFP4H1l","colab_type":"text"},"source":["## Introduction"]},{"cell_type":"markdown","metadata":{"id":"Ci0r993d4H1p","colab_type":"text"},"source":["The `numpy` package (module) is used in almost all numerical computation using Python. It is a package that provide high-performance vector, matrix and higher-dimensional data structures for Python. It is implemented in C and Fortran so when calculations are vectorized (formulated with vectors and matrices), performance is very good. \n","\n","To use `numpy` you need to import the module, using for example:"]},{"cell_type":"code","metadata":{"id":"OvvZa0Z44H1q","colab_type":"code","colab":{}},"source":["from numpy import *"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UAaDil2O4H1u","colab_type":"text"},"source":["In the `numpy` package the terminology used for vectors, matrices and higher-dimensional data sets is *array*. 
\n","\n"]},{"cell_type":"markdown","metadata":{"id":"aLAmXzIW4H2o","colab_type":"text"},"source":["## Creating `numpy` arrays"]},{"cell_type":"markdown","metadata":{"id":"f0RWAhOb4H2p","colab_type":"text"},"source":["There are a number of ways to initialize new numpy arrays, for example from\n","\n","* a Python list or tuples\n","* using functions that are dedicated to generating numpy arrays, such as `arange`, `linspace`, etc.\n","* reading data from files"]},{"cell_type":"markdown","metadata":{"id":"CKd7UT3F4H20","colab_type":"text"},"source":["### From lists"]},{"cell_type":"markdown","metadata":{"id":"ivCyIJYi4H23","colab_type":"text"},"source":["For example, to create new vector and matrix arrays from Python lists we can use the `numpy.array` function."]},{"cell_type":"code","metadata":{"id":"cfeoOXX94H24","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"85bca577-5bfa-472e-db58-f9ad3a139b50","executionInfo":{"status":"ok","timestamp":1565877235016,"user_tz":-180,"elapsed":1031,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a vector: the argument to the array function is a Python list\n","v = array([1,2,3,4])\n","\n","v"],"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1, 2, 3, 4])"]},"metadata":{"tags":[]},"execution_count":5}]},{"cell_type":"code","metadata":{"id":"ks3LWXpq4H28","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":50},"outputId":"f47f7217-92f4-49c1-ee89-29b1aacdc600","executionInfo":{"status":"ok","timestamp":1565877235464,"user_tz":-180,"elapsed":1369,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a matrix: the argument to the array function is a nested Python list\n","M = array([[1, 2], [3, 4]])\n","\n","M"],"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1, 2],\n"," [3, 4]])"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"uub0dXTZ4H3A","colab_type":"text"},"source":["The `v` and `M` objects are both of the type `ndarray` that the `numpy` module provides."]},{"cell_type":"code","metadata":{"id":"zkxf0n4a4H3A","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"3261d26f-670c-41d5-9e26-cb6f510a8254","executionInfo":{"status":"ok","timestamp":1565877236184,"user_tz":-180,"elapsed":540,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["type(v), type(M)"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(numpy.ndarray, numpy.ndarray)"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"markdown","metadata":{"id":"2YbxEAgV4H3C","colab_type":"text"},"source":["The difference between the `v` and `M` arrays is only their shapes. 
We can get information about the shape of an array by using the `ndarray.shape` property."]},{"cell_type":"code","metadata":{"id":"Fz7mVTEL4H3E","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"82cafd58-eb8f-4184-a7bf-768ff3a23889","executionInfo":{"status":"ok","timestamp":1565877289354,"user_tz":-180,"elapsed":1019,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["v.shape"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(4,)"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"id":"QeGbgpLb4H3H","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"3c6874ef-e927-403a-b5c2-3a094ca7156b","executionInfo":{"status":"ok","timestamp":1565877295892,"user_tz":-180,"elapsed":936,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.shape"],"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(2, 2)"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"markdown","metadata":{"id":"BmW4mybb4H3K","colab_type":"text"},"source":["The number of elements in the array is available through the `ndarray.size` property:"]},{"cell_type":"code","metadata":{"id":"pYoUPA_A4H3N","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"73705664-b047-46ee-dacb-dd5a53da8a2d","executionInfo":{"status":"ok","timestamp":1565877302429,"user_tz":-180,"elapsed":1236,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.size"],"execution_count":10,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":10}]},{"cell_type":"markdown","metadata":{"id":"lbmKpqa64H3S","colab_type":"text"},"source":["Equivalently, we could use the function `numpy.shape` and `numpy.size`"]},{"cell_type":"code","metadata":{"id":"va2u6UBS4H3T","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"c7626ae7-7abd-4857-eb6a-5f092321fa7e","executionInfo":{"status":"ok","timestamp":1565877305074,"user_tz":-180,"elapsed":819,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["shape(M)"],"execution_count":11,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(2, 2)"]},"metadata":{"tags":[]},"execution_count":11}]},{"cell_type":"code","metadata":{"id":"6Nt1VX094H3a","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e1d574bc-e5fd-42f7-8a12-3d857957cc11","executionInfo":{"status":"ok","timestamp":1565877306018,"user_tz":-180,"elapsed":525,"user":{"displayName":"Shay 
Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["size(M)"],"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":12}]},{"cell_type":"markdown","metadata":{"id":"8uURf4TK4H3c","colab_type":"text"},"source":["So far the `numpy.ndarray` looks awefully much like a Python list (or nested list). Why not simply use Python lists for computations instead of creating a new array type? \n","\n","There are several reasons:\n","\n","* Python lists are very general. They can contain any kind of object. They are dynamically typed. They do not support mathematical functions such as matrix and dot multiplications, etc. Implementing such functions for Python lists would not be very efficient because of the dynamic typing.\n","* Numpy arrays are **statically typed** and **homogeneous**. The type of the elements is determined when the array is created.\n","* Numpy arrays are memory efficient.\n","* Because of the static typing, fast implementation of mathematical functions such as multiplication and addition of `numpy` arrays can be implemented in a compiled language (C and Fortran is used).\n","\n","Using the `dtype` (data type) property of an `ndarray`, we can see what type the data of an array has:"]},{"cell_type":"code","metadata":{"id":"H1ny0dwq4H3d","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e5948335-7540-4b99-a3e7-354f4de59da8","executionInfo":{"status":"ok","timestamp":1565877336197,"user_tz":-180,"elapsed":556,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.dtype"],"execution_count":13,"outputs":[{"output_type":"execute_result","data":{"text/plain":["dtype('int64')"]},"metadata":{"tags":[]},"execution_count":13}]},{"cell_type":"markdown","metadata":{"id":"mOQjfIdD4H3g","colab_type":"text"},"source":["We get an error if we try to assign a value of the wrong type to an element in a numpy array:"]},{"cell_type":"code","metadata":{"id":"HQ0ySny04H3g","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":162},"outputId":"80cf5e53-3fcc-444c-f08d-fc35e8bf7b88","executionInfo":{"status":"error","timestamp":1565877346786,"user_tz":-180,"elapsed":561,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M[0,0] = \"hello\""],"execution_count":14,"outputs":[{"output_type":"error","ename":"ValueError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mM\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'hello'"]}]},{"cell_type":"markdown","metadata":{"id":"NxoE_aas4H3i","colab_type":"text"},"source":["If we want, we can explicitly define the type of the array data when we 
create it, using the `dtype` keyword argument: "]},{"cell_type":"code","metadata":{"id":"tcoqshv34H3j","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":50},"outputId":"53d4f229-57ac-493c-ff77-acc9f1b30cbc","executionInfo":{"status":"ok","timestamp":1565877353544,"user_tz":-180,"elapsed":800,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M = array([[1, 2], [3, 4]], dtype=complex)\n","\n","M"],"execution_count":15,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1.+0.j, 2.+0.j],\n"," [3.+0.j, 4.+0.j]])"]},"metadata":{"tags":[]},"execution_count":15}]},{"cell_type":"markdown","metadata":{"id":"71v9E4m84H3o","colab_type":"text"},"source":["Common data types that can be used with `dtype` are: `int`, `float`, `complex`, `bool`, `object`, etc.\n","\n","We can also explicitly define the bit size of the data types, for example: `int64`, `int16`, `float128`, `complex128`."]},{"cell_type":"markdown","metadata":{"id":"Z4ZVTQa84H3p","colab_type":"text"},"source":["### Using array-generating functions"]},{"cell_type":"markdown","metadata":{"id":"aYFas-Rn4H3p","colab_type":"text"},"source":["For larger arrays it is inpractical to initialize the data manually, using explicit python lists. Instead we can use one of the many functions in `numpy` that generate arrays of different forms. Some of the more common are:"]},{"cell_type":"code","metadata":{"id":"irdANGOL4H3q","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"549095e2-3930-456e-fd1d-f3a7ebc9e946","executionInfo":{"status":"ok","timestamp":1565877387626,"user_tz":-180,"elapsed":951,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# create a range\n","\n","x = arange(0, 10, 1) # arguments: start, stop, step\n","\n","x"],"execution_count":16,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"]},"metadata":{"tags":[]},"execution_count":16}]},{"cell_type":"code","metadata":{"id":"Sr-MPcyK4H3s","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":101},"outputId":"bdd6d5cc-d486-4923-e3cf-7373591297da","executionInfo":{"status":"ok","timestamp":1565877387628,"user_tz":-180,"elapsed":552,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["x = arange(-1, 1, 0.1)\n","\n","x"],"execution_count":17,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([-1.00000000e+00, -9.00000000e-01, -8.00000000e-01, -7.00000000e-01,\n"," -6.00000000e-01, -5.00000000e-01, -4.00000000e-01, -3.00000000e-01,\n"," -2.00000000e-01, -1.00000000e-01, -2.22044605e-16, 1.00000000e-01,\n"," 2.00000000e-01, 3.00000000e-01, 4.00000000e-01, 5.00000000e-01,\n"," 6.00000000e-01, 7.00000000e-01, 8.00000000e-01, 9.00000000e-01])"]},"metadata":{"tags":[]},"execution_count":17}]},{"cell_type":"code","metadata":{"scrolled":true,"id":"xXZsubZo4H3v","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":101},"outputId":"421b1c6d-e865-437b-f68a-a6a5e15f46e2","executionInfo":{"status":"ok","timestamp":1565877391062,"user_tz":-180,"elapsed":860,"user":{"displayName":"Shay 
Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# using linspace, both end points ARE included\n","linspace(0, 10, 25)"],"execution_count":18,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0. , 0.41666667, 0.83333333, 1.25 , 1.66666667,\n"," 2.08333333, 2.5 , 2.91666667, 3.33333333, 3.75 ,\n"," 4.16666667, 4.58333333, 5. , 5.41666667, 5.83333333,\n"," 6.25 , 6.66666667, 7.08333333, 7.5 , 7.91666667,\n"," 8.33333333, 8.75 , 9.16666667, 9.58333333, 10. ])"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"6sPYGkYt4H3z","colab_type":"text"},"source":["#### mgrid"]},{"cell_type":"code","metadata":{"id":"LTtylXc24H34","colab_type":"code","colab":{}},"source":["x, y = mgrid[0:5, 0:5] # similar to meshgrid in MATLAB"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"CSsUMPEr4H37","colab_type":"code","colab":{},"outputId":"469c0bf4-a5f2-4a0c-8638-7ac71c62539c"},"source":["x"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[0, 0, 0, 0, 0],\n"," [1, 1, 1, 1, 1],\n"," [2, 2, 2, 2, 2],\n"," [3, 3, 3, 3, 3],\n"," [4, 4, 4, 4, 4]])"]},"metadata":{"tags":[]},"execution_count":19}]},{"cell_type":"code","metadata":{"id":"KWaBRngd4H3-","colab_type":"code","colab":{},"outputId":"71ba3bbd-ddff-4392-a4c0-d3abc5838b09"},"source":["y"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4]])"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"0onFRxLN4H4B","colab_type":"text"},"source":["#### random data"]},{"cell_type":"code","metadata":{"id":"KbOBN6R34H4D","colab_type":"code","colab":{}},"source":["from numpy import random"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ek8AG7Br4H4Q","colab_type":"code","colab":{},"outputId":"9f97fd7e-e017-4fbd-8ef8-ee318ca7916d"},"source":["# uniform random numbers in [0,1]\n","random.rand(5,5)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.92932506, 0.19684255, 0.736434 , 0.18125714, 0.70905038],\n"," [ 0.18803573, 0.9312815 , 0.1284532 , 0.38138008, 0.36646481],\n"," [ 0.53700462, 0.02361381, 0.97760688, 0.73296701, 0.23042324],\n"," [ 0.9024635 , 0.20860922, 0.67729644, 0.68386687, 0.49385729],\n"," [ 0.95876515, 0.29341553, 0.37520629, 0.29194432, 0.64102804]])"]},"metadata":{"tags":[]},"execution_count":22}]},{"cell_type":"code","metadata":{"id":"3Qb6y0hv4H4a","colab_type":"code","colab":{},"outputId":"f8bff034-1d25-42d5-e1ac-d5b1a9c95c8c"},"source":["# standard normal distributed random numbers\n","random.randn(5,5)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.117907 , -1.57016164, 0.78256246, 1.45386709, 0.54744436],\n"," [ 2.30356897, -0.28352021, -0.9087325 , 1.2285279 , -1.00760167],\n"," [ 0.72216801, 0.77507299, -0.37793178, -0.31852241, 0.84493629],\n"," [-0.10682252, 1.15930142, -0.47291444, -0.69496967, -0.58912034],\n"," [ 0.34513487, -0.92389516, -0.216978 , 0.42153272, 0.86650101]])"]},"metadata":{"tags":[]},"execution_count":23}]},{"cell_type":"markdown","metadata":{"id":"NU_ZudOj4H4e","colab_type":"text"},"source":["#### zeros and 
ones"]},{"cell_type":"code","metadata":{"id":"MC5Qqrih4H4g","colab_type":"code","colab":{},"outputId":"2da7218a-809e-4151-a584-1c10a7311e8d"},"source":["zeros((3,3))"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0., 0., 0.],\n"," [ 0., 0., 0.],\n"," [ 0., 0., 0.]])"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"code","metadata":{"id":"q8-TzdJ34H4r","colab_type":"code","colab":{},"outputId":"dd3cf0f6-d086-491b-8ac0-acd1ead5ce42"},"source":["ones((3,3))"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1., 1., 1.],\n"," [ 1., 1., 1.],\n"," [ 1., 1., 1.]])"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"markdown","metadata":{"id":"8qWuhJ7E4H4v","colab_type":"text"},"source":["## More properties of numpy arrays"]},{"cell_type":"code","metadata":{"id":"rcREVjoG4H4w","colab_type":"code","colab":{},"outputId":"7df199df-f2d1-4a27-9287-05b427e50cc7"},"source":["M.itemsize # bytes per element"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["8"]},"metadata":{"tags":[]},"execution_count":38}]},{"cell_type":"code","metadata":{"id":"o4LXNp644H40","colab_type":"code","colab":{},"outputId":"22b37e4c-d92b-4ed9-d901-5246282f41a7"},"source":["M.nbytes # number of bytes"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["72"]},"metadata":{"tags":[]},"execution_count":39}]},{"cell_type":"code","metadata":{"id":"WEPR97Rx4H49","colab_type":"code","colab":{},"outputId":"09bd572f-e6e6-45af-d7b1-2f519f0b5776"},"source":["M.ndim # number of dimensions"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["2"]},"metadata":{"tags":[]},"execution_count":40}]},{"cell_type":"markdown","metadata":{"id":"ZE5H_AbI4H4_","colab_type":"text"},"source":["## Manipulating arrays"]},{"cell_type":"markdown","metadata":{"id":"Gn5WEg754H5A","colab_type":"text"},"source":["### Indexing"]},{"cell_type":"markdown","metadata":{"id":"xZxt-9h44H5A","colab_type":"text"},"source":["We can index elements in an array using square brackets and indices:"]},{"cell_type":"code","metadata":{"id":"Ry0do-4K4H5B","colab_type":"code","colab":{},"outputId":"809671ed-3317-433f-dbbf-a82c584d7d0e"},"source":["# v is a vector, and has only one dimension, taking one index\n","v[0]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1"]},"metadata":{"tags":[]},"execution_count":41}]},{"cell_type":"code","metadata":{"id":"SeXl-uSv4H5D","colab_type":"code","colab":{},"outputId":"86699f4f-7c80-4e0b-d91f-320f16efecea"},"source":["# M is a matrix, or a 2 dimensional array, taking two indices \n","M[1,1]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.47913739949636192"]},"metadata":{"tags":[]},"execution_count":42}]},{"cell_type":"markdown","metadata":{"id":"UIS_PIPQ4H5F","colab_type":"text"},"source":["If we omit an index of a multidimensional array it returns the whole row (or, in general, a N-1 dimensional array) "]},{"cell_type":"code","metadata":{"id":"wmTMc0ia4H5G","colab_type":"code","colab":{},"outputId":"09d6e405-e1d6-441e-ba9b-52218318032b"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.77872576, 0.40043577, 0.66254019],\n"," [ 0.60410063, 0.4791374 , 0.8237106 ],\n"," [ 0.96856318, 0.15459644, 
0.96082399]])"]},"metadata":{"tags":[]},"execution_count":43}]},{"cell_type":"code","metadata":{"id":"hrO_oej84H5N","colab_type":"code","colab":{},"outputId":"eba958b0-6259-481b-9bf7-6f06c9fd30db"},"source":["M[1]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.60410063, 0.4791374 , 0.8237106 ])"]},"metadata":{"tags":[]},"execution_count":44}]},{"cell_type":"markdown","metadata":{"id":"hyj1kRXP4H5P","colab_type":"text"},"source":["The same thing can be achieved with using `:` instead of an index: "]},{"cell_type":"code","metadata":{"id":"_RhaEcYM4H5Q","colab_type":"code","colab":{},"outputId":"79daddb0-ca06-40c4-c734-602f7971e5e6"},"source":["M[1,:] # row 1"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.60410063, 0.4791374 , 0.8237106 ])"]},"metadata":{"tags":[]},"execution_count":45}]},{"cell_type":"code","metadata":{"id":"H3DX8HsU4H5T","colab_type":"code","colab":{},"outputId":"bb107fd0-f7ee-4748-fee6-7c179a2d5151"},"source":["M[:,1] # column 1"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.40043577, 0.4791374 , 0.15459644])"]},"metadata":{"tags":[]},"execution_count":46}]},{"cell_type":"markdown","metadata":{"id":"YVOTbsDS4H5W","colab_type":"text"},"source":["We can assign new values to elements in an array using indexing:"]},{"cell_type":"code","metadata":{"id":"AiHZCF_q4H5X","colab_type":"code","colab":{}},"source":["M[0,0] = 1"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"gdRL6hoq4H5f","colab_type":"code","colab":{},"outputId":"f3534e44-0686-460e-dfdb-82fd009314c0"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1. , 0.40043577, 0.66254019],\n"," [ 0.60410063, 0.4791374 , 0.8237106 ],\n"," [ 0.96856318, 0.15459644, 0.96082399]])"]},"metadata":{"tags":[]},"execution_count":48}]},{"cell_type":"code","metadata":{"id":"-CkU0him4H5l","colab_type":"code","colab":{}},"source":["# also works for rows and columns\n","M[1,:] = 0\n","M[:,2] = -1"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBRT1tyZ4H5x","colab_type":"code","colab":{},"outputId":"f47a6691-2251-4a73-f3ca-b2fbbf08022e"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1. , 0.40043577, -1. ],\n"," [ 0. , 0. , -1. ],\n"," [ 0.96856318, 0.15459644, -1. 
]])"]},"metadata":{"tags":[]},"execution_count":50}]},{"cell_type":"markdown","metadata":{"id":"WFkuwnxB4H5z","colab_type":"text"},"source":["### Index slicing"]},{"cell_type":"markdown","metadata":{"id":"dFRAptaD4H50","colab_type":"text"},"source":["Index slicing is the technical name for the syntax `M[lower:upper:step]` to extract part of an array:"]},{"cell_type":"code","metadata":{"id":"9wz8jo3Z4H51","colab_type":"code","colab":{},"outputId":"02e30d19-c5a6-4059-b660-536c28a72141"},"source":["A = array([1,2,3,4,5])\n","A"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1, 2, 3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":51}]},{"cell_type":"code","metadata":{"id":"nn08qKFJ4H54","colab_type":"code","colab":{},"outputId":"3253b970-9460-4623-b23c-ee3779336e36"},"source":["A[1:3]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([2, 3])"]},"metadata":{"tags":[]},"execution_count":52}]},{"cell_type":"markdown","metadata":{"id":"9HANOCRZ4H5-","colab_type":"text"},"source":["Array slices are *mutable*: if they are assigned a new value the original array from which the slice was extracted is modified:"]},{"cell_type":"code","metadata":{"id":"OgYbFcS64H5_","colab_type":"code","colab":{},"outputId":"bb43f189-fd8c-432a-9810-810de1dc66df"},"source":["A[1:3] = [-2,-3]\n","\n","A"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 1, -2, -3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":53}]},{"cell_type":"markdown","metadata":{"id":"jNczBl2X4H6E","colab_type":"text"},"source":["Negative indices counts from the end of the array (positive index from the begining):"]},{"cell_type":"code","metadata":{"id":"Mgur3nS94H6F","colab_type":"code","colab":{}},"source":["A = array([1,2,3,4,5])"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ovmQzTPT4H6I","colab_type":"code","colab":{},"outputId":"61419e48-b0d2-4ebe-8849-61b1efca5b38"},"source":["A[-1] # the last element in the array"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":59}]},{"cell_type":"code","metadata":{"id":"ZVHNR1Up4H6O","colab_type":"code","colab":{},"outputId":"bb0ef677-294c-4760-9bff-43567b1f09d8"},"source":["A[-3:] # the last three elements"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":60}]},{"cell_type":"markdown","metadata":{"id":"CNZcnWVi4H6R","colab_type":"text"},"source":["### Fancy indexing"]},{"cell_type":"markdown","metadata":{"id":"xDsu0l564H6S","colab_type":"text"},"source":["Fancy indexing is the name for when an array or list is used in-place of an index: "]},{"cell_type":"code","metadata":{"id":"GgHGCVGp4H6S","colab_type":"code","colab":{},"outputId":"531f2545-88a5-4078-b4e3-fcfc72f1f2a9"},"source":["row_indices = [1, 2, 3]\n","A[row_indices]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[10, 11, 12, 13, 14],\n"," [20, 21, 22, 23, 24],\n"," [30, 31, 32, 33, 34]])"]},"metadata":{"tags":[]},"execution_count":64}]},{"cell_type":"code","metadata":{"id":"0Ol2NX0j4H6W","colab_type":"code","colab":{},"outputId":"f484a32e-826e-4c7b-982c-59dd7700e14f"},"source":["col_indices = [1, 2, -1] # remember, index -1 means the last element\n","A[row_indices, 
col_indices]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([11, 22, 34])"]},"metadata":{"tags":[]},"execution_count":65}]},{"cell_type":"markdown","metadata":{"id":"sYOLqh5s4H6Y","colab_type":"text"},"source":["### Linear and Matrix algebra"]},{"cell_type":"markdown","metadata":{"id":"cE8dK2DK4H6Z","colab_type":"text"},"source":["Numpy's real strength is in optimized linear and matrix algebric operations on vectors and matrices, but that's less relevant here."]},{"cell_type":"markdown","metadata":{"id":"0UsCmaIm4H6Z","colab_type":"text"},"source":["### Data processing"]},{"cell_type":"markdown","metadata":{"id":"NXPyElfW4H6a","colab_type":"text"},"source":["Often it is useful to store datasets in Numpy arrays. Numpy provides a number of functions to calculate statistics of datasets in arrays. \n","\n","For example, let's calculate some properties from the Stockholm temperature dataset used above."]},{"cell_type":"code","metadata":{"id":"k4KNJQvy4H6b","colab_type":"code","colab":{},"outputId":"7b6235eb-1446-44bc-da3a-678fe4780f24"},"source":["# reminder, the tempeature dataset is stored in the data variable:\n","data = random.randint(10,size=(8,8))\n","shape(data)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(8, 8)"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"R4QtrGfi4H6h","colab_type":"text"},"source":["#### mean"]},{"cell_type":"code","metadata":{"id":"px1LjwPK4H6h","colab_type":"code","colab":{},"outputId":"2ab080ba-418a-49f5-c780-a5649e405c72"},"source":["mean(data[:,3])"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5.25"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"mgBbOZ2H4H6k","colab_type":"text"},"source":["#### standard deviations and variance"]},{"cell_type":"code","metadata":{"id":"KNsHBTsm4H6l","colab_type":"code","colab":{},"outputId":"1cedb05f-e5b4-405f-f902-a39370b87bae"},"source":["std(data[:,3]), var(data[:,3])"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(1.6393596310755001, 2.6875)"]},"metadata":{"tags":[]},"execution_count":21}]},{"cell_type":"markdown","metadata":{"id":"dPDTaPLn4H6m","colab_type":"text"},"source":["#### min and max"]},{"cell_type":"code","metadata":{"id":"CoF7hVsZ4H6o","colab_type":"code","colab":{},"outputId":"c74bab1a-18c7-4d91-ccf4-6a8db61c653d"},"source":["data[:,3].min()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":24}]},{"cell_type":"code","metadata":{"id":"_UMY69jr4H6q","colab_type":"code","colab":{},"outputId":"720b0a96-fa4d-4cd3-84ea-284c91cfb802"},"source":["data[:,3].max()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["9"]},"metadata":{"tags":[]},"execution_count":25}]},{"cell_type":"markdown","metadata":{"id":"hdcCMF5B4H6u","colab_type":"text"},"source":["#### sum, prod, and their cumulative versions"]},{"cell_type":"code","metadata":{"id":"x5sBrhVU4H6x","colab_type":"code","colab":{},"outputId":"1eec9c5b-0005-40e6-f8bd-b2e8e8269e15"},"source":["d = arange(0, 10)\n","d"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 1, 2, 3, 4, 5, 6, 7, 8, 
9])"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"code","metadata":{"id":"7SAdiHLi4H60","colab_type":"code","colab":{},"outputId":"ed91617c-4814-4a38-825f-75ebaff86666"},"source":["# sum up all elements\n","sum(d)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["45"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"code","metadata":{"id":"i-3OP3WX4H63","colab_type":"code","colab":{},"outputId":"b52618b4-7f3d-4d70-aa43-19b47d1b6261"},"source":["# product of all elements\n","prod(d+1)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["3628800"]},"metadata":{"tags":[]},"execution_count":28}]},{"cell_type":"code","metadata":{"id":"gfm7Wv_E4H64","colab_type":"code","colab":{},"outputId":"794b747d-ba5f-49e2-f14c-22dbd3e6bde5"},"source":["# cummulative sum\n","cumsum(d)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45])"]},"metadata":{"tags":[]},"execution_count":29}]},{"cell_type":"code","metadata":{"id":"VTAeSiH54H66","colab_type":"code","colab":{},"outputId":"190d8a7d-7eba-48aa-e371-54fbf803c194"},"source":["# cummulative product\n","cumprod(d+1)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 1, 2, 6, 24, 120, 720, 5040,\n"," 40320, 362880, 3628800])"]},"metadata":{"tags":[]},"execution_count":30}]},{"cell_type":"markdown","metadata":{"id":"33g0Fjtk4H7D","colab_type":"text"},"source":["## Iterating over array elements"]},{"cell_type":"markdown","metadata":{"id":"kOVcn0eu4H7D","colab_type":"text"},"source":["Generally, we want to avoid iterating over the elements of arrays whenever we can (at all costs). The reason is that in a interpreted language like Python (or MATLAB), iterations are really slow compared to vectorized operations. \n","\n","However, sometimes iterations are unavoidable. 
For such cases, the Python `for` loop is the most convenient way to iterate over an array:"]},{"cell_type":"code","metadata":{"id":"5Bi7I-Lz4H7E","colab_type":"code","colab":{},"outputId":"b65ac7c8-e662-4bfe-e9da-9f087c09f09f"},"source":["v = array([1,2,3,4])\n","\n","for element in v:\n"," print(element)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["1\n","2\n","3\n","4\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"q1Mp_Qvq4H7G","colab_type":"code","colab":{},"outputId":"d277af64-0bdd-4d09-f304-dd649607bd4f"},"source":["M = array([[1,2], [3,4]])\n","\n","for row in M:\n"," print(\"row\", row)\n"," \n"," for element in row:\n"," print(element)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["row [1 2]\n","1\n","2\n","row [3 4]\n","3\n","4\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"sOq8jYMN4H7I","colab_type":"text"},"source":["When we need to iterate over each element of an array and modify its elements, it is convenient to use the `enumerate` function to obtain both the element and its index in the `for` loop: "]},{"cell_type":"code","metadata":{"id":"93m_6Ev34H7J","colab_type":"code","colab":{},"outputId":"86c347ae-b2ac-49c6-8fb3-8b1e115e4d91"},"source":["for row_idx, row in enumerate(M):\n"," print(\"row_idx\", row_idx, \"row\", row)\n"," \n"," for col_idx, element in enumerate(row):\n"," print(\"col_idx\", col_idx, \"element\", element)\n"," \n"," # update the matrix M: square each element\n"," M[row_idx, col_idx] = element ** 2"],"execution_count":0,"outputs":[{"output_type":"stream","text":["row_idx 0 row [1 2]\n","col_idx 0 element 1\n","col_idx 1 element 2\n","row_idx 1 row [3 4]\n","col_idx 0 element 3\n","col_idx 1 element 4\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"-UcjV-954H7K","colab_type":"code","colab":{},"outputId":"ca48ab3c-cf27-47ad-d6b5-83009cd1e724"},"source":["# each element in M is now squared\n","M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1, 4],\n"," [ 9, 16]])"]},"metadata":{"tags":[]},"execution_count":35}]},{"cell_type":"code","metadata":{"id":"kRTCtoHX4H7M","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /part_6.modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Modelling" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Loading the processed dataset" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !pip install -U -q datalearn19intro\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import seaborn as sns\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "from datalearn19intro import get_processed_intro_dataset\n", 29 | "%matplotlib inline" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "pd.options.display.float_format = '{:,.2f}'.format\n", 39 | "pd.set_option('display.max_columns', 150)\n", 40 | "pd.set_option('display.max_rows', 200)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "df = get_processed_intro_dataset()" 50 | ] 
51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "(1001, 22)" 61 | ] 62 | }, 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "df.shape" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = df.set_index('account_id')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 7, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | "
is_gmail_fromaccountscollection_21_daysbilled_users_countlog_max_tsizeos_android_avgos_chrome_os_avgos_ios_avgos_linux_avgos_mac_avgtotal_events_sumnotification_events_sumnew_entry_events_sumpayment_events_suminbox_events_sumcommunicating_events_sumnon_communicating_events_sumweb_events_sumios_events_sumdesktop_app_events_sumempty_events_sumlead_score
account_id
27934961.000.000.000.570.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27934971.000.000.000.570.450.000.550.000.000.010.040.020.000.000.000.000.000.170.000.000.00
27934981.000.000.000.430.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27934991.000.000.000.571.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27935001.000.000.000.430.001.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27935010.000.000.000.570.000.000.000.001.000.000.000.000.000.000.000.000.000.000.000.000.00
27935021.000.000.000.430.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27935031.000.000.000.571.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
\n", 349 | "
" 350 | ], 351 | "text/plain": [ 352 | " is_gmail_fromaccounts collection_21_days billed_users_count \\\n", 353 | "account_id \n", 354 | "2793496 1.00 0.00 0.00 \n", 355 | "2793497 1.00 0.00 0.00 \n", 356 | "2793498 1.00 0.00 0.00 \n", 357 | "2793499 1.00 0.00 0.00 \n", 358 | "2793500 1.00 0.00 0.00 \n", 359 | "2793501 0.00 0.00 0.00 \n", 360 | "2793502 1.00 0.00 0.00 \n", 361 | "2793503 1.00 0.00 0.00 \n", 362 | "\n", 363 | " log_max_tsize os_android_avg os_chrome_os_avg os_ios_avg \\\n", 364 | "account_id \n", 365 | "2793496 0.57 0.00 0.00 0.00 \n", 366 | "2793497 0.57 0.45 0.00 0.55 \n", 367 | "2793498 0.43 0.00 0.00 0.00 \n", 368 | "2793499 0.57 1.00 0.00 0.00 \n", 369 | "2793500 0.43 0.00 1.00 0.00 \n", 370 | "2793501 0.57 0.00 0.00 0.00 \n", 371 | "2793502 0.43 0.00 0.00 0.00 \n", 372 | "2793503 0.57 1.00 0.00 0.00 \n", 373 | "\n", 374 | " os_linux_avg os_mac_avg total_events_sum \\\n", 375 | "account_id \n", 376 | "2793496 0.00 0.00 0.00 \n", 377 | "2793497 0.00 0.00 0.01 \n", 378 | "2793498 0.00 0.00 0.00 \n", 379 | "2793499 0.00 0.00 0.00 \n", 380 | "2793500 0.00 0.00 0.00 \n", 381 | "2793501 0.00 1.00 0.00 \n", 382 | "2793502 0.00 0.00 0.00 \n", 383 | "2793503 0.00 0.00 0.00 \n", 384 | "\n", 385 | " notification_events_sum new_entry_events_sum payment_events_sum \\\n", 386 | "account_id \n", 387 | "2793496 0.00 0.00 0.00 \n", 388 | "2793497 0.04 0.02 0.00 \n", 389 | "2793498 0.00 0.00 0.00 \n", 390 | "2793499 0.00 0.00 0.00 \n", 391 | "2793500 0.00 0.00 0.00 \n", 392 | "2793501 0.00 0.00 0.00 \n", 393 | "2793502 0.00 0.00 0.00 \n", 394 | "2793503 0.00 0.00 0.00 \n", 395 | "\n", 396 | " inbox_events_sum communicating_events_sum \\\n", 397 | "account_id \n", 398 | "2793496 0.00 0.00 \n", 399 | "2793497 0.00 0.00 \n", 400 | "2793498 0.00 0.00 \n", 401 | "2793499 0.00 0.00 \n", 402 | "2793500 0.00 0.00 \n", 403 | "2793501 0.00 0.00 \n", 404 | "2793502 0.00 0.00 \n", 405 | "2793503 0.00 0.00 \n", 406 | "\n", 407 | " non_communicating_events_sum web_events_sum ios_events_sum \\\n", 408 | "account_id \n", 409 | "2793496 0.00 0.00 0.00 \n", 410 | "2793497 0.00 0.00 0.17 \n", 411 | "2793498 0.00 0.00 0.00 \n", 412 | "2793499 0.00 0.00 0.00 \n", 413 | "2793500 0.00 0.00 0.00 \n", 414 | "2793501 0.00 0.00 0.00 \n", 415 | "2793502 0.00 0.00 0.00 \n", 416 | "2793503 0.00 0.00 0.00 \n", 417 | "\n", 418 | " desktop_app_events_sum empty_events_sum lead_score \n", 419 | "account_id \n", 420 | "2793496 0.00 0.00 0.00 \n", 421 | "2793497 0.00 0.00 0.00 \n", 422 | "2793498 0.00 0.00 0.00 \n", 423 | "2793499 0.00 0.00 0.00 \n", 424 | "2793500 0.00 0.00 0.00 \n", 425 | "2793501 0.00 0.00 0.00 \n", 426 | "2793502 0.00 0.00 0.00 \n", 427 | "2793503 0.00 0.00 0.00 " 428 | ] 429 | }, 430 | "execution_count": 7, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "df.head(8)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 8, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "X = df.drop('lead_score', axis=1)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 9, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "y = df['lead_score']" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "## Data split" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 10, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "from sklearn.model_selection import train_test_split" 471 | ] 472 | }, 473 
| { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "Let's split the data into train and test sets, at a 80/20 ratio." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 11, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Model fit" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 32, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "from sklearn.linear_model import LogisticRegression" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 33, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "clf = LogisticRegression()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 34, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stderr", 521 | "output_type": "stream", 522 | "text": [ 523 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 524 | " FutureWarning)\n" 525 | ] 526 | }, 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 531 | " intercept_scaling=1, max_iter=100, multi_class='warn',\n", 532 | " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", 533 | " tol=0.0001, verbose=0, warm_start=False)" 534 | ] 535 | }, 536 | "execution_count": 34, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "clf.fit(X_train, y_train)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 35, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "y_pred = clf.predict(X_test)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 36, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "from sklearn.metrics import accuracy_score, precision_score, recall_score" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 37, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "0.9701492537313433" 572 | ] 573 | }, 574 | "execution_count": 37, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "accuracy_score(y_test, y_pred)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "OMG! That's amazing!" 
588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 38, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "name": "stderr", 597 | "output_type": "stream", 598 | "text": [ 599 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", 600 | " 'precision', 'predicted', average, warn_for)\n" 601 | ] 602 | }, 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "0.0" 607 | ] 608 | }, 609 | "execution_count": 38, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "precision_score(y_test, y_pred)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 39, 621 | "metadata": {}, 622 | "outputs": [ 623 | { 624 | "data": { 625 | "text/plain": [ 626 | "0.0" 627 | ] 628 | }, 629 | "execution_count": 39, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "recall_score(y_test, y_pred)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "Oh no! Maybe our model wasn't as good as we thought!\n", 643 | "What happened?" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 40, 649 | "metadata": {}, 650 | "outputs": [ 651 | { 652 | "data": { 653 | "text/plain": [ 654 | "(array([0.]), array([201]))" 655 | ] 656 | }, 657 | "execution_count": 40, 658 | "metadata": {}, 659 | "output_type": "execute_result" 660 | } 661 | ], 662 | "source": [ 663 | "np.unique(y_pred, return_counts=True)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 41, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "data": { 673 | "text/plain": [ 674 | "(array([0., 1.]), array([980, 21]))" 675 | ] 676 | }, 677 | "execution_count": 41, 678 | "metadata": {}, 679 | "output_type": "execute_result" 680 | } 681 | ], 682 | "source": [ 683 | "np.unique(y, return_counts=True)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 42, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "name": "stdout", 693 | "output_type": "stream", 694 | "text": [ 695 | "Accuracy: 97.01%\n", 696 | "Precision: 0.00%\n", 697 | "Recall: 0.00%\n" 698 | ] 699 | }, 700 | { 701 | "name": "stderr", 702 | "output_type": "stream", 703 | "text": [ 704 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", 705 | " 'precision', 'predicted', average, warn_for)\n" 706 | ] 707 | } 708 | ], 709 | "source": [ 710 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 711 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 712 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "Notice, we have 980 negative examples and only 21 positive ones." 
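,
    "\n",
    "(A quick, hedged way to read the majority-class baseline straight off the labels; this little check is an addition, not part of the original run:)\n",
    "\n",
    "```python\n",
    "print((y == 0).mean())  # ~0.979, so always predicting 0 already gives ~98% accuracy\n",
    "```"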
720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 43, 725 | "metadata": {}, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "text/plain": [ 730 | "97.9020979020979" 731 | ] 732 | }, 733 | "execution_count": 43, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "980/1001 * 100" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "Our model was optimizing accuracy when fitting its parameters, and the easiest way to do that is to simply predict 0 all the time for roughly 98% accuracy!" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "We can use `class_weight='balanced'` to make each **class** equally important instead of each row/entry." 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 44, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "name": "stderr", 763 | "output_type": "stream", 764 | "text": [ 765 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 766 | " FutureWarning)\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "clf = LogisticRegression(class_weight='balanced')\n", 772 | "clf.fit(X_train, y_train)\n", 773 | "y_pred = clf.predict(X_test)" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 45, 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "name": "stdout", 783 | "output_type": "stream", 784 | "text": [ 785 | "Accuracy: 68.66%\n", 786 | "Precision: 7.46%\n", 787 | "Recall: 83.33%\n" 788 | ] 789 | } 790 | ], 791 | "source": [ 792 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 793 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 794 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "## Hyperparameter tuning" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "We can define, for each hyperparameter, a range of possible values." 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "For logistic regression here we will play with just two hyperparameters:\n", 816 | "\n", 817 | "* `penalty` - The regularization penalty to use. Both the L1 and L2 penalties are common.\n", 818 | "* `C` - Inverse of regularization strength; smaller values specify stronger regularization. Regularization can prevent overfitting, a concept which you'll discuss in the advanced workshops." 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 46, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "hyparam_grid = [{\n", 828 | " 'penalty': ['l1', 'l2'],\n", 829 | " 'C': [1, 10, 100, 1000],\n", 830 | " 'class_weight': ['balanced'],\n", 831 | "}]" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "We can optimize our hyperparameters for various metrics..."
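,
    "\n",
    "Concretely, the metric being optimized is just the `scoring` argument of the grid search (a hedged illustration; the grid-search loop a couple of cells below does exactly this for both metrics):\n",
    "\n",
    "```python\n",
    "GridSearchCV(LogisticRegression(), hyparam_grid, cv=5, scoring='recall_macro')  # or scoring='precision_macro'\n",
    "```"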
839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 49, 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "scores = ['precision', 'recall']" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 70, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "# silence annoying future warnings\n", 857 | "def warn(*args, **kwargs):\n", 858 | " pass\n", 859 | "import warnings\n", 860 | "warnings.warn = warn" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 54, 866 | "metadata": {}, 867 | "outputs": [ 868 | { 869 | "name": "stdout", 870 | "output_type": "stream", 871 | "text": [ 872 | "# Tuning hyper-parameters for precision\n", 873 | "\n", 874 | "Best parameters set found on development set:\n", 875 | "\n", 876 | "{'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 877 | "\n", 878 | "Grid scores on development set:\n", 879 | "\n", 880 | "0.525 (+/-0.014) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 881 | "0.525 (+/-0.014) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 882 | "0.515 (+/-0.036) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 883 | "0.525 (+/-0.014) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 884 | "0.508 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 885 | "0.521 (+/-0.024) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 886 | "0.516 (+/-0.032) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 887 | "0.513 (+/-0.031) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 888 | "\n", 889 | "Detailed classification report:\n", 890 | "\n", 891 | "The model is trained on the full development set.\n", 892 | "The scores are computed on the full evaluation set.\n", 893 | "\n", 894 | " precision recall f1-score support\n", 895 | "\n", 896 | " 0.0 0.99 0.68 0.81 195\n", 897 | " 1.0 0.07 0.83 0.14 6\n", 898 | "\n", 899 | " micro avg 0.69 0.69 0.69 201\n", 900 | " macro avg 0.53 0.76 0.47 201\n", 901 | "weighted avg 0.97 0.69 0.79 201\n", 902 | "\n", 903 | "\n", 904 | "# Tuning hyper-parameters for recall\n", 905 | "\n", 906 | "Best parameters set found on development set:\n", 907 | "\n", 908 | "{'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 909 | "\n", 910 | "Grid scores on development set:\n", 911 | "\n", 912 | "0.786 (+/-0.160) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 913 | "0.785 (+/-0.162) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 914 | "0.669 (+/-0.380) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 915 | "0.788 (+/-0.162) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 916 | "0.586 (+/-0.219) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 917 | "0.728 (+/-0.256) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 918 | "0.621 (+/-0.242) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 919 | "0.641 (+/-0.326) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 920 | "\n", 921 | "Detailed classification report:\n", 922 | "\n", 923 | "The model is trained on the full development set.\n", 924 | "The scores are computed on the full evaluation set.\n", 925 | "\n", 926 | " precision recall f1-score support\n", 927 | "\n", 928 | " 0.0 0.99 0.68 0.81 195\n", 929 | " 1.0 0.07 0.83 0.14 6\n", 930 | "\n", 931 | " micro avg 0.69 0.69 0.69 201\n", 932 | " macro avg 0.53 0.76 0.47 201\n", 933 | "weighted avg 0.97 0.69 
0.79 201\n", 934 | "\n", 935 | "\n" 936 | ] 937 | } 938 | ], 939 | "source": [ 940 | "from sklearn.metrics import classification_report\n", 941 | "from sklearn.model_selection import GridSearchCV\n", 942 | "for score in scores:\n", 943 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 944 | " print()\n", 945 | "\n", 946 | " clf = GridSearchCV(LogisticRegression(), hyparam_grid, cv=5,\n", 947 | " scoring='%s_macro' % score)\n", 948 | " clf.fit(X_train, y_train)\n", 949 | "\n", 950 | " print(\"Best parameters set found on development set:\")\n", 951 | " print()\n", 952 | " print(clf.best_params_)\n", 953 | " print()\n", 954 | " print(\"Grid scores on development set:\")\n", 955 | " print()\n", 956 | " means = clf.cv_results_['mean_test_score']\n", 957 | " stds = clf.cv_results_['std_test_score']\n", 958 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 959 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 960 | " % (mean, std * 2, params))\n", 961 | " print()\n", 962 | "\n", 963 | " print(\"Detailed classification report:\")\n", 964 | " print()\n", 965 | " print(\"The model is trained on the full development set.\")\n", 966 | " print(\"The scores are computed on the full evaluation set.\")\n", 967 | " print()\n", 968 | " y_true, y_pred = y_test, clf.predict(X_test)\n", 969 | " print(classification_report(y_true, y_pred))\n", 970 | " print()" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "We can now fit a model with the tuned hyperparameters over the entire training set and test its performance on the test set. This will provide an estimate of performance which is likely to be (possibly substantially) optimistically biased.\n", 978 | "\n", 979 | "A more advanced way to get estimates for hyperoptimization results - and a less biased one - is nested cross-validation; this is a more computationally intensive method, and out of scope here, of course. 
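 Just to give the flavour (a hedged sketch of assumed scikit-learn usage, not something run in this notebook): the grid search itself is wrapped in an outer cross-validation loop, so that hyperparameter selection happens inside each outer fold:\n\n```python\nfrom sklearn.model_selection import cross_val_score\n\ninner = GridSearchCV(LogisticRegression(), hyparam_grid, cv=5, scoring='recall_macro')\nouter_scores = cross_val_score(inner, X, y, cv=5, scoring='recall_macro')\nprint(outer_scores.mean())  # a less biased estimate of the tuned model's performance\n```\n\nBut that is a story for another workshop. 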
:)" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 55, 985 | "metadata": {}, 986 | "outputs": [ 987 | { 988 | "name": "stdout", 989 | "output_type": "stream", 990 | "text": [ 991 | "Accuracy: 68.66%\n", 992 | "Precision: 7.46%\n", 993 | "Recall: 83.33%\n" 994 | ] 995 | } 996 | ], 997 | "source": [ 998 | "clf = LogisticRegression(class_weight='balanced', C=10, penalty='l2')\n", 999 | "clf.fit(X_train, y_train)\n", 1000 | "y_pred = clf.predict(X_test)\n", 1001 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1002 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1003 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | "(note: this produced no difference in relation to the default hyperparameters)" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "markdown", 1015 | "metadata": {}, 1016 | "source": [ 1017 | "# Trying other models is easy" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "metadata": {}, 1023 | "source": [ 1024 | "## SVM" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 67, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "from sklearn.svm import SVC" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": 58, 1039 | "metadata": {}, 1040 | "outputs": [ 1041 | { 1042 | "name": "stdout", 1043 | "output_type": "stream", 1044 | "text": [ 1045 | "Accuracy: 97.01%\n", 1046 | "Precision: 0.00%\n", 1047 | "Recall: 0.00%\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "clf = RandomForestClassifier(class_weight='balanced')\n", 1053 | "clf.fit(X_train, y_train)\n", 1054 | "y_pred = clf.predict(X_test)\n", 1055 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1056 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1057 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": 60, 1063 | "metadata": {}, 1064 | "outputs": [ 1065 | { 1066 | "name": "stdout", 1067 | "output_type": "stream", 1068 | "text": [ 1069 | "Accuracy: 68.66%\n", 1070 | "Precision: 7.46%\n", 1071 | "Recall: 83.33%\n" 1072 | ] 1073 | } 1074 | ], 1075 | "source": [ 1076 | "clf = SVC(class_weight='balanced')\n", 1077 | "clf.fit(X_train, y_train)\n", 1078 | "y_pred = clf.predict(X_test)\n", 1079 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1080 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1081 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "markdown", 1086 | "metadata": {}, 1087 | "source": [ 1088 | "Don't be surprised that different linear models tend to achieve the same performance" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": 69, 1094 | "metadata": {}, 1095 | "outputs": [ 1096 | { 1097 | "name": "stdout", 1098 | "output_type": "stream", 1099 | "text": [ 1100 | "Accuracy: 2.99%\n", 1101 | "Precision: 2.99%\n", 1102 | "Recall: 100.00%\n" 1103 | ] 1104 | } 1105 | ], 1106 | "source": [ 1107 | "clf = SVC(class_weight='balanced', kernel='poly')\n", 1108 | "clf.fit(X_train, y_train)\n", 1109 | "y_pred = clf.predict(X_test)\n", 1110 | "print(\"Accuracy: 
{:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1111 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1112 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 73, 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "hyparam_grid = [{\n", 1122 | " 'kernel': ['rbf', 'poly', 'sigmoid'],\n", 1123 | " 'C': [0.1, 1, 10, 100],\n", 1124 | " 'degree': [2, 3, 4, 5],\n", 1125 | " 'class_weight': ['balanced'],\n", 1126 | "}]" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": 74, 1132 | "metadata": { 1133 | "scrolled": false 1134 | }, 1135 | "outputs": [ 1136 | { 1137 | "name": "stdout", 1138 | "output_type": "stream", 1139 | "text": [ 1140 | "# Tuning hyper-parameters for precision\n", 1141 | "\n", 1142 | "Best parameters set found on development set:\n", 1143 | "\n", 1144 | "{'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1145 | "\n", 1146 | "Grid scores on development set:\n", 1147 | "\n", 1148 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1149 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1150 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1151 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1152 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1153 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1154 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1155 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1156 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1157 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1158 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1159 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1160 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1161 | "0.527 (+/-0.010) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1162 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1163 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1164 | "0.009 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1165 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1166 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1167 | "0.009 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1168 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1169 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1170 | "0.009 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1171 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 
'kernel': 'sigmoid'}\n", 1172 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1173 | "0.530 (+/-0.004) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1174 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1175 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1176 | "0.527 (+/-0.011) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1177 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1178 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1179 | "0.491 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1180 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1181 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1182 | "0.491 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1183 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1184 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1185 | "0.527 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1186 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1187 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1188 | "0.530 (+/-0.004) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1189 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1190 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1191 | "0.521 (+/-0.014) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1192 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1193 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1194 | "0.009 (+/-0.000) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1195 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1196 | "\n", 1197 | "Detailed classification report:\n", 1198 | "\n", 1199 | "The model is trained on the full development set.\n", 1200 | "The scores are computed on the full evaluation set.\n", 1201 | "\n", 1202 | " precision recall f1-score support\n", 1203 | "\n", 1204 | " 0.0 0.99 0.68 0.81 195\n", 1205 | " 1.0 0.07 0.83 0.14 6\n", 1206 | "\n", 1207 | " micro avg 0.69 0.69 0.69 201\n", 1208 | " macro avg 0.53 0.76 0.47 201\n", 1209 | "weighted avg 0.97 0.69 0.79 201\n", 1210 | "\n", 1211 | "\n", 1212 | "# Tuning hyper-parameters for recall\n", 1213 | "\n", 1214 | "Best parameters set found on development set:\n", 1215 | "\n", 1216 | "{'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1217 | "\n", 1218 | "Grid scores on development set:\n", 1219 | "\n", 1220 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1221 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1222 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 
'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1223 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1224 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1225 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1226 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1227 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1228 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1229 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1230 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1231 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1232 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1233 | "0.825 (+/-0.083) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1234 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1235 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1236 | "0.500 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1237 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1238 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1239 | "0.500 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1240 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1241 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1242 | "0.500 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1243 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1244 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1245 | "0.848 (+/-0.025) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1246 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1247 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1248 | "0.821 (+/-0.097) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1249 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1250 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1251 | "0.500 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1252 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1253 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1254 | "0.500 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1255 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1256 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1257 | "0.818 
(+/-0.136) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1258 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1259 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1260 | "0.847 (+/-0.025) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1261 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1262 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1263 | "0.735 (+/-0.191) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1264 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1265 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1266 | "0.500 (+/-0.000) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1267 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1268 | "\n", 1269 | "Detailed classification report:\n", 1270 | "\n", 1271 | "The model is trained on the full development set.\n", 1272 | "The scores are computed on the full evaluation set.\n", 1273 | "\n", 1274 | " precision recall f1-score support\n", 1275 | "\n", 1276 | " 0.0 0.99 0.68 0.81 195\n", 1277 | " 1.0 0.07 0.83 0.14 6\n", 1278 | "\n", 1279 | " micro avg 0.69 0.69 0.69 201\n", 1280 | " macro avg 0.53 0.76 0.47 201\n", 1281 | "weighted avg 0.97 0.69 0.79 201\n", 1282 | "\n", 1283 | "\n" 1284 | ] 1285 | } 1286 | ], 1287 | "source": [ 1288 | "from sklearn.metrics import classification_report\n", 1289 | "from sklearn.model_selection import GridSearchCV\n", 1290 | "for score in scores:\n", 1291 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 1292 | " print()\n", 1293 | "\n", 1294 | " clf = GridSearchCV(SVC(), hyparam_grid, cv=5,\n", 1295 | " scoring='%s_macro' % score)\n", 1296 | " clf.fit(X_train, y_train)\n", 1297 | "\n", 1298 | " print(\"Best parameters set found on development set:\")\n", 1299 | " print()\n", 1300 | " print(clf.best_params_)\n", 1301 | " print()\n", 1302 | " print(\"Grid scores on development set:\")\n", 1303 | " print()\n", 1304 | " means = clf.cv_results_['mean_test_score']\n", 1305 | " stds = clf.cv_results_['std_test_score']\n", 1306 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 1307 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 1308 | " % (mean, std * 2, params))\n", 1309 | " print()\n", 1310 | "\n", 1311 | " print(\"Detailed classification report:\")\n", 1312 | " print()\n", 1313 | " print(\"The model is trained on the full development set.\")\n", 1314 | " print(\"The scores are computed on the full evaluation set.\")\n", 1315 | " print()\n", 1316 | " y_true, y_pred = y_test, clf.predict(X_test)\n", 1317 | " print(classification_report(y_true, y_pred))\n", 1318 | " print()" 1319 | ] 1320 | }, 1321 | { 1322 | "cell_type": "markdown", 1323 | "metadata": {}, 1324 | "source": [ 1325 | "## Random forest" 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "code", 1330 | "execution_count": 68, 1331 | "metadata": {}, 1332 | "outputs": [], 1333 | "source": [ 1334 | "from sklearn.ensemble import RandomForestClassifier" 1335 | ] 1336 | }, 1337 | { 1338 | "cell_type": "code", 1339 | "execution_count": 63, 1340 | "metadata": {}, 1341 | "outputs": [], 1342 | "source": [ 1343 | "hyparam_grid = [{\n", 1344 | " 'n_estimators': 
[5, 20, 100],\n", 1345 | " 'criterion': ['gini', 'entropy'],\n", 1346 | " 'max_depth': [None, 3, 5],\n", 1347 | " 'class_weight': ['balanced'],\n", 1348 | "}]" 1349 | ] 1350 | }, 1351 | { 1352 | "cell_type": "code", 1353 | "execution_count": 64, 1354 | "metadata": { 1355 | "scrolled": false 1356 | }, 1357 | "outputs": [ 1358 | { 1359 | "name": "stdout", 1360 | "output_type": "stream", 1361 | "text": [ 1362 | "# Tuning hyper-parameters for precision\n", 1363 | "\n", 1364 | "Best parameters set found on development set:\n", 1365 | "\n", 1366 | "{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}\n", 1367 | "\n", 1368 | "Grid scores on development set:\n", 1369 | "\n", 1370 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 5}\n", 1371 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 20}\n", 1372 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}\n", 1373 | "0.524 (+/-0.058) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 5}\n", 1374 | "0.524 (+/-0.136) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 20}\n", 1375 | "0.511 (+/-0.083) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}\n", 1376 | "0.533 (+/-0.132) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}\n", 1377 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 20}\n", 1378 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}\n", 1379 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 5}\n", 1380 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 20}\n", 1381 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}\n", 1382 | "0.503 (+/-0.053) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 5}\n", 1383 | "0.533 (+/-0.132) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 20}\n", 1384 | "0.505 (+/-0.060) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 100}\n", 1385 | "0.518 (+/-0.071) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 5}\n", 1386 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 20}\n", 1387 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100}\n", 1388 | "\n", 1389 | "Detailed classification report:\n", 1390 | "\n", 1391 | "The model is trained on the full development set.\n", 1392 | "The scores are computed on the full evaluation set.\n", 1393 | "\n", 1394 | " precision recall f1-score support\n", 1395 | "\n", 1396 | " 0.0 0.98 0.90 0.94 195\n", 1397 | " 1.0 0.10 0.33 0.15 6\n", 1398 | "\n", 1399 | " micro avg 0.89 0.89 0.89 201\n", 1400 | " macro avg 0.54 0.62 0.54 201\n", 1401 | "weighted avg 0.95 0.89 0.92 201\n", 1402 | "\n", 1403 | "\n", 1404 | "# Tuning hyper-parameters for recall\n", 1405 | "\n", 1406 | "Best parameters set found on development set:\n", 1407 | "\n", 1408 | 
"{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 5}\n", 1409 | "\n", 1410 | "Grid scores on development set:\n", 1411 | "\n", 1412 | "0.499 (+/-0.003) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 5}\n", 1413 | "0.500 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 20}\n", 1414 | "0.500 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}\n", 1415 | "0.513 (+/-0.223) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 5}\n", 1416 | "0.503 (+/-0.133) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 20}\n", 1417 | "0.486 (+/-0.010) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}\n", 1418 | "0.508 (+/-0.145) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}\n", 1419 | "0.492 (+/-0.015) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 20}\n", 1420 | "0.496 (+/-0.006) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}\n", 1421 | "0.530 (+/-0.133) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 5}\n", 1422 | "0.499 (+/-0.003) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 20}\n", 1423 | "0.500 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}\n", 1424 | "0.557 (+/-0.257) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 5}\n", 1425 | "0.539 (+/-0.139) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 20}\n", 1426 | "0.512 (+/-0.140) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 100}\n", 1427 | "0.472 (+/-0.064) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 5}\n", 1428 | "0.492 (+/-0.019) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 20}\n", 1429 | "0.497 (+/-0.006) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100}\n", 1430 | "\n", 1431 | "Detailed classification report:\n", 1432 | "\n", 1433 | "The model is trained on the full development set.\n", 1434 | "The scores are computed on the full evaluation set.\n", 1435 | "\n", 1436 | " precision recall f1-score support\n", 1437 | "\n", 1438 | " 0.0 0.98 0.81 0.88 195\n", 1439 | " 1.0 0.05 0.33 0.09 6\n", 1440 | "\n", 1441 | " micro avg 0.79 0.79 0.79 201\n", 1442 | " macro avg 0.51 0.57 0.48 201\n", 1443 | "weighted avg 0.95 0.79 0.86 201\n", 1444 | "\n", 1445 | "\n" 1446 | ] 1447 | } 1448 | ], 1449 | "source": [ 1450 | "from sklearn.metrics import classification_report\n", 1451 | "from sklearn.model_selection import GridSearchCV\n", 1452 | "for score in scores:\n", 1453 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 1454 | " print()\n", 1455 | "\n", 1456 | " clf = GridSearchCV(RandomForestClassifier(), hyparam_grid, cv=5,\n", 1457 | " scoring='%s_macro' % score)\n", 1458 | " clf.fit(X_train, y_train)\n", 1459 | "\n", 1460 | " print(\"Best parameters set found on development set:\")\n", 1461 | " print()\n", 1462 | " print(clf.best_params_)\n", 1463 | " print()\n", 1464 | " print(\"Grid scores on development set:\")\n", 1465 | " print()\n", 
1466 | " means = clf.cv_results_['mean_test_score']\n", 1467 | " stds = clf.cv_results_['std_test_score']\n", 1468 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 1469 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 1470 | " % (mean, std * 2, params))\n", 1471 | " print()\n", 1472 | "\n", 1473 | " print(\"Detailed classification report:\")\n", 1474 | " print()\n", 1475 | " print(\"The model is trained on the full development set.\")\n", 1476 | " print(\"The scores are computed on the full evaluation set.\")\n", 1477 | " print()\n", 1478 | " y_true, y_pred = y_test, clf.predict(X_test)\n", 1479 | " print(classification_report(y_true, y_pred))\n", 1480 | " print()" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "markdown", 1485 | "metadata": {}, 1486 | "source": [ 1487 | "Without putting too much effort into hyperparameter tuning, a random forest classifier reached a macro-averaged F1 score of 54%, compared to 47% for both logistic regression and the SVM." 1488 | ] 1489 | }, 1490 | { 1491 | "cell_type": "markdown", 1492 | "metadata": {}, 1493 | "source": [ 1494 | "Even if that is not what we want to optimize for here (and that depends on a lot of factors), this still demonstrates why giving other models at least a superficial examination can be worthwhile." 1495 | ] 1496 | }, 1497 | { 1498 | "cell_type": "markdown", 1499 | "metadata": {}, 1500 | "source": [ 1501 | "## Final notes\n", 1502 | "\n", 1503 | "The classification problem here is significantly imbalanced, and we did not address that beyond balancing class weights; an imbalance of roughly 98/2 probably justifies more sophisticated tools.\n", 1504 | "\n", 1505 | "One such option is to treat positive examples as anomalies, and to then draw on knowledge and methods from the field of anomaly detection; a rough sketch of this idea follows below. Other options include more advanced processing of the data.\n", 1506 | "\n", 1507 | "The advanced workshops might go into more details." 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "markdown", 1512 | "metadata": {}, 1513 | "source": [ 1514 | "# We're done! Thank you!"
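]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A rough illustration of the anomaly-detection option from the final notes: fit an `IsolationForest` on majority-class training examples only, then flag predicted outliers as positives. This is only a sketch, assuming the `X_train`, `y_train`, `X_test` and `y_test` variables from the cells above; the `contamination=0.02` value is a guess matching the roughly 2% positive rate, not a tuned choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: treat the rare positive class as anomalies.\n",
"from sklearn.ensemble import IsolationForest\n",
"from sklearn.metrics import classification_report\n",
"\n",
"# Fit the detector on majority-class (negative) training examples only.\n",
"iso_forest = IsolationForest(n_estimators=100, contamination=0.02, random_state=0)\n",
"iso_forest.fit(X_train[y_train == 0])\n",
"\n",
"# IsolationForest predicts -1 for outliers; map outliers to the positive class.\n",
"y_pred_iso = (iso_forest.predict(X_test) == -1).astype(float)\n",
"print(classification_report(y_test, y_pred_iso))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As with the grid searches above, the contamination rate (and any decision threshold) would need proper tuning and validation before drawing conclusions from such a model."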
1515 | ] 1516 | } 1517 | ], 1518 | "metadata": { 1519 | "kernelspec": { 1520 | "display_name": "py3", 1521 | "language": "python", 1522 | "name": "py3" 1523 | }, 1524 | "language_info": { 1525 | "codemirror_mode": { 1526 | "name": "ipython", 1527 | "version": 3 1528 | }, 1529 | "file_extension": ".py", 1530 | "mimetype": "text/x-python", 1531 | "name": "python", 1532 | "nbconvert_exporter": "python", 1533 | "pygments_lexer": "ipython3", 1534 | "version": "3.6.5" 1535 | } 1536 | }, 1537 | "nbformat": 4, 1538 | "nbformat_minor": 2 1539 | } 1540 | -------------------------------------------------------------------------------- /util_0.reading_the_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reading the data the easy way" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Package installation and imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install -U -q datalearn19intro" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from datalearn19intro import (get_accounts, get_events, get_subscriptions, get_users)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Reading the data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "acc = get_accounts()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
HTML table: 3 rows × 49 columns
" 177 | ], 178 | "text/plain": [ 179 | " account_id marketing_source marketing_referrer \\\n", 180 | "0 2793496 bing https://www.bing.com/search?q=basecamp login \n", 181 | "1 2793497 NaN NaN \n", 182 | "2 2793498 adwordsverticals https://www.google.com/ \n", 183 | "\n", 184 | " created_at plan_id trial_start started_plan_at signup_box_origin \\\n", 185 | "0 2019-01-01 NaN 2019-01-01 NaN NaN \n", 186 | "1 2019-01-01 NaN 2019-01-01 NaN mobile_app \n", 187 | "2 2019-01-01 NaN 2019-01-01 NaN NaN \n", 188 | "\n", 189 | " churn_state churn_date ... utm_cluster_id pricing_version has_domain mrr \\\n", 190 | "0 none NaN ... orders 3 0 NaN \n", 191 | "1 none NaN ... NaN 3 0 NaN \n", 192 | "2 none NaN ... todos 3 0 NaN \n", 193 | "\n", 194 | " lead_score industry.1 team_size user_goal user_description sub_industry \n", 195 | "0 0 NaN NaN NaN NaN NaN \n", 196 | "1 0 NaN NaN NaN NaN NaN \n", 197 | "2 0 Other 1 NaN NaN NaN \n", 198 | "\n", 199 | "[3 rows x 49 columns]" 200 | ] 201 | }, 202 | "execution_count": 5, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "acc.head(3)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "events = get_events()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
HTML table: 3 rows × 23 columns
" 346 | ], 347 | "text/plain": [ 348 | " DATE user_id account_id total_events column_events board_events \\\n", 349 | "0 2019-01-01 6181915 2793790 212 30 1 \n", 350 | "1 2019-01-01 6182266 2793900 207 0 0 \n", 351 | "2 2019-01-01 6182190 2793860 4 0 0 \n", 352 | "\n", 353 | " num_of_boards count_kind_columns raw_events group_events ... \\\n", 354 | "0 1 4 14 0 ... \n", 355 | "1 0 0 0 0 ... \n", 356 | "2 0 0 0 0 ... \n", 357 | "\n", 358 | " new_entry_events payment_events inbox_events communicating_events \\\n", 359 | "0 1 0 1 4 \n", 360 | "1 1 0 0 0 \n", 361 | "2 0 0 0 0 \n", 362 | "\n", 363 | " non_communicating_events web_events ios_events android_events \\\n", 364 | "0 58 201 0 0 \n", 365 | "1 1 0 189 0 \n", 366 | "2 0 0 0 0 \n", 367 | "\n", 368 | " desktop_app_events empty_events \n", 369 | "0 0 9 \n", 370 | "1 0 8 \n", 371 | "2 0 0 \n", 372 | "\n", 373 | "[3 rows x 23 columns]" 374 | ] 375 | }, 376 | "execution_count": 8, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "events.head(3)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 9, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "subs = get_subscriptions()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 10, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "data": { 401 | "text/html": [ 402 | "
HTML table: 3 rows × 15 columns
" 495 | ], 496 | "text/plain": [ 497 | " event_date account_id plan_id event_type invoice_charge_amount \\\n", 498 | "0 2019-01-07 2793955 199 CHARGE 64.0 \n", 499 | "1 2019-05-07 2793955 199 RECURRING 64.0 \n", 500 | "2 2019-03-07 2793955 199 RECURRING 64.0 \n", 501 | "\n", 502 | " prev_plan_id status status_reason currency invoice_charge_amount_usd \\\n", 503 | "0 NaN NaN NaN AUD 44.67 \n", 504 | "1 199.0 NaN NaN AUD 43.94 \n", 505 | "2 199.0 NaN NaN AUD 44.20 \n", 506 | "\n", 507 | " mrr_gain subscription_id next_charge_date payment_type \\\n", 508 | "0 46.0 65984302.0 2019-02-07 21:05:00 CC \n", 509 | "1 0.0 65984302.0 2019-06-07 20:05:00 CC \n", 510 | "2 0.0 65984302.0 2019-04-07 20:05:00 CC \n", 511 | "\n", 512 | " transaction_date \n", 513 | "0 2019-01-07 21:05:00 \n", 514 | "1 2019-05-07 20:34:00 \n", 515 | "2 2019-03-07 21:33:00 " 516 | ] 517 | }, 518 | "execution_count": 10, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "subs.head(3)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 11, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "users = get_users()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 12, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/html": [ 544 | "
HTML table: 3 rows × 25 columns
" 662 | ], 663 | "text/plain": [ 664 | " account_id user_id created_at is_admin pending enabled \\\n", 665 | "0 2793496 6181341 2018-12-31 1 0 1 \n", 666 | "1 2793497 6181339 2018-12-31 1 0 1 \n", 667 | "2 2793497 6181386 2019-01-01 0 0 1 \n", 668 | "\n", 669 | " became_active_at time_diff city region ... \\\n", 670 | "0 2019-01-01 11.0 Warrawee New South Wales ... \n", 671 | "1 2019-01-01 -5.0 Old Bridge New Jersey ... \n", 672 | "2 2019-01-01 -5.0 New York New York ... \n", 673 | "\n", 674 | " browser is_gmail campaign_id first_user_in_account_id LANGUAGE \\\n", 675 | "0 microsoft edge 1 4005514.0 29837820.0 NaN \n", 676 | "1 NaN 1 4005516.0 29837847.0 NaN \n", 677 | "2 NaN 1 4005619.0 29839571.0 NaN \n", 678 | "\n", 679 | " gender seniority title mobile_activation_date has_phone \n", 680 | "0 M NaN NaN NaN 1 \n", 681 | "1 F NaN NaN 2019-01-01 1 \n", 682 | "2 M NaN NaN 2019-01-01 1 \n", 683 | "\n", 684 | "[3 rows x 25 columns]" 685 | ] 686 | }, 687 | "execution_count": 12, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "users.head(3)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [] 702 | } 703 | ], 704 | "metadata": { 705 | "kernelspec": { 706 | "display_name": "py3", 707 | "language": "python", 708 | "name": "py3" 709 | }, 710 | "language_info": { 711 | "codemirror_mode": { 712 | "name": "ipython", 713 | "version": 3 714 | }, 715 | "file_extension": ".py", 716 | "mimetype": "text/x-python", 717 | "name": "python", 718 | "nbconvert_exporter": "python", 719 | "pygments_lexer": "ipython3", 720 | "version": "3.6.5" 721 | } 722 | }, 723 | "nbformat": 4, 724 | "nbformat_minor": 2 725 | } 726 | --------------------------------------------------------------------------------