├── .gitignore ├── Intro to Machine Learning.pdf ├── README.rst ├── data ├── Dynamic events table.csv ├── Dynamic subscription table.csv ├── accounts.csv ├── monday_datalearn.csv └── users.csv ├── datalearn19intro ├── LICENSE ├── README.rst ├── datalearn19intro │ ├── __init__.py │ └── dataloader.py ├── mit_license_badge.svg └── setup.py ├── part_1.introducing_jupyter.ipynb ├── part_2.numpy.ipynb ├── part_3.pandas.ipynb ├── part_4.EDA.ipynb ├── part_5.Preprocessing.ipynb ├── part_6.modeling.ipynb └── util_0.reading_the_data.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # vim swap files 127 | *.swp 128 | 129 | .DS_Store 130 | -------------------------------------------------------------------------------- /Intro to Machine Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataHackIL/DataLearn-ML-Intro-2019/614ed306726f2b5f073b7da2d621069ecbd26023/Intro to Machine Learning.pdf -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | DataLearn Supervised ML Intro 2019 2 | ################################## 3 | 4 | The repository of the hands-on introduction to machine learning workshop of the DataLearn 2019 track at DataHack 2019. 5 | 6 | Video link: https://youtu.be/Su8YcXgkDsk?t=1701 7 | 8 | `Meetup event link `_ 9 | 10 | Notebooks by Shay Palachy. Presentation by Shay Palachy and Dana Kaner. *(Thank you Dana <3)* 11 | 12 | Resources 13 | ========= 14 | 15 | * Presentation: 16 | 17 | * `Intro to Machine Learning presentation `_ 18 | * Credits: Shay Palachy **and Dana Kaner** 19 | 20 | * Video recording: https://youtu.be/Su8YcXgkDsk?t=1701 21 | 22 | * Notebooks: 23 | 24 | 1. `Introducing Jupyter notebooks `_ 25 | 2. `Introduction to numpy `_ 26 | 3. `Introduction to pandas `_ 27 | 4. `Exploratory Data Analysis `_ 28 | 5. `Preprocessing `_ 29 | 6. `Modeling `_ 30 | 7. 
`Utility: Reading the data `_ 31 | 32 | 33 | Outline 34 | ======= 35 | 36 | * Tools of the trade 37 | 38 | * Jupyter notebooks 39 | * numpy 40 | * pandas 41 | 42 | * Data exploration 43 | * Preprocessing 44 | 45 | * Imputation 46 | * Scaling and normalization 47 | * Handling outliers 48 | * Feature extraction/generation 49 | * Feature selection 50 | * Dimensionality reduction 51 | 52 | * Modeling 53 | 54 | * Model fit & loss functions 55 | * Splitting your data 56 | * Model evaluation 57 | * Hyperparameter Optimization 58 | -------------------------------------------------------------------------------- /data/Dynamic subscription table.csv: -------------------------------------------------------------------------------- 1 | "event_date","account_id","plan_id","event_type","invoice_charge_amount","prev_plan_id","status","status_reason","currency","invoice_charge_amount_usd","mrr_gain","subscription_id","next_charge_date","payment_type","transaction_date" 2 | "2019-01-07",2793955,199,"CHARGE",64.00,NULL,"","","AUD",44.67,46.00,65984302,"2019-02-07 21:05:00","CC","2019-01-07 21:05:00" 3 | "2019-05-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.94,0.00,65984302,"2019-06-07 20:05:00","CC","2019-05-07 20:34:00" 4 | "2019-03-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.20,0.00,65984302,"2019-04-07 20:05:00","CC","2019-03-07 21:33:00" 5 | "2019-06-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.62,0.00,65984302,"2019-07-07 20:05:00","CC","2019-06-07 20:34:00" 6 | "2019-04-10",2793955,199,"PAYMENT_METHOD_UPDATED",0.00,199,"","","",0.00,NULL,65984302,NULL,"CC","2019-04-10 02:32:00" 7 | "2019-02-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.52,-1.00,65984302,"2019-03-07 21:05:00","CC","2019-02-07 21:33:00" 8 | "2019-04-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.54,0.00,65984302,"2019-05-07 20:05:00","CC","2019-04-07 20:23:00" 9 | "2019-07-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.91,0.00,65984302,"2019-08-07 20:05:00","CC","2019-07-07 20:01:00" 10 | "2019-01-07",2794172,198,"CHARGE",29.00,NULL,"","","USD",29.00,29.00,65925498,"2019-02-07 07:09:00","CC","2019-01-07 07:09:00" 11 | "2019-01-28",2794172,198,"CANCEL_ON_RENEWAL",0.00,198,"","","USD",0.00,NULL,65925498,"2019-02-07 07:09:00","CC","2019-01-28 07:27:00" 12 | "2019-02-07",2794172,198,"CANCELLATION",0.00,198,"","","USD",0.00,-29.00,65925498,"2019-02-07 07:09:00","CC","2019-02-07 07:32:00" 13 | "2019-01-28",2794489,231,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","USD",0.00,-20.00,66852410,"2020-01-28 15:02:00","CC","2019-01-28 15:02:00" 14 | "2019-01-28",2794489,232,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-03-28 07:00:00","",NULL 15 | "2019-01-28",2794489,231,"RECURRING",0.00,231,"","","USD",0.00,NULL,66852410,"2020-01-28 15:02:00","BALANCE","2019-01-28 15:02:00" 16 | "2019-01-29",2794489,232,"REFUND",-258.72,232,"","","USD",-258.72,NULL,66852410,"2020-01-28 15:02:00","CC","2019-01-29 09:51:00" 17 | "2019-01-29",2794489,231,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-01-17 08:00:00","",NULL 18 | "2019-01-28",2794489,232,"RECURRING",258.72,232,"","","USD",258.72,NULL,66852410,"2020-01-28 14:40:00","CC","2019-01-28 14:40:00" 19 | "2019-01-17",2794489,231,"CHARGE",468.00,NULL,"","","USD",468.00,39.00,66852410,"2020-01-17 08:51:00","CC","2019-01-17 08:51:00" 20 | "2019-01-28",2794489,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","USD",0.00,20.00,66852410,"2020-01-28 14:40:00","CC","2019-01-28 14:40:00" 21 | 
"2019-01-14",2793704,231,"CHARGE",624.00,NULL,"","","CAD",460.64,39.00,66569204,"2020-01-14 15:16:00","CC","2019-01-14 15:16:00" 22 | "2019-01-15",2793906,239,"CHARGE",1428.00,NULL,"","","USD",1428.00,119.00,66641482,"2020-01-15 04:57:00","CC","2019-01-15 04:57:00" 23 | "2019-01-15",2793906,239,"CC_CHARGE_FAILED",0.00,239,"","This transaction has been declined. Please check card details and try again, or contact your bank for assistance","USD",0.00,NULL,NULL,NULL,"CC","2019-01-15 04:56:00" 24 | "2019-01-03",2794458,232,"RECURRING",291.01,232,"","","USD",291.01,NULL,65496398,"2020-01-03 16:20:00","CC","2019-01-03 16:20:00" 25 | "2019-01-03",2794458,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","USD",0.00,24.00,65496398,"2020-01-03 16:20:00","CC","2019-01-03 16:20:00" 26 | "2019-01-02",2794458,231,"CHARGE",421.20,NULL,"","","USD",421.20,35.00,65496398,"2020-01-02 15:46:00","CC","2019-01-02 15:46:00" 27 | "2019-07-22",2794064,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","AUD",0.00,19.00,66092630,"2020-07-22 07:33:00","CC","2019-07-22 07:33:00" 28 | "2019-01-08",2794064,231,"CHARGE",624.00,NULL,"","","AUD",436.18,37.00,66092630,"2020-01-08 22:31:00","CC","2019-01-08 22:31:00" 29 | "2019-07-22",2794064,232,"RECURRING",660.96,232,"","","AUD",455.81,NULL,66092630,"2020-07-22 07:33:00","CC","2019-07-22 07:33:00" 30 | "2019-01-19",2794278,199,"CANCEL_ON_RENEWAL",0.00,199,"","","USD",0.00,NULL,65670660,"2019-02-04 10:17:00","CC","2019-01-19 14:45:00" 31 | "2019-02-04",2794278,199,"CANCELLATION",0.00,199,"","","USD",0.00,-48.00,65670660,"2019-02-04 10:17:00","CC","2019-02-04 10:33:00" 32 | "2019-01-04",2794278,199,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,65670660,"2019-02-04 10:17:00","CC","2019-01-04 10:17:00" 33 | "2019-01-21",2793789,231,"CHARGE",468.00,NULL,"","","USD",468.00,39.00,67146528,"2020-01-21 06:06:00","CC","2019-01-21 06:06:00" 34 | "2019-01-18",2793567,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","CAD",0.00,20.00,66577666,"2020-01-18 15:09:00","CC","2019-01-18 15:09:00" 35 | "2019-03-20",2793567,264,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","CAD",0.00,-10.00,66577666,"2021-03-20 20:38:00","CC","2019-03-20 20:38:00" 36 | "2019-01-18",2793567,232,"RECURRING",336.48,232,"","","CAD",248.73,NULL,66577666,"2020-01-18 15:09:00","CC","2019-01-18 15:09:00" 37 | "2019-01-18",2793567,232,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-03-18 07:00:00","",NULL 38 | "2019-03-20",2793567,264,"RECURRING",797.16,264,"","","CAD",585.82,NULL,66577666,"2021-03-20 20:38:00","CC","2019-03-20 20:38:00" 39 | "2019-01-14",2793567,231,"CHARGE",624.00,NULL,"","","CAD",460.64,39.00,66577666,"2020-01-14 16:33:00","CC","2019-01-14 16:33:00" 40 | "2019-03-20",2793567,264,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2021-05-20 07:00:00","",NULL 41 | "2019-01-11",2793828,230,"CHARGE",276.00,NULL,"","","USD",276.00,23.00,66303986,"2020-01-11 03:35:00","CC","2019-01-11 03:35:00" 42 | "2019-01-15",2794328,198,"CHARGE",25.00,NULL,"","","GBP",31.48,32.00,66658322,"2019-02-15 12:38:00","CC","2019-01-15 12:38:00" 43 | "2019-04-30",2794328,202,"REFUND",-51.00,202,"","","GBP",-64.38,NULL,66658322,"2019-05-28 09:13:00","CC","2019-04-30 10:08:00" 44 | "2019-01-30",2794328,202,"CONTRACT_CHANGE",0.00,198,"SUCCESS","","GBP",0.00,35.00,66658322,"2019-02-28 10:13:00","CC","2019-01-30 10:13:00" 45 | "2019-04-30",2794328,202,"CANCEL_ON_RENEWAL",0.00,202,"","","GBP",0.00,NULL,66658322,"2019-05-28 09:13:00","CC","2019-04-30 09:49:00" 46 | 
"2019-04-28",2794328,202,"RECURRING",51.00,202,"","","GBP",64.38,-1.00,66658322,"2019-05-28 09:13:00","CC","2019-04-28 11:17:00" 47 | "2019-02-28",2794328,202,"RECURRING",51.00,202,"","","GBP",66.56,1.00,66658322,"2019-03-28 09:13:00","CC","2019-02-28 10:37:00" 48 | "2019-01-30",2794328,202,"RECURRING",38.25,202,"","","GBP",49.28,NULL,66658322,"2019-02-28 10:13:00","CC","2019-01-30 10:13:00" 49 | "2019-04-30",2794328,202,"CANCELLATION",0.00,202,"","","GBP",0.00,-66.00,66658322,"2019-05-28 09:13:00","CC","2019-04-30 10:08:00" 50 | "2019-03-28",2794328,202,"RECURRING",51.00,202,"","","GBP",66.12,-1.00,66658322,"2019-04-28 09:13:00","CC","2019-03-28 09:24:00" 51 | "2019-01-17",2793924,264,"CONTRACT_CHANGE",0.00,263,"SUCCESS","","USD",0.00,17.00,66839172,"2021-01-17 13:06:00","CC","2019-01-17 13:06:00" 52 | "2019-01-17",2793924,263,"CHARGE",768.00,NULL,"","","USD",768.00,32.00,66839172,"2021-01-17 03:06:00","CC","2019-01-17 03:06:00" 53 | "2019-01-17",2793924,264,"RECURRING",408.00,264,"","","USD",408.00,NULL,66839172,"2021-01-17 13:06:00","CC","2019-01-17 13:06:00" 54 | "2019-06-29",2793924,268,"CONTRACT_CHANGE",0.00,264,"SUCCESS","","USD",0.00,50.00,66839172,"2021-06-29 10:08:00","CC","2019-06-29 10:08:00" 55 | "2019-06-29",2793924,268,"RECURRING",1470.48,268,"","","USD",1470.48,NULL,66839172,"2021-06-29 10:08:00","CC","2019-06-29 10:08:00" 56 | "2019-01-12",2794423,232,"CHARGE",708.00,NULL,"","","USD",708.00,59.00,66415970,"2020-01-12 13:22:00","CC","2019-01-12 13:22:00" 57 | "2019-02-28",2794463,818,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,70971926,"2019-03-28 21:07:00","CC","2019-02-28 22:07:00" 58 | "2019-04-03",2794463,200,"RECURRING",33.60,200,"","","USD",33.60,NULL,70971926,"2019-05-03 10:11:00","CC","2019-04-03 10:11:00" 59 | "2019-03-28",2794463,818,"RECURRING",48.00,818,"","","USD",48.00,0.00,70971926,"2019-04-28 21:07:00","CC","2019-03-28 21:23:00" 60 | "2019-05-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-06-03 10:11:00","CC","2019-05-03 10:34:00" 61 | "2019-07-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-08-03 10:11:00","CC","2019-07-03 10:01:00" 62 | "2019-06-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-07-03 10:11:00","CC","2019-06-03 10:33:00" 63 | "2019-04-03",2794463,200,"CONTRACT_CHANGE",0.00,818,"SUCCESS","","USD",0.00,24.00,70971926,"2019-05-03 10:11:00","CC","2019-04-03 10:11:00" 64 | "2019-01-09",2794383,232,"RECURRING",240.39,232,"","","GBP",300.52,NULL,65941602,"2020-01-09 09:04:00","CC","2019-01-09 09:04:00" 65 | "2019-01-30",2794383,236,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","GBP",0.00,69.00,65941602,"2020-01-30 10:29:00","CC","2019-01-30 10:29:00" 66 | "2019-01-30",2794383,236,"RECURRING",648.72,236,"","","GBP",835.92,NULL,65941602,"2020-01-30 10:29:00","CC","2019-01-30 10:29:00" 67 | "2019-01-07",2794383,231,"CHARGE",375.36,NULL,"","","GBP",468.67,40.00,65941602,"2020-01-07 14:01:00","CC","2019-01-07 14:01:00" 68 | "2019-01-09",2794383,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","GBP",0.00,25.00,65941602,"2020-01-09 09:04:00","CC","2019-01-09 09:04:00" 69 | "2019-07-12",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-12 16:03:00" 70 | "2019-07-07",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. 
Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-07 16:02:00" 71 | "2019-04-17",2794060,200,"PAYMENT_METHOD_UPDATED",0.00,200,"","","",0.00,NULL,66570782,NULL,"CC","2019-04-17 19:40:00" 72 | "2019-04-17",2794060,200,"PAYMENT_METHOD_UPDATED",0.00,200,"","","",0.00,NULL,66570782,NULL,"CC","2019-04-17 18:38:00" 73 | "2019-04-17",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-05-02 15:41:00","CC","2019-04-17 19:40:00" 74 | "2019-07-16",2794060,199,"RECURRING",48.00,199,"","","USD",48.00,NULL,66570782,"2019-08-16 20:15:00","CC","2019-07-16 20:15:00" 75 | "2019-02-02",2794060,200,"RECURRING",53.76,200,"","","USD",53.76,NULL,66570782,"2019-03-02 16:41:00","PAYPAL","2019-02-02 16:41:00" 76 | "2019-01-14",2794060,199,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,66570782,"2019-02-14 15:33:00","PAYPAL","2019-01-14 15:34:00" 77 | "2019-02-02",2794060,200,"CONTRACT_CHANGE",0.00,199,"SUCCESS","","USD",0.00,24.00,66570782,"2019-03-02 16:41:00","PAYPAL","2019-02-02 16:41:00" 78 | "2019-07-03",2794060,232,"CONTRACT_CHANGE",0.00,200,"FAILED","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","PAYPAL","2019-07-03 17:08:00" 79 | "2019-06-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-07-02 15:41:00","CC","2019-06-02 15:34:00" 80 | "2019-04-17",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","This transaction has been declined. Please try a different card or contact the credit card provider for assistance:[Do not Honour] (PV-05)","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","CC","2019-04-17 18:38:00" 81 | "2019-07-16",2794060,199,"CONTRACT_CHANGE",0.00,200,"SUCCESS","","USD",0.00,-24.00,66570782,"2019-08-16 20:15:00","CC","2019-07-16 20:15:00" 82 | "2019-07-16",2794060,200,"RECURRING",24.00,200,"","","USD",24.00,NULL,66570782,"2019-08-16 20:17:00","CC","2019-07-16 20:17:00" 83 | "2019-04-09",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","10417: Transaction cannot complete.","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","PAYPAL","2019-04-09 16:24:00" 84 | "2019-04-17",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","This transaction has been declined. Please try a different card or contact the credit card provider for assistance:[Do not Honour] (PV-05)","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","CC","2019-04-17 19:23:00" 85 | "2019-07-16",2794060,200,"CONTRACT_CHANGE",0.00,199,"SUCCESS","","USD",0.00,24.00,66570782,"2019-08-16 20:17:00","CC","2019-07-16 20:17:00" 86 | "2019-05-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-06-02 15:41:00","CC","2019-05-02 15:34:00" 87 | "2019-07-02",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. 
Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-02 15:03:00" 88 | "2019-04-02",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","10417: Transaction cannot complete.","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","PAYPAL","2019-04-02 15:24:00" 89 | "2019-03-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-04-02 15:41:00","PAYPAL","2019-03-02 16:34:00" 90 | "2019-01-10",2793508,200,"CHARGE",72.00,NULL,"","","USD",72.00,72.00,66280160,"2019-02-10 22:16:00","PAYPAL","2019-01-10 22:16:00" 91 | "2019-02-10",2793508,200,"CANCELLATION",0.00,200,"","","USD",0.00,-72.00,66280160,"2019-02-10 22:16:00","PAYPAL","2019-02-10 22:33:00" 92 | "2019-02-06",2793508,200,"CANCEL_ON_RENEWAL",0.00,200,"","","USD",0.00,NULL,66280160,"2019-02-10 22:16:00","PAYPAL","2019-02-06 11:07:00" 93 | "2019-06-21",2793730,234,"RECURRING",420.00,234,"","","USD",420.00,NULL,66630878,"2020-06-21 10:03:00","CC","2019-06-21 10:03:00" 94 | "2019-06-21",2793730,234,"CONTRACT_CHANGE",0.00,230,"SUCCESS","","USD",0.00,24.00,66630878,"2020-06-21 10:03:00","CC","2019-06-21 10:03:00" 95 | "2019-01-15",2793730,230,"CHARGE",300.00,NULL,"","","USD",300.00,25.00,66630878,"2020-01-15 01:34:00","CC","2019-01-15 01:35:00" 96 | -------------------------------------------------------------------------------- /datalearn19intro/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Shay Palachy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /datalearn19intro/README.rst: -------------------------------------------------------------------------------- 1 | datalearn19intro 2 | ################ 3 | |PyPI-Status| |PyPI-Versions| |LICENCE| 4 | 5 | Helper code for DataLearn 2019 ML Intro Workshop. 6 | 7 | .. code-block:: python 8 | 9 | from datalearn19intro import get_accounts 10 | accounts = get_accounts() 11 | 12 | .. contents:: 13 | 14 | .. section-numbering:: 15 | 16 | Installation 17 | ============ 18 | 19 | Install ``datalearn19intro`` with: 20 | 21 | .. code-block:: bash 22 | 23 | pip install datalearn19intro 24 | 25 | 26 | Credits 27 | ======= 28 | Created by Shay Palachy (shay.palachy@gmail.com). 29 | 30 | .. alternative: 31 | .. https://badge.fury.io/py/yellowbrick.svg 32 | 33 | .. 
|PyPI-Status| image:: https://img.shields.io/pypi/v/datalearn19intro.svg 34 | :target: https://pypi.org/project/datalearn19intro 35 | 36 | .. |PyPI-Versions| image:: https://img.shields.io/pypi/pyversions/datalearn19intro.svg 37 | :target: https://pypi.org/project/datalearn19intro 38 | 39 | .. |LICENCE| image:: https://img.shields.io/badge/License-MIT-yellow.svg 40 | :target: https://pypi.python.org/pypi/datalearn19intro 41 | -------------------------------------------------------------------------------- /datalearn19intro/datalearn19intro/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataloader import ( # noqa: F401 2 | get_accounts, 3 | get_users, 4 | get_events, 5 | get_subscriptions, 6 | get_processed_intro_dataset, 7 | ) 8 | -------------------------------------------------------------------------------- /datalearn19intro/datalearn19intro/dataloader.py: -------------------------------------------------------------------------------- 1 | """Data loading code for DataLearn prep night workshop.""" 2 | 3 | import pip 4 | import subprocess 5 | 6 | import pandas as pd 7 | 8 | try: 9 | import google.colab # noqa: F401 10 | 11 | IN_COLAB = True 12 | except ImportError: 13 | IN_COLAB = False 14 | 15 | 16 | def in_notebook(): 17 | try: 18 | from IPython import get_ipython 19 | 20 | if 'IPKernelApp' not in get_ipython().config: # pragma: no cover 21 | return False 22 | except ImportError: 23 | return False 24 | return True 25 | 26 | 27 | def pipinstall(package): 28 | if hasattr(pip, 'main'): 29 | pip.main(['install', package]) 30 | else: 31 | pip._internal.main(['install', package]) 32 | 33 | 34 | GDRIVE = None 35 | 36 | 37 | def gdrive_authenticate(): 38 | global GDRIVE 39 | if GDRIVE is not None: 40 | return 41 | print('Installing PyDrive...') 42 | subprocess.run(["pip", "install", "-U", "-q", "PyDrive"]) 43 | # pipinstall('PyDrive') 44 | # !pip install -U -q PyDrive 45 | from pydrive.auth import GoogleAuth 46 | from pydrive.drive import GoogleDrive 47 | from google.colab import auth 48 | from oauth2client.client import GoogleCredentials 49 | 50 | # Authenticate and create the PyDrive client.GDRIVE_AUTHENICATED# This only 51 | # needs to be done once per notebook. 
52 | print('Authenticating with Google Drive...') 53 | auth.authenticate_user() 54 | gauth = GoogleAuth() 55 | gauth.credentials = GoogleCredentials.get_application_default() 56 | GDRIVE = GoogleDrive(gauth) 57 | 58 | 59 | def _get_file(fname, id): 60 | if IN_COLAB: 61 | gdrive_authenticate() 62 | # you can see it with "get sherable link" 63 | print("Downloading {} from Google Drive...".format(fname)) 64 | downloaded = GDRIVE.CreateFile({'id': id}) 65 | downloaded.GetContentFile(fname) 66 | print("Done.") 67 | return pd.read_csv(fname) 68 | else: 69 | return pd.read_csv('data/{}'.format(fname)) 70 | 71 | 72 | def get_accounts(): 73 | return _get_file('accounts.csv', '1SFFGL_FIq3-l6CP9MTe9ueuLRMz_tvrw') 74 | 75 | 76 | def get_users(): 77 | return _get_file('users.csv', '1fG6ebyTaWWOVRFHw9svNjgJLYdUcu5th') 78 | 79 | 80 | def get_events(): 81 | return _get_file( 82 | 'Dynamic events table.csv', '1Gv0Z_IJ1kBwuUnPDkpgFM8mK1dGeTNi4') 83 | 84 | 85 | def get_subscriptions(): 86 | return _get_file( 87 | 'Dynamic subscription table.csv', '1qC0VOpUkZo4O4lggzp45YcNxC7NXY4VV') 88 | 89 | 90 | def get_processed_intro_dataset(): 91 | return _get_file( 92 | 'monday_datalearn.csv', '1W2D192QF_LIixPws1mj57C6OBNSxILFI') 93 | -------------------------------------------------------------------------------- /datalearn19intro/mit_license_badge.svg: -------------------------------------------------------------------------------- 1 | licenselicenseMITMIT -------------------------------------------------------------------------------- /datalearn19intro/setup.py: -------------------------------------------------------------------------------- 1 | """Setup for the datalearn19intro package.""" 2 | 3 | # !/usr/bin/env python 4 | # -*- coding: utf-8 -*- 5 | 6 | import setuptools 7 | 8 | 9 | INSTALL_REQUIRES = [ 10 | 'numpy', 11 | 'pandas' 12 | ] 13 | 14 | with open('README.rst') as f: 15 | README = f.read() 16 | 17 | setuptools.setup( 18 | author="Shay Palachy", 19 | author_email="shay.palachy@gmail.com", 20 | name='datalearn19intro', 21 | license="MIT", 22 | description='Helper code for DataLearn 2019 ML Intro Workshop.', 23 | version='v0.0.5', 24 | # cmdclass=versioneer.get_cmdclass(), 25 | long_description=README, 26 | url='https://github.com/DataHackIL/DataLearn-ML-Intro-2019', 27 | packages=setuptools.find_packages(), 28 | include_package_data=True, 29 | python_requires=">=3.5", 30 | install_requires=INSTALL_REQUIRES, 31 | # extras_require={ 32 | # 'test': TEST_REQUIRES + INSTALL_REQUIRES, 33 | # }, 34 | classifiers=[ 35 | # Trove classifiers 36 | # (https://pypi.python.org/pypi?%3Aaction=list_classifiers) 37 | 'Development Status :: 4 - Beta', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Programming Language :: Python', 40 | 'Programming Language :: Python :: 3.5', 41 | 'Programming Language :: Python :: 3.6', 42 | 'Programming Language :: Python :: 3.7', 43 | 'Topic :: Software Development :: Libraries', 44 | 'Topic :: Software Development :: Libraries :: Python Modules', 45 | 'Intended Audience :: Developers', 46 | ], 47 | ) 48 | -------------------------------------------------------------------------------- /part_1.introducing_jupyter.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.5"},"colab":{"name":"part_1.introducing_jupyter.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"markdown","metadata":{"id":"dXQRvMNhxdND","colab_type":"text"},"source":["# Part 1: Introducing the Jupyter Notebook"]},{"cell_type":"markdown","metadata":{"id":"Vdp7sdXHxdNF","colab_type":"text"},"source":["## What is the Jupyter Notebook?"]},{"cell_type":"markdown","metadata":{"id":"XxajIhHfxdNG","colab_type":"text"},"source":["The Jupyter Notebook is an interactive computing environment that enables users to author notebook documents that include:\n","\n","* Live code\n","* Interactive widgets\n","* Plots\n","* Narrative text\n","* Equations\n","* Images\n","* Video\n","\n","These documents provide a complete and self-contained record of a computation that can be converted to various formats and shared with others using email, Dropbox, version control systems (like git/GitHub) or nbviewer.jupyter.org."]},{"cell_type":"markdown","metadata":{"id":"FraUpL-JxdNH","colab_type":"text"},"source":["## Components\n","The Jupyter Notebook combines three components:\n","\n","* **The notebook web application**: An interactive web application for writing and running code interactively and authoring notebook documents.\n","* **Kernels**: Separate processes started by the notebook web application that runs users' code in a given language and returns output back to the notebook web application. The kernel also handles things like computations for interactive widgets, tab completion and introspection.\n","* **Notebook documents**: Self-contained documents that contain a representation of all content visible in the notebook web application, including inputs and outputs of the computations, narrative text, equations, images, and rich media representations of objects. Each notebook document has its own kernel.\n","\n","This enables the user to both **edit** and **run** code in the browser."]},{"cell_type":"markdown","metadata":{"id":"8cfyGFfPxdNI","colab_type":"text"},"source":["## Kernels\n","Through Jupyter's kernel and messaging architecture, the Notebook allows code to be run in a range of different programming languages. For each notebook document that a user opens, the web application starts a kernel that runs the code for that notebook. Each kernel is capable of running code in a single programming language and there are kernels available in the following languages:\n","\n","* Python (https://github.com/ipython/ipython)\n","* Julia (https://github.com/JuliaLang/IJulia.jl)\n","* R (https://github.com/takluyver/IRkernel)\n","* Ruby (https://github.com/minrk/iruby)\n","* Haskell (https://github.com/gibiansky/IHaskell)\n","* Scala (https://github.com/Bridgewater/scala-notebook)\n","* node.js (https://gist.github.com/Carreau/4279371)\n","* Go (https://github.com/takluyver/igo)\n","\n","The default kernel runs Python code. The notebook provides a simple way for users to pick which of these kernels is used for a given notebook."]},{"cell_type":"markdown","metadata":{"id":"D5Q2fTa5xdNJ","colab_type":"text"},"source":["## Notebook cells"]},{"cell_type":"markdown","metadata":{"id":"s-9D3V3hxdNK","colab_type":"text"},"source":["A Jupyter notebook is made up of consecutive cells. 
There are two basic types of cells:\n","* Markdown cells\n","* Code cells"]},{"cell_type":"markdown","metadata":{"id":"xF9L_V12xdNK","colab_type":"text"},"source":["You can turn a cell into a markdown cell by pressing `m` when it is selected (but the cursor is **not** inside it). To turn it to a code cell use `y`."]},{"cell_type":"markdown","metadata":{"id":"2a6sSfdkxdNL","colab_type":"text"},"source":["### More commands:\n","* 'Escape' outside of a cell using `Esc`.\n","* Edit the current cell using `Enter`.\n","* Execute a cell using `shift`+`Enter`.\n","\n","In Jupyter Notebooks (but not on *colab*):\n","* Add a cell above the current cell using `a`.\n","* Add a cell below the current cell using `b`.\n","* Copy the current cell using `c`.\n","* Delete the current cell using `dd`."]},{"cell_type":"markdown","metadata":{"id":"FIx3pfbtxdNL","colab_type":"text"},"source":["## Markdown Cells\n","\n","This cell is a markdown cell. This means you can use markdown to write *italic text* by surrounding your text with *asterisks* or _underscores_.\n","Strong emphasis, aka bold, with **asterisks** or __underscores__.\n","Combined emphasis with **asterisks and _underscores_**.\n","Strikethrough uses two tildes. ~~Scratch this.~~\n","\n","Here be headers:\n","# H1\n","## H2\n","### etc...\n","\n","You can create bulleted lists using\n","* Asterisks\n","- Or minuses\n","+ Or pluses\n","\n","### Ordered lists\n","1. First ordered list item\n","2. Another item\n"," * Unordered sub-list. \n","1. Actual numbers don't matter, just that it's a number\n"," 1. Ordered sub-list.\n"," \n"," An un-numbered indented paragraph.\n","4. And another item.\n","\n","[And of course add links](https://www.google.com)\n","\n","You can find a nice full markdown cheatsheet here: https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet"]},{"cell_type":"markdown","metadata":{"id":"XIcvwtmyxdNM","colab_type":"text"},"source":["

**NOTICE**\n","\n","HTML tags can also be used inside markdown cells.
\n",""]},{"cell_type":"markdown","metadata":{"id":"nGFegaYLxdNN","colab_type":"text"},"source":["## Code Cells"]},{"cell_type":"code","metadata":{"id":"c3DJV5VrxdNN","colab_type":"code","colab":{}},"source":["# this is a code cell\n","a = 5"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"11R4NJYYxdNR","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"9c4c7195-ac75-4e6e-9649-b7c622decabe","executionInfo":{"status":"ok","timestamp":1565864335301,"user_tz":-180,"elapsed":531,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a line containing only an expression will print it\n","a"],"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":4}]},{"cell_type":"code","metadata":{"id":"Z0EcqEDvxdNU","colab_type":"code","colab":{}},"source":["# unless it ends with a semicolon\n","a;"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"2G5MWI9MxdNW","colab_type":"code","colab":{}},"source":["# you can also define functions\n","def foo(a, b):\n"," \"\"\"Foo documentation.\"\"\"\n"," return a*5 + 2/b"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"0bhzFsNFxdNY","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"5efcad27-e813-4c75-c1b3-3f6fbbb3782f","executionInfo":{"status":"ok","timestamp":1565864343265,"user_tz":-180,"elapsed":745,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# and then use them!\n","foo(5,8)"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["25.25"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"nbpbD7pxxdNb","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"a3da4249-5f4c-4990-d70a-837ad0b2f406","executionInfo":{"status":"ok","timestamp":1565864346647,"user_tz":-180,"elapsed":535,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# you can also use IPython magic functions!\n","%time numbers = [x for x in range(1000000)]"],"execution_count":8,"outputs":[{"output_type":"stream","text":["CPU times: user 47.6 ms, sys: 45.9 ms, total: 93.5 ms\n","Wall time: 102 ms\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"ppBPkqUFxdNd","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"1e3ba24e-5252-4abd-85e4-965918cd6b42","executionInfo":{"status":"ok","timestamp":1565864352232,"user_tz":-180,"elapsed":3315,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["%timeit numbers = [x for x in range(100000)]"],"execution_count":9,"outputs":[{"output_type":"stream","text":["100 loops, best of 3: 6.29 ms per loop\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"QCf-fiySxdNf","colab_type":"text"},"source":["It is also easy to display plots inside a Jupyter notebook. 
We'll see this later."]},{"cell_type":"code","metadata":{"id":"uCjbABubxdNg","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /part_2.numpy.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.5"},"colab":{"name":"part_2.numpy.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":["RwPgHgFP4H1l","CKd7UT3F4H20","Z4ZVTQa84H3p","6sPYGkYt4H3z","0onFRxLN4H4B","NU_ZudOj4H4e","8qWuhJ7E4H4v","WFkuwnxB4H5z","CNZcnWVi4H6R","R4QtrGfi4H6h","mgBbOZ2H4H6k","dPDTaPLn4H6m","hdcCMF5B4H6u"]}},"cells":[{"cell_type":"markdown","metadata":{"id":"Yai-UYMg4H1a","colab_type":"text"},"source":["# Numpy - multidimensional data arrays"]},{"cell_type":"markdown","metadata":{"id":"zhlhZMYL4H1f","colab_type":"text"},"source":["Based on J.R. Johansson's notebook (jrjohansson at gmail.com)"]},{"cell_type":"markdown","metadata":{"id":"RwPgHgFP4H1l","colab_type":"text"},"source":["## Introduction"]},{"cell_type":"markdown","metadata":{"id":"Ci0r993d4H1p","colab_type":"text"},"source":["The `numpy` package (module) is used in almost all numerical computation using Python. It is a package that provide high-performance vector, matrix and higher-dimensional data structures for Python. It is implemented in C and Fortran so when calculations are vectorized (formulated with vectors and matrices), performance is very good. \n","\n","To use `numpy` you need to import the module, using for example:"]},{"cell_type":"code","metadata":{"id":"OvvZa0Z44H1q","colab_type":"code","colab":{}},"source":["from numpy import *"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UAaDil2O4H1u","colab_type":"text"},"source":["In the `numpy` package the terminology used for vectors, matrices and higher-dimensional data sets is *array*. 
\n","\n"]},{"cell_type":"markdown","metadata":{"id":"aLAmXzIW4H2o","colab_type":"text"},"source":["## Creating `numpy` arrays"]},{"cell_type":"markdown","metadata":{"id":"f0RWAhOb4H2p","colab_type":"text"},"source":["There are a number of ways to initialize new numpy arrays, for example from\n","\n","* a Python list or tuples\n","* using functions that are dedicated to generating numpy arrays, such as `arange`, `linspace`, etc.\n","* reading data from files"]},{"cell_type":"markdown","metadata":{"id":"CKd7UT3F4H20","colab_type":"text"},"source":["### From lists"]},{"cell_type":"markdown","metadata":{"id":"ivCyIJYi4H23","colab_type":"text"},"source":["For example, to create new vector and matrix arrays from Python lists we can use the `numpy.array` function."]},{"cell_type":"code","metadata":{"id":"cfeoOXX94H24","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"85bca577-5bfa-472e-db58-f9ad3a139b50","executionInfo":{"status":"ok","timestamp":1565877235016,"user_tz":-180,"elapsed":1031,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a vector: the argument to the array function is a Python list\n","v = array([1,2,3,4])\n","\n","v"],"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1, 2, 3, 4])"]},"metadata":{"tags":[]},"execution_count":5}]},{"cell_type":"code","metadata":{"id":"ks3LWXpq4H28","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":50},"outputId":"f47f7217-92f4-49c1-ee89-29b1aacdc600","executionInfo":{"status":"ok","timestamp":1565877235464,"user_tz":-180,"elapsed":1369,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a matrix: the argument to the array function is a nested Python list\n","M = array([[1, 2], [3, 4]])\n","\n","M"],"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1, 2],\n"," [3, 4]])"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"uub0dXTZ4H3A","colab_type":"text"},"source":["The `v` and `M` objects are both of the type `ndarray` that the `numpy` module provides."]},{"cell_type":"code","metadata":{"id":"zkxf0n4a4H3A","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"3261d26f-670c-41d5-9e26-cb6f510a8254","executionInfo":{"status":"ok","timestamp":1565877236184,"user_tz":-180,"elapsed":540,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["type(v), type(M)"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(numpy.ndarray, numpy.ndarray)"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"markdown","metadata":{"id":"2YbxEAgV4H3C","colab_type":"text"},"source":["The difference between the `v` and `M` arrays is only their shapes. 
We can get information about the shape of an array by using the `ndarray.shape` property."]},{"cell_type":"code","metadata":{"id":"Fz7mVTEL4H3E","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"82cafd58-eb8f-4184-a7bf-768ff3a23889","executionInfo":{"status":"ok","timestamp":1565877289354,"user_tz":-180,"elapsed":1019,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["v.shape"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(4,)"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"id":"QeGbgpLb4H3H","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"3c6874ef-e927-403a-b5c2-3a094ca7156b","executionInfo":{"status":"ok","timestamp":1565877295892,"user_tz":-180,"elapsed":936,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.shape"],"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(2, 2)"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"markdown","metadata":{"id":"BmW4mybb4H3K","colab_type":"text"},"source":["The number of elements in the array is available through the `ndarray.size` property:"]},{"cell_type":"code","metadata":{"id":"pYoUPA_A4H3N","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"73705664-b047-46ee-dacb-dd5a53da8a2d","executionInfo":{"status":"ok","timestamp":1565877302429,"user_tz":-180,"elapsed":1236,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.size"],"execution_count":10,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":10}]},{"cell_type":"markdown","metadata":{"id":"lbmKpqa64H3S","colab_type":"text"},"source":["Equivalently, we could use the function `numpy.shape` and `numpy.size`"]},{"cell_type":"code","metadata":{"id":"va2u6UBS4H3T","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"c7626ae7-7abd-4857-eb6a-5f092321fa7e","executionInfo":{"status":"ok","timestamp":1565877305074,"user_tz":-180,"elapsed":819,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["shape(M)"],"execution_count":11,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(2, 2)"]},"metadata":{"tags":[]},"execution_count":11}]},{"cell_type":"code","metadata":{"id":"6Nt1VX094H3a","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e1d574bc-e5fd-42f7-8a12-3d857957cc11","executionInfo":{"status":"ok","timestamp":1565877306018,"user_tz":-180,"elapsed":525,"user":{"displayName":"Shay 
Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["size(M)"],"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":12}]},{"cell_type":"markdown","metadata":{"id":"8uURf4TK4H3c","colab_type":"text"},"source":["So far the `numpy.ndarray` looks awefully much like a Python list (or nested list). Why not simply use Python lists for computations instead of creating a new array type? \n","\n","There are several reasons:\n","\n","* Python lists are very general. They can contain any kind of object. They are dynamically typed. They do not support mathematical functions such as matrix and dot multiplications, etc. Implementing such functions for Python lists would not be very efficient because of the dynamic typing.\n","* Numpy arrays are **statically typed** and **homogeneous**. The type of the elements is determined when the array is created.\n","* Numpy arrays are memory efficient.\n","* Because of the static typing, fast implementation of mathematical functions such as multiplication and addition of `numpy` arrays can be implemented in a compiled language (C and Fortran is used).\n","\n","Using the `dtype` (data type) property of an `ndarray`, we can see what type the data of an array has:"]},{"cell_type":"code","metadata":{"id":"H1ny0dwq4H3d","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e5948335-7540-4b99-a3e7-354f4de59da8","executionInfo":{"status":"ok","timestamp":1565877336197,"user_tz":-180,"elapsed":556,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.dtype"],"execution_count":13,"outputs":[{"output_type":"execute_result","data":{"text/plain":["dtype('int64')"]},"metadata":{"tags":[]},"execution_count":13}]},{"cell_type":"markdown","metadata":{"id":"mOQjfIdD4H3g","colab_type":"text"},"source":["We get an error if we try to assign a value of the wrong type to an element in a numpy array:"]},{"cell_type":"code","metadata":{"id":"HQ0ySny04H3g","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":162},"outputId":"80cf5e53-3fcc-444c-f08d-fc35e8bf7b88","executionInfo":{"status":"error","timestamp":1565877346786,"user_tz":-180,"elapsed":561,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M[0,0] = \"hello\""],"execution_count":14,"outputs":[{"output_type":"error","ename":"ValueError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mM\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'hello'"]}]},{"cell_type":"markdown","metadata":{"id":"NxoE_aas4H3i","colab_type":"text"},"source":["If we want, we can explicitly define the type of the array data when we 
create it, using the `dtype` keyword argument: "]},{"cell_type":"code","metadata":{"id":"tcoqshv34H3j","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":50},"outputId":"53d4f229-57ac-493c-ff77-acc9f1b30cbc","executionInfo":{"status":"ok","timestamp":1565877353544,"user_tz":-180,"elapsed":800,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M = array([[1, 2], [3, 4]], dtype=complex)\n","\n","M"],"execution_count":15,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1.+0.j, 2.+0.j],\n"," [3.+0.j, 4.+0.j]])"]},"metadata":{"tags":[]},"execution_count":15}]},{"cell_type":"markdown","metadata":{"id":"71v9E4m84H3o","colab_type":"text"},"source":["Common data types that can be used with `dtype` are: `int`, `float`, `complex`, `bool`, `object`, etc.\n","\n","We can also explicitly define the bit size of the data types, for example: `int64`, `int16`, `float128`, `complex128`."]},{"cell_type":"markdown","metadata":{"id":"Z4ZVTQa84H3p","colab_type":"text"},"source":["### Using array-generating functions"]},{"cell_type":"markdown","metadata":{"id":"aYFas-Rn4H3p","colab_type":"text"},"source":["For larger arrays it is inpractical to initialize the data manually, using explicit python lists. Instead we can use one of the many functions in `numpy` that generate arrays of different forms. Some of the more common are:"]},{"cell_type":"code","metadata":{"id":"irdANGOL4H3q","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"549095e2-3930-456e-fd1d-f3a7ebc9e946","executionInfo":{"status":"ok","timestamp":1565877387626,"user_tz":-180,"elapsed":951,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# create a range\n","\n","x = arange(0, 10, 1) # arguments: start, stop, step\n","\n","x"],"execution_count":16,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"]},"metadata":{"tags":[]},"execution_count":16}]},{"cell_type":"code","metadata":{"id":"Sr-MPcyK4H3s","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":101},"outputId":"bdd6d5cc-d486-4923-e3cf-7373591297da","executionInfo":{"status":"ok","timestamp":1565877387628,"user_tz":-180,"elapsed":552,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["x = arange(-1, 1, 0.1)\n","\n","x"],"execution_count":17,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([-1.00000000e+00, -9.00000000e-01, -8.00000000e-01, -7.00000000e-01,\n"," -6.00000000e-01, -5.00000000e-01, -4.00000000e-01, -3.00000000e-01,\n"," -2.00000000e-01, -1.00000000e-01, -2.22044605e-16, 1.00000000e-01,\n"," 2.00000000e-01, 3.00000000e-01, 4.00000000e-01, 5.00000000e-01,\n"," 6.00000000e-01, 7.00000000e-01, 8.00000000e-01, 9.00000000e-01])"]},"metadata":{"tags":[]},"execution_count":17}]},{"cell_type":"code","metadata":{"scrolled":true,"id":"xXZsubZo4H3v","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":101},"outputId":"421b1c6d-e865-437b-f68a-a6a5e15f46e2","executionInfo":{"status":"ok","timestamp":1565877391062,"user_tz":-180,"elapsed":860,"user":{"displayName":"Shay 
Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# using linspace, both end points ARE included\n","linspace(0, 10, 25)"],"execution_count":18,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0. , 0.41666667, 0.83333333, 1.25 , 1.66666667,\n"," 2.08333333, 2.5 , 2.91666667, 3.33333333, 3.75 ,\n"," 4.16666667, 4.58333333, 5. , 5.41666667, 5.83333333,\n"," 6.25 , 6.66666667, 7.08333333, 7.5 , 7.91666667,\n"," 8.33333333, 8.75 , 9.16666667, 9.58333333, 10. ])"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"6sPYGkYt4H3z","colab_type":"text"},"source":["#### mgrid"]},{"cell_type":"code","metadata":{"id":"LTtylXc24H34","colab_type":"code","colab":{}},"source":["x, y = mgrid[0:5, 0:5] # similar to meshgrid in MATLAB"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"CSsUMPEr4H37","colab_type":"code","colab":{},"outputId":"469c0bf4-a5f2-4a0c-8638-7ac71c62539c"},"source":["x"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[0, 0, 0, 0, 0],\n"," [1, 1, 1, 1, 1],\n"," [2, 2, 2, 2, 2],\n"," [3, 3, 3, 3, 3],\n"," [4, 4, 4, 4, 4]])"]},"metadata":{"tags":[]},"execution_count":19}]},{"cell_type":"code","metadata":{"id":"KWaBRngd4H3-","colab_type":"code","colab":{},"outputId":"71ba3bbd-ddff-4392-a4c0-d3abc5838b09"},"source":["y"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4]])"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"0onFRxLN4H4B","colab_type":"text"},"source":["#### random data"]},{"cell_type":"code","metadata":{"id":"KbOBN6R34H4D","colab_type":"code","colab":{}},"source":["from numpy import random"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ek8AG7Br4H4Q","colab_type":"code","colab":{},"outputId":"9f97fd7e-e017-4fbd-8ef8-ee318ca7916d"},"source":["# uniform random numbers in [0,1]\n","random.rand(5,5)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.92932506, 0.19684255, 0.736434 , 0.18125714, 0.70905038],\n"," [ 0.18803573, 0.9312815 , 0.1284532 , 0.38138008, 0.36646481],\n"," [ 0.53700462, 0.02361381, 0.97760688, 0.73296701, 0.23042324],\n"," [ 0.9024635 , 0.20860922, 0.67729644, 0.68386687, 0.49385729],\n"," [ 0.95876515, 0.29341553, 0.37520629, 0.29194432, 0.64102804]])"]},"metadata":{"tags":[]},"execution_count":22}]},{"cell_type":"code","metadata":{"id":"3Qb6y0hv4H4a","colab_type":"code","colab":{},"outputId":"f8bff034-1d25-42d5-e1ac-d5b1a9c95c8c"},"source":["# standard normal distributed random numbers\n","random.randn(5,5)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.117907 , -1.57016164, 0.78256246, 1.45386709, 0.54744436],\n"," [ 2.30356897, -0.28352021, -0.9087325 , 1.2285279 , -1.00760167],\n"," [ 0.72216801, 0.77507299, -0.37793178, -0.31852241, 0.84493629],\n"," [-0.10682252, 1.15930142, -0.47291444, -0.69496967, -0.58912034],\n"," [ 0.34513487, -0.92389516, -0.216978 , 0.42153272, 0.86650101]])"]},"metadata":{"tags":[]},"execution_count":23}]},{"cell_type":"markdown","metadata":{"id":"NU_ZudOj4H4e","colab_type":"text"},"source":["#### zeros and 
ones"]},{"cell_type":"code","metadata":{"id":"MC5Qqrih4H4g","colab_type":"code","colab":{},"outputId":"2da7218a-809e-4151-a584-1c10a7311e8d"},"source":["zeros((3,3))"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0., 0., 0.],\n"," [ 0., 0., 0.],\n"," [ 0., 0., 0.]])"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"code","metadata":{"id":"q8-TzdJ34H4r","colab_type":"code","colab":{},"outputId":"dd3cf0f6-d086-491b-8ac0-acd1ead5ce42"},"source":["ones((3,3))"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1., 1., 1.],\n"," [ 1., 1., 1.],\n"," [ 1., 1., 1.]])"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"markdown","metadata":{"id":"8qWuhJ7E4H4v","colab_type":"text"},"source":["## More properties of numpy arrays"]},{"cell_type":"code","metadata":{"id":"rcREVjoG4H4w","colab_type":"code","colab":{},"outputId":"7df199df-f2d1-4a27-9287-05b427e50cc7"},"source":["M.itemsize # bytes per element"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["8"]},"metadata":{"tags":[]},"execution_count":38}]},{"cell_type":"code","metadata":{"id":"o4LXNp644H40","colab_type":"code","colab":{},"outputId":"22b37e4c-d92b-4ed9-d901-5246282f41a7"},"source":["M.nbytes # number of bytes"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["72"]},"metadata":{"tags":[]},"execution_count":39}]},{"cell_type":"code","metadata":{"id":"WEPR97Rx4H49","colab_type":"code","colab":{},"outputId":"09bd572f-e6e6-45af-d7b1-2f519f0b5776"},"source":["M.ndim # number of dimensions"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["2"]},"metadata":{"tags":[]},"execution_count":40}]},{"cell_type":"markdown","metadata":{"id":"ZE5H_AbI4H4_","colab_type":"text"},"source":["## Manipulating arrays"]},{"cell_type":"markdown","metadata":{"id":"Gn5WEg754H5A","colab_type":"text"},"source":["### Indexing"]},{"cell_type":"markdown","metadata":{"id":"xZxt-9h44H5A","colab_type":"text"},"source":["We can index elements in an array using square brackets and indices:"]},{"cell_type":"code","metadata":{"id":"Ry0do-4K4H5B","colab_type":"code","colab":{},"outputId":"809671ed-3317-433f-dbbf-a82c584d7d0e"},"source":["# v is a vector, and has only one dimension, taking one index\n","v[0]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1"]},"metadata":{"tags":[]},"execution_count":41}]},{"cell_type":"code","metadata":{"id":"SeXl-uSv4H5D","colab_type":"code","colab":{},"outputId":"86699f4f-7c80-4e0b-d91f-320f16efecea"},"source":["# M is a matrix, or a 2 dimensional array, taking two indices \n","M[1,1]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.47913739949636192"]},"metadata":{"tags":[]},"execution_count":42}]},{"cell_type":"markdown","metadata":{"id":"UIS_PIPQ4H5F","colab_type":"text"},"source":["If we omit an index of a multidimensional array it returns the whole row (or, in general, a N-1 dimensional array) "]},{"cell_type":"code","metadata":{"id":"wmTMc0ia4H5G","colab_type":"code","colab":{},"outputId":"09d6e405-e1d6-441e-ba9b-52218318032b"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.77872576, 0.40043577, 0.66254019],\n"," [ 0.60410063, 0.4791374 , 0.8237106 ],\n"," [ 0.96856318, 0.15459644, 
0.96082399]])"]},"metadata":{"tags":[]},"execution_count":43}]},{"cell_type":"code","metadata":{"id":"hrO_oej84H5N","colab_type":"code","colab":{},"outputId":"eba958b0-6259-481b-9bf7-6f06c9fd30db"},"source":["M[1]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.60410063, 0.4791374 , 0.8237106 ])"]},"metadata":{"tags":[]},"execution_count":44}]},{"cell_type":"markdown","metadata":{"id":"hyj1kRXP4H5P","colab_type":"text"},"source":["The same thing can be achieved with using `:` instead of an index: "]},{"cell_type":"code","metadata":{"id":"_RhaEcYM4H5Q","colab_type":"code","colab":{},"outputId":"79daddb0-ca06-40c4-c734-602f7971e5e6"},"source":["M[1,:] # row 1"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.60410063, 0.4791374 , 0.8237106 ])"]},"metadata":{"tags":[]},"execution_count":45}]},{"cell_type":"code","metadata":{"id":"H3DX8HsU4H5T","colab_type":"code","colab":{},"outputId":"bb107fd0-f7ee-4748-fee6-7c179a2d5151"},"source":["M[:,1] # column 1"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.40043577, 0.4791374 , 0.15459644])"]},"metadata":{"tags":[]},"execution_count":46}]},{"cell_type":"markdown","metadata":{"id":"YVOTbsDS4H5W","colab_type":"text"},"source":["We can assign new values to elements in an array using indexing:"]},{"cell_type":"code","metadata":{"id":"AiHZCF_q4H5X","colab_type":"code","colab":{}},"source":["M[0,0] = 1"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"gdRL6hoq4H5f","colab_type":"code","colab":{},"outputId":"f3534e44-0686-460e-dfdb-82fd009314c0"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1. , 0.40043577, 0.66254019],\n"," [ 0.60410063, 0.4791374 , 0.8237106 ],\n"," [ 0.96856318, 0.15459644, 0.96082399]])"]},"metadata":{"tags":[]},"execution_count":48}]},{"cell_type":"code","metadata":{"id":"-CkU0him4H5l","colab_type":"code","colab":{}},"source":["# also works for rows and columns\n","M[1,:] = 0\n","M[:,2] = -1"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBRT1tyZ4H5x","colab_type":"code","colab":{},"outputId":"f47a6691-2251-4a73-f3ca-b2fbbf08022e"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1. , 0.40043577, -1. ],\n"," [ 0. , 0. , -1. ],\n"," [ 0.96856318, 0.15459644, -1. 
]])"]},"metadata":{"tags":[]},"execution_count":50}]},{"cell_type":"markdown","metadata":{"id":"WFkuwnxB4H5z","colab_type":"text"},"source":["### Index slicing"]},{"cell_type":"markdown","metadata":{"id":"dFRAptaD4H50","colab_type":"text"},"source":["Index slicing is the technical name for the syntax `M[lower:upper:step]` to extract part of an array:"]},{"cell_type":"code","metadata":{"id":"9wz8jo3Z4H51","colab_type":"code","colab":{},"outputId":"02e30d19-c5a6-4059-b660-536c28a72141"},"source":["A = array([1,2,3,4,5])\n","A"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1, 2, 3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":51}]},{"cell_type":"code","metadata":{"id":"nn08qKFJ4H54","colab_type":"code","colab":{},"outputId":"3253b970-9460-4623-b23c-ee3779336e36"},"source":["A[1:3]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([2, 3])"]},"metadata":{"tags":[]},"execution_count":52}]},{"cell_type":"markdown","metadata":{"id":"9HANOCRZ4H5-","colab_type":"text"},"source":["Array slices are *mutable*: if they are assigned a new value the original array from which the slice was extracted is modified:"]},{"cell_type":"code","metadata":{"id":"OgYbFcS64H5_","colab_type":"code","colab":{},"outputId":"bb43f189-fd8c-432a-9810-810de1dc66df"},"source":["A[1:3] = [-2,-3]\n","\n","A"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 1, -2, -3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":53}]},{"cell_type":"markdown","metadata":{"id":"jNczBl2X4H6E","colab_type":"text"},"source":["Negative indices counts from the end of the array (positive index from the begining):"]},{"cell_type":"code","metadata":{"id":"Mgur3nS94H6F","colab_type":"code","colab":{}},"source":["A = array([1,2,3,4,5])"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ovmQzTPT4H6I","colab_type":"code","colab":{},"outputId":"61419e48-b0d2-4ebe-8849-61b1efca5b38"},"source":["A[-1] # the last element in the array"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":59}]},{"cell_type":"code","metadata":{"id":"ZVHNR1Up4H6O","colab_type":"code","colab":{},"outputId":"bb0ef677-294c-4760-9bff-43567b1f09d8"},"source":["A[-3:] # the last three elements"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":60}]},{"cell_type":"markdown","metadata":{"id":"CNZcnWVi4H6R","colab_type":"text"},"source":["### Fancy indexing"]},{"cell_type":"markdown","metadata":{"id":"xDsu0l564H6S","colab_type":"text"},"source":["Fancy indexing is the name for when an array or list is used in-place of an index: "]},{"cell_type":"code","metadata":{"id":"GgHGCVGp4H6S","colab_type":"code","colab":{},"outputId":"531f2545-88a5-4078-b4e3-fcfc72f1f2a9"},"source":["row_indices = [1, 2, 3]\n","A[row_indices]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[10, 11, 12, 13, 14],\n"," [20, 21, 22, 23, 24],\n"," [30, 31, 32, 33, 34]])"]},"metadata":{"tags":[]},"execution_count":64}]},{"cell_type":"code","metadata":{"id":"0Ol2NX0j4H6W","colab_type":"code","colab":{},"outputId":"f484a32e-826e-4c7b-982c-59dd7700e14f"},"source":["col_indices = [1, 2, -1] # remember, index -1 means the last element\n","A[row_indices, 
col_indices]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([11, 22, 34])"]},"metadata":{"tags":[]},"execution_count":65}]},{"cell_type":"markdown","metadata":{"id":"sYOLqh5s4H6Y","colab_type":"text"},"source":["### Linear and Matrix algebra"]},{"cell_type":"markdown","metadata":{"id":"cE8dK2DK4H6Z","colab_type":"text"},"source":["Numpy's real strength is in optimized linear and matrix algebric operations on vectors and matrices, but that's less relevant here."]},{"cell_type":"markdown","metadata":{"id":"0UsCmaIm4H6Z","colab_type":"text"},"source":["### Data processing"]},{"cell_type":"markdown","metadata":{"id":"NXPyElfW4H6a","colab_type":"text"},"source":["Often it is useful to store datasets in Numpy arrays. Numpy provides a number of functions to calculate statistics of datasets in arrays. \n","\n","For example, let's calculate some properties from the Stockholm temperature dataset used above."]},{"cell_type":"code","metadata":{"id":"k4KNJQvy4H6b","colab_type":"code","colab":{},"outputId":"7b6235eb-1446-44bc-da3a-678fe4780f24"},"source":["# reminder, the tempeature dataset is stored in the data variable:\n","data = random.randint(10,size=(8,8))\n","shape(data)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(8, 8)"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"R4QtrGfi4H6h","colab_type":"text"},"source":["#### mean"]},{"cell_type":"code","metadata":{"id":"px1LjwPK4H6h","colab_type":"code","colab":{},"outputId":"2ab080ba-418a-49f5-c780-a5649e405c72"},"source":["mean(data[:,3])"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5.25"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"mgBbOZ2H4H6k","colab_type":"text"},"source":["#### standard deviations and variance"]},{"cell_type":"code","metadata":{"id":"KNsHBTsm4H6l","colab_type":"code","colab":{},"outputId":"1cedb05f-e5b4-405f-f902-a39370b87bae"},"source":["std(data[:,3]), var(data[:,3])"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(1.6393596310755001, 2.6875)"]},"metadata":{"tags":[]},"execution_count":21}]},{"cell_type":"markdown","metadata":{"id":"dPDTaPLn4H6m","colab_type":"text"},"source":["#### min and max"]},{"cell_type":"code","metadata":{"id":"CoF7hVsZ4H6o","colab_type":"code","colab":{},"outputId":"c74bab1a-18c7-4d91-ccf4-6a8db61c653d"},"source":["data[:,3].min()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":24}]},{"cell_type":"code","metadata":{"id":"_UMY69jr4H6q","colab_type":"code","colab":{},"outputId":"720b0a96-fa4d-4cd3-84ea-284c91cfb802"},"source":["data[:,3].max()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["9"]},"metadata":{"tags":[]},"execution_count":25}]},{"cell_type":"markdown","metadata":{"id":"hdcCMF5B4H6u","colab_type":"text"},"source":["#### sum, prod, and their cumulative versions"]},{"cell_type":"code","metadata":{"id":"x5sBrhVU4H6x","colab_type":"code","colab":{},"outputId":"1eec9c5b-0005-40e6-f8bd-b2e8e8269e15"},"source":["d = arange(0, 10)\n","d"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 1, 2, 3, 4, 5, 6, 7, 8, 
9])"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"code","metadata":{"id":"7SAdiHLi4H60","colab_type":"code","colab":{},"outputId":"ed91617c-4814-4a38-825f-75ebaff86666"},"source":["# sum up all elements\n","sum(d)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["45"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"code","metadata":{"id":"i-3OP3WX4H63","colab_type":"code","colab":{},"outputId":"b52618b4-7f3d-4d70-aa43-19b47d1b6261"},"source":["# product of all elements\n","prod(d+1)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["3628800"]},"metadata":{"tags":[]},"execution_count":28}]},{"cell_type":"code","metadata":{"id":"gfm7Wv_E4H64","colab_type":"code","colab":{},"outputId":"794b747d-ba5f-49e2-f14c-22dbd3e6bde5"},"source":["# cummulative sum\n","cumsum(d)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45])"]},"metadata":{"tags":[]},"execution_count":29}]},{"cell_type":"code","metadata":{"id":"VTAeSiH54H66","colab_type":"code","colab":{},"outputId":"190d8a7d-7eba-48aa-e371-54fbf803c194"},"source":["# cummulative product\n","cumprod(d+1)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 1, 2, 6, 24, 120, 720, 5040,\n"," 40320, 362880, 3628800])"]},"metadata":{"tags":[]},"execution_count":30}]},{"cell_type":"markdown","metadata":{"id":"33g0Fjtk4H7D","colab_type":"text"},"source":["## Iterating over array elements"]},{"cell_type":"markdown","metadata":{"id":"kOVcn0eu4H7D","colab_type":"text"},"source":["Generally, we want to avoid iterating over the elements of arrays whenever we can (at all costs). The reason is that in a interpreted language like Python (or MATLAB), iterations are really slow compared to vectorized operations. \n","\n","However, sometimes iterations are unavoidable. 
For such cases, the Python `for` loop is the most convenient way to iterate over an array:"]},{"cell_type":"code","metadata":{"id":"5Bi7I-Lz4H7E","colab_type":"code","colab":{},"outputId":"b65ac7c8-e662-4bfe-e9da-9f087c09f09f"},"source":["v = array([1,2,3,4])\n","\n","for element in v:\n"," print(element)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["1\n","2\n","3\n","4\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"q1Mp_Qvq4H7G","colab_type":"code","colab":{},"outputId":"d277af64-0bdd-4d09-f304-dd649607bd4f"},"source":["M = array([[1,2], [3,4]])\n","\n","for row in M:\n"," print(\"row\", row)\n"," \n"," for element in row:\n"," print(element)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["row [1 2]\n","1\n","2\n","row [3 4]\n","3\n","4\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"sOq8jYMN4H7I","colab_type":"text"},"source":["When we need to iterate over each element of an array and modify its elements, it is convenient to use the `enumerate` function to obtain both the element and its index in the `for` loop: "]},{"cell_type":"code","metadata":{"id":"93m_6Ev34H7J","colab_type":"code","colab":{},"outputId":"86c347ae-b2ac-49c6-8fb3-8b1e115e4d91"},"source":["for row_idx, row in enumerate(M):\n"," print(\"row_idx\", row_idx, \"row\", row)\n"," \n"," for col_idx, element in enumerate(row):\n"," print(\"col_idx\", col_idx, \"element\", element)\n"," \n"," # update the matrix M: square each element\n"," M[row_idx, col_idx] = element ** 2"],"execution_count":0,"outputs":[{"output_type":"stream","text":["row_idx 0 row [1 2]\n","col_idx 0 element 1\n","col_idx 1 element 2\n","row_idx 1 row [3 4]\n","col_idx 0 element 3\n","col_idx 1 element 4\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"-UcjV-954H7K","colab_type":"code","colab":{},"outputId":"ca48ab3c-cf27-47ad-d6b5-83009cd1e724"},"source":["# each element in M is now squared\n","M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1, 4],\n"," [ 9, 16]])"]},"metadata":{"tags":[]},"execution_count":35}]},{"cell_type":"code","metadata":{"id":"kRTCtoHX4H7M","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /part_6.modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Modelling" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Loading the processed dataset" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !pip install -U -q datalearn19intro\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import seaborn as sns\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "from datalearn19intro import get_processed_intro_dataset\n", 29 | "%matplotlib inline" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "pd.options.display.float_format = '{:,.2f}'.format\n", 39 | "pd.set_option('display.max_columns', 150)\n", 40 | "pd.set_option('display.max_rows', 200)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "df = get_processed_intro_dataset()" 50 | ] 
51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "(1001, 22)" 61 | ] 62 | }, 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "df.shape" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "df = df.set_index('account_id')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 7, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | "
is_gmail_fromaccountscollection_21_daysbilled_users_countlog_max_tsizeos_android_avgos_chrome_os_avgos_ios_avgos_linux_avgos_mac_avgtotal_events_sumnotification_events_sumnew_entry_events_sumpayment_events_suminbox_events_sumcommunicating_events_sumnon_communicating_events_sumweb_events_sumios_events_sumdesktop_app_events_sumempty_events_sumlead_score
account_id
27934961.000.000.000.570.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27934971.000.000.000.570.450.000.550.000.000.010.040.020.000.000.000.000.000.170.000.000.00
27934981.000.000.000.430.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27934991.000.000.000.571.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27935001.000.000.000.430.001.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27935010.000.000.000.570.000.000.000.001.000.000.000.000.000.000.000.000.000.000.000.000.00
27935021.000.000.000.430.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
27935031.000.000.000.571.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00
\n", 349 | "
" 350 | ], 351 | "text/plain": [ 352 | " is_gmail_fromaccounts collection_21_days billed_users_count \\\n", 353 | "account_id \n", 354 | "2793496 1.00 0.00 0.00 \n", 355 | "2793497 1.00 0.00 0.00 \n", 356 | "2793498 1.00 0.00 0.00 \n", 357 | "2793499 1.00 0.00 0.00 \n", 358 | "2793500 1.00 0.00 0.00 \n", 359 | "2793501 0.00 0.00 0.00 \n", 360 | "2793502 1.00 0.00 0.00 \n", 361 | "2793503 1.00 0.00 0.00 \n", 362 | "\n", 363 | " log_max_tsize os_android_avg os_chrome_os_avg os_ios_avg \\\n", 364 | "account_id \n", 365 | "2793496 0.57 0.00 0.00 0.00 \n", 366 | "2793497 0.57 0.45 0.00 0.55 \n", 367 | "2793498 0.43 0.00 0.00 0.00 \n", 368 | "2793499 0.57 1.00 0.00 0.00 \n", 369 | "2793500 0.43 0.00 1.00 0.00 \n", 370 | "2793501 0.57 0.00 0.00 0.00 \n", 371 | "2793502 0.43 0.00 0.00 0.00 \n", 372 | "2793503 0.57 1.00 0.00 0.00 \n", 373 | "\n", 374 | " os_linux_avg os_mac_avg total_events_sum \\\n", 375 | "account_id \n", 376 | "2793496 0.00 0.00 0.00 \n", 377 | "2793497 0.00 0.00 0.01 \n", 378 | "2793498 0.00 0.00 0.00 \n", 379 | "2793499 0.00 0.00 0.00 \n", 380 | "2793500 0.00 0.00 0.00 \n", 381 | "2793501 0.00 1.00 0.00 \n", 382 | "2793502 0.00 0.00 0.00 \n", 383 | "2793503 0.00 0.00 0.00 \n", 384 | "\n", 385 | " notification_events_sum new_entry_events_sum payment_events_sum \\\n", 386 | "account_id \n", 387 | "2793496 0.00 0.00 0.00 \n", 388 | "2793497 0.04 0.02 0.00 \n", 389 | "2793498 0.00 0.00 0.00 \n", 390 | "2793499 0.00 0.00 0.00 \n", 391 | "2793500 0.00 0.00 0.00 \n", 392 | "2793501 0.00 0.00 0.00 \n", 393 | "2793502 0.00 0.00 0.00 \n", 394 | "2793503 0.00 0.00 0.00 \n", 395 | "\n", 396 | " inbox_events_sum communicating_events_sum \\\n", 397 | "account_id \n", 398 | "2793496 0.00 0.00 \n", 399 | "2793497 0.00 0.00 \n", 400 | "2793498 0.00 0.00 \n", 401 | "2793499 0.00 0.00 \n", 402 | "2793500 0.00 0.00 \n", 403 | "2793501 0.00 0.00 \n", 404 | "2793502 0.00 0.00 \n", 405 | "2793503 0.00 0.00 \n", 406 | "\n", 407 | " non_communicating_events_sum web_events_sum ios_events_sum \\\n", 408 | "account_id \n", 409 | "2793496 0.00 0.00 0.00 \n", 410 | "2793497 0.00 0.00 0.17 \n", 411 | "2793498 0.00 0.00 0.00 \n", 412 | "2793499 0.00 0.00 0.00 \n", 413 | "2793500 0.00 0.00 0.00 \n", 414 | "2793501 0.00 0.00 0.00 \n", 415 | "2793502 0.00 0.00 0.00 \n", 416 | "2793503 0.00 0.00 0.00 \n", 417 | "\n", 418 | " desktop_app_events_sum empty_events_sum lead_score \n", 419 | "account_id \n", 420 | "2793496 0.00 0.00 0.00 \n", 421 | "2793497 0.00 0.00 0.00 \n", 422 | "2793498 0.00 0.00 0.00 \n", 423 | "2793499 0.00 0.00 0.00 \n", 424 | "2793500 0.00 0.00 0.00 \n", 425 | "2793501 0.00 0.00 0.00 \n", 426 | "2793502 0.00 0.00 0.00 \n", 427 | "2793503 0.00 0.00 0.00 " 428 | ] 429 | }, 430 | "execution_count": 7, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "df.head(8)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 8, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "X = df.drop('lead_score', axis=1)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 9, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "y = df['lead_score']" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "## Data split" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 10, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "from sklearn.model_selection import train_test_split" 471 | ] 472 | }, 473 
| { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "Let's split the data into train and test sets, at a 80/20 ratio." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 11, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Model fit" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 32, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "from sklearn.linear_model import LogisticRegression" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 33, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "clf = LogisticRegression()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 34, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stderr", 521 | "output_type": "stream", 522 | "text": [ 523 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 524 | " FutureWarning)\n" 525 | ] 526 | }, 527 | { 528 | "data": { 529 | "text/plain": [ 530 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 531 | " intercept_scaling=1, max_iter=100, multi_class='warn',\n", 532 | " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", 533 | " tol=0.0001, verbose=0, warm_start=False)" 534 | ] 535 | }, 536 | "execution_count": 34, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "clf.fit(X_train, y_train)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 35, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "y_pred = clf.predict(X_test)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 36, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "from sklearn.metrics import accuracy_score, precision_score, recall_score" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 37, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "0.9701492537313433" 572 | ] 573 | }, 574 | "execution_count": 37, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "accuracy_score(y_test, y_pred)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "OMG! That's amazing!" 
588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 38, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "name": "stderr", 597 | "output_type": "stream", 598 | "text": [ 599 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", 600 | " 'precision', 'predicted', average, warn_for)\n" 601 | ] 602 | }, 603 | { 604 | "data": { 605 | "text/plain": [ 606 | "0.0" 607 | ] 608 | }, 609 | "execution_count": 38, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "precision_score(y_test, y_pred)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 39, 621 | "metadata": {}, 622 | "outputs": [ 623 | { 624 | "data": { 625 | "text/plain": [ 626 | "0.0" 627 | ] 628 | }, 629 | "execution_count": 39, 630 | "metadata": {}, 631 | "output_type": "execute_result" 632 | } 633 | ], 634 | "source": [ 635 | "recall_score(y_test, y_pred)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "Oh no! Maybe our model wasn't as good as we thought!\n", 643 | "What happened?" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 40, 649 | "metadata": {}, 650 | "outputs": [ 651 | { 652 | "data": { 653 | "text/plain": [ 654 | "(array([0.]), array([201]))" 655 | ] 656 | }, 657 | "execution_count": 40, 658 | "metadata": {}, 659 | "output_type": "execute_result" 660 | } 661 | ], 662 | "source": [ 663 | "np.unique(y_pred, return_counts=True)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 41, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "data": { 673 | "text/plain": [ 674 | "(array([0., 1.]), array([980, 21]))" 675 | ] 676 | }, 677 | "execution_count": 41, 678 | "metadata": {}, 679 | "output_type": "execute_result" 680 | } 681 | ], 682 | "source": [ 683 | "np.unique(y, return_counts=True)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 42, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "name": "stdout", 693 | "output_type": "stream", 694 | "text": [ 695 | "Accuracy: 97.01%\n", 696 | "Precision: 0.00%\n", 697 | "Recall: 0.00%\n" 698 | ] 699 | }, 700 | { 701 | "name": "stderr", 702 | "output_type": "stream", 703 | "text": [ 704 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", 705 | " 'precision', 'predicted', average, warn_for)\n" 706 | ] 707 | } 708 | ], 709 | "source": [ 710 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 711 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 712 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "Notice, we have 980 negative examples and only 21 positive ones." 
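,
    "\n",
    "(A quick, hedged way to read the majority-class baseline straight off the labels; this little check is an addition, not part of the original run:)\n",
    "\n",
    "```python\n",
    "print((y == 0).mean())  # ~0.979, so always predicting 0 already gives ~98% accuracy\n",
    "```"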
720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 43, 725 | "metadata": {}, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "text/plain": [ 730 | "97.9020979020979" 731 | ] 732 | }, 733 | "execution_count": 43, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "980/1001 * 100" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "Our model was optimizing accuracy when fitting its parameters, and the easiest way to do that is to simply predict 0 all the time for roughly 98% accuracy!" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "We can use `class_weight='balanced'` to make each **class** equally important instead of each row/entry." 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 44, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "name": "stderr", 763 | "output_type": "stream", 764 | "text": [ 765 | "/Users/shaypalachy/.pyenv/versions/3.6.5/envs/py3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 766 | " FutureWarning)\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "clf = LogisticRegression(class_weight='balanced')\n", 772 | "clf.fit(X_train, y_train)\n", 773 | "y_pred = clf.predict(X_test)" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 45, 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "name": "stdout", 783 | "output_type": "stream", 784 | "text": [ 785 | "Accuracy: 68.66%\n", 786 | "Precision: 7.46%\n", 787 | "Recall: 83.33%\n" 788 | ] 789 | } 790 | ], 791 | "source": [ 792 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 793 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 794 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "## Hyperparameter tuning" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "We can define, for each hyperparameter, a range of possible values." 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "For logistic regression here we will play with just two hyperparameters:\n", 816 | "\n", 817 | "* `penalty` - The regularization penalty to use. Both the L1 and L2 penalties are common.\n", 818 | "* `C` - Inverse of regularization strength; smaller values specify stronger regularization. Regularization can prevent overfitting, a concept which you'll discuss in the advanced workshops." 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 46, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "hyparam_grid = [{\n", 828 | " 'penalty': ['l1', 'l2'],\n", 829 | " 'C': [1, 10, 100, 1000],\n", 830 | " 'class_weight': ['balanced'],\n", 831 | "}]" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "We can optimize our hyperparameters for various metrics..."
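,
    "\n",
    "Concretely, the metric being optimized is just the `scoring` argument of the grid search (a hedged illustration; the grid-search loop a couple of cells below does exactly this for both metrics):\n",
    "\n",
    "```python\n",
    "GridSearchCV(LogisticRegression(), hyparam_grid, cv=5, scoring='recall_macro')  # or scoring='precision_macro'\n",
    "```"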
839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 49, 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "scores = ['precision', 'recall']" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 70, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "# silence annoying future warnings\n", 857 | "def warn(*args, **kwargs):\n", 858 | " pass\n", 859 | "import warnings\n", 860 | "warnings.warn = warn" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": 54, 866 | "metadata": {}, 867 | "outputs": [ 868 | { 869 | "name": "stdout", 870 | "output_type": "stream", 871 | "text": [ 872 | "# Tuning hyper-parameters for precision\n", 873 | "\n", 874 | "Best parameters set found on development set:\n", 875 | "\n", 876 | "{'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 877 | "\n", 878 | "Grid scores on development set:\n", 879 | "\n", 880 | "0.525 (+/-0.014) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 881 | "0.525 (+/-0.014) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 882 | "0.515 (+/-0.036) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 883 | "0.525 (+/-0.014) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 884 | "0.508 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 885 | "0.521 (+/-0.024) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 886 | "0.516 (+/-0.032) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 887 | "0.513 (+/-0.031) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 888 | "\n", 889 | "Detailed classification report:\n", 890 | "\n", 891 | "The model is trained on the full development set.\n", 892 | "The scores are computed on the full evaluation set.\n", 893 | "\n", 894 | " precision recall f1-score support\n", 895 | "\n", 896 | " 0.0 0.99 0.68 0.81 195\n", 897 | " 1.0 0.07 0.83 0.14 6\n", 898 | "\n", 899 | " micro avg 0.69 0.69 0.69 201\n", 900 | " macro avg 0.53 0.76 0.47 201\n", 901 | "weighted avg 0.97 0.69 0.79 201\n", 902 | "\n", 903 | "\n", 904 | "# Tuning hyper-parameters for recall\n", 905 | "\n", 906 | "Best parameters set found on development set:\n", 907 | "\n", 908 | "{'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 909 | "\n", 910 | "Grid scores on development set:\n", 911 | "\n", 912 | "0.786 (+/-0.160) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 913 | "0.785 (+/-0.162) for {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 914 | "0.669 (+/-0.380) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 915 | "0.788 (+/-0.162) for {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 916 | "0.586 (+/-0.219) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 917 | "0.728 (+/-0.256) for {'C': 100, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 918 | "0.621 (+/-0.242) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l1'}\n", 919 | "0.641 (+/-0.326) for {'C': 1000, 'class_weight': 'balanced', 'penalty': 'l2'}\n", 920 | "\n", 921 | "Detailed classification report:\n", 922 | "\n", 923 | "The model is trained on the full development set.\n", 924 | "The scores are computed on the full evaluation set.\n", 925 | "\n", 926 | " precision recall f1-score support\n", 927 | "\n", 928 | " 0.0 0.99 0.68 0.81 195\n", 929 | " 1.0 0.07 0.83 0.14 6\n", 930 | "\n", 931 | " micro avg 0.69 0.69 0.69 201\n", 932 | " macro avg 0.53 0.76 0.47 201\n", 933 | "weighted avg 0.97 0.69 
0.79 201\n", 934 | "\n", 935 | "\n" 936 | ] 937 | } 938 | ], 939 | "source": [ 940 | "from sklearn.metrics import classification_report\n", 941 | "from sklearn.model_selection import GridSearchCV\n", 942 | "for score in scores:\n", 943 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 944 | " print()\n", 945 | "\n", 946 | " clf = GridSearchCV(LogisticRegression(), hyparam_grid, cv=5,\n", 947 | " scoring='%s_macro' % score)\n", 948 | " clf.fit(X_train, y_train)\n", 949 | "\n", 950 | " print(\"Best parameters set found on development set:\")\n", 951 | " print()\n", 952 | " print(clf.best_params_)\n", 953 | " print()\n", 954 | " print(\"Grid scores on development set:\")\n", 955 | " print()\n", 956 | " means = clf.cv_results_['mean_test_score']\n", 957 | " stds = clf.cv_results_['std_test_score']\n", 958 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 959 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 960 | " % (mean, std * 2, params))\n", 961 | " print()\n", 962 | "\n", 963 | " print(\"Detailed classification report:\")\n", 964 | " print()\n", 965 | " print(\"The model is trained on the full development set.\")\n", 966 | " print(\"The scores are computed on the full evaluation set.\")\n", 967 | " print()\n", 968 | " y_true, y_pred = y_test, clf.predict(X_test)\n", 969 | " print(classification_report(y_true, y_pred))\n", 970 | " print()" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": {}, 976 | "source": [ 977 | "We can now fit a model with the tuned hyperparameters over the entire training set and test its performance on the test set. This will provide an estimate of performance which is likely to be (possibly substantially) optimistically biased.\n", 978 | "\n", 979 | "A more advanced way to get estimates for hyperoptimization results - and a less biased one - is nested cross-validation; this is a more computationally intensive method, and out of scope here, of course. 
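 Just to give the flavour (a hedged sketch of assumed scikit-learn usage, not something run in this notebook): the grid search itself is wrapped in an outer cross-validation loop, so that hyperparameter selection happens inside each outer fold:\n\n```python\nfrom sklearn.model_selection import cross_val_score\n\ninner = GridSearchCV(LogisticRegression(), hyparam_grid, cv=5, scoring='recall_macro')\nouter_scores = cross_val_score(inner, X, y, cv=5, scoring='recall_macro')\nprint(outer_scores.mean())  # a less biased estimate of the tuned model's performance\n```\n\nBut that is a story for another workshop. 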
:)" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 55, 985 | "metadata": {}, 986 | "outputs": [ 987 | { 988 | "name": "stdout", 989 | "output_type": "stream", 990 | "text": [ 991 | "Accuracy: 68.66%\n", 992 | "Precision: 7.46%\n", 993 | "Recall: 83.33%\n" 994 | ] 995 | } 996 | ], 997 | "source": [ 998 | "clf = LogisticRegression(class_weight='balanced', C=10, penalty='l2')\n", 999 | "clf.fit(X_train, y_train)\n", 1000 | "y_pred = clf.predict(X_test)\n", 1001 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1002 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1003 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | "(note: this produced no difference in relation to the default hyperparameters)" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "markdown", 1015 | "metadata": {}, 1016 | "source": [ 1017 | "# Trying other models is easy" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "metadata": {}, 1023 | "source": [ 1024 | "## SVM" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 67, 1030 | "metadata": {}, 1031 | "outputs": [], 1032 | "source": [ 1033 | "from sklearn.svm import SVC" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": 58, 1039 | "metadata": {}, 1040 | "outputs": [ 1041 | { 1042 | "name": "stdout", 1043 | "output_type": "stream", 1044 | "text": [ 1045 | "Accuracy: 97.01%\n", 1046 | "Precision: 0.00%\n", 1047 | "Recall: 0.00%\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "clf = RandomForestClassifier(class_weight='balanced')\n", 1053 | "clf.fit(X_train, y_train)\n", 1054 | "y_pred = clf.predict(X_test)\n", 1055 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1056 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1057 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": 60, 1063 | "metadata": {}, 1064 | "outputs": [ 1065 | { 1066 | "name": "stdout", 1067 | "output_type": "stream", 1068 | "text": [ 1069 | "Accuracy: 68.66%\n", 1070 | "Precision: 7.46%\n", 1071 | "Recall: 83.33%\n" 1072 | ] 1073 | } 1074 | ], 1075 | "source": [ 1076 | "clf = SVC(class_weight='balanced')\n", 1077 | "clf.fit(X_train, y_train)\n", 1078 | "y_pred = clf.predict(X_test)\n", 1079 | "print(\"Accuracy: {:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1080 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1081 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1082 | ] 1083 | }, 1084 | { 1085 | "cell_type": "markdown", 1086 | "metadata": {}, 1087 | "source": [ 1088 | "Don't be surprised that different linear models tend to achieve the same performance" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": 69, 1094 | "metadata": {}, 1095 | "outputs": [ 1096 | { 1097 | "name": "stdout", 1098 | "output_type": "stream", 1099 | "text": [ 1100 | "Accuracy: 2.99%\n", 1101 | "Precision: 2.99%\n", 1102 | "Recall: 100.00%\n" 1103 | ] 1104 | } 1105 | ], 1106 | "source": [ 1107 | "clf = SVC(class_weight='balanced', kernel='poly')\n", 1108 | "clf.fit(X_train, y_train)\n", 1109 | "y_pred = clf.predict(X_test)\n", 1110 | "print(\"Accuracy: 
{:,.2f}%\".format(100*accuracy_score(y_test, y_pred)))\n", 1111 | "print(\"Precision: {:,.2f}%\".format(100*precision_score(y_test, y_pred)))\n", 1112 | "print(\"Recall: {:,.2f}%\".format(100*recall_score(y_test, y_pred)))" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 73, 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "hyparam_grid = [{\n", 1122 | " 'kernel': ['rbf', 'poly', 'sigmoid'],\n", 1123 | " 'C': [0.1, 1, 10, 100],\n", 1124 | " 'degree': [2, 3, 4, 5],\n", 1125 | " 'class_weight': ['balanced'],\n", 1126 | "}]" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": 74, 1132 | "metadata": { 1133 | "scrolled": false 1134 | }, 1135 | "outputs": [ 1136 | { 1137 | "name": "stdout", 1138 | "output_type": "stream", 1139 | "text": [ 1140 | "# Tuning hyper-parameters for precision\n", 1141 | "\n", 1142 | "Best parameters set found on development set:\n", 1143 | "\n", 1144 | "{'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1145 | "\n", 1146 | "Grid scores on development set:\n", 1147 | "\n", 1148 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1149 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1150 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1151 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1152 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1153 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1154 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1155 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1156 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1157 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1158 | "0.009 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1159 | "0.530 (+/-0.004) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1160 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1161 | "0.527 (+/-0.010) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1162 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1163 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1164 | "0.009 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1165 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1166 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1167 | "0.009 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1168 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1169 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1170 | "0.009 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1171 | "0.530 (+/-0.004) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 
'kernel': 'sigmoid'}\n", 1172 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1173 | "0.530 (+/-0.004) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1174 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1175 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1176 | "0.527 (+/-0.011) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1177 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1178 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1179 | "0.491 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1180 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1181 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1182 | "0.491 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1183 | "0.528 (+/-0.012) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1184 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1185 | "0.527 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1186 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1187 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1188 | "0.530 (+/-0.004) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1189 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1190 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1191 | "0.521 (+/-0.014) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1192 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1193 | "0.527 (+/-0.022) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1194 | "0.009 (+/-0.000) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1195 | "0.528 (+/-0.012) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1196 | "\n", 1197 | "Detailed classification report:\n", 1198 | "\n", 1199 | "The model is trained on the full development set.\n", 1200 | "The scores are computed on the full evaluation set.\n", 1201 | "\n", 1202 | " precision recall f1-score support\n", 1203 | "\n", 1204 | " 0.0 0.99 0.68 0.81 195\n", 1205 | " 1.0 0.07 0.83 0.14 6\n", 1206 | "\n", 1207 | " micro avg 0.69 0.69 0.69 201\n", 1208 | " macro avg 0.53 0.76 0.47 201\n", 1209 | "weighted avg 0.97 0.69 0.79 201\n", 1210 | "\n", 1211 | "\n", 1212 | "# Tuning hyper-parameters for recall\n", 1213 | "\n", 1214 | "Best parameters set found on development set:\n", 1215 | "\n", 1216 | "{'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1217 | "\n", 1218 | "Grid scores on development set:\n", 1219 | "\n", 1220 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1221 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1222 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 
'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1223 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1224 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1225 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1226 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1227 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1228 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1229 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1230 | "0.500 (+/-0.000) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1231 | "0.848 (+/-0.025) for {'C': 0.1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1232 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1233 | "0.825 (+/-0.083) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1234 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1235 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1236 | "0.500 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1237 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1238 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1239 | "0.500 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1240 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1241 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1242 | "0.500 (+/-0.000) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1243 | "0.848 (+/-0.025) for {'C': 1, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1244 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1245 | "0.848 (+/-0.025) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1246 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1247 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1248 | "0.821 (+/-0.097) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1249 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1250 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1251 | "0.500 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1252 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1253 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1254 | "0.500 (+/-0.000) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1255 | "0.818 (+/-0.133) for {'C': 10, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1256 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'rbf'}\n", 1257 | "0.818 
(+/-0.136) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'poly'}\n", 1258 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 2, 'kernel': 'sigmoid'}\n", 1259 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'rbf'}\n", 1260 | "0.847 (+/-0.025) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'poly'}\n", 1261 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 3, 'kernel': 'sigmoid'}\n", 1262 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'rbf'}\n", 1263 | "0.735 (+/-0.191) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'poly'}\n", 1264 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 4, 'kernel': 'sigmoid'}\n", 1265 | "0.801 (+/-0.247) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'rbf'}\n", 1266 | "0.500 (+/-0.000) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'poly'}\n", 1267 | "0.820 (+/-0.135) for {'C': 100, 'class_weight': 'balanced', 'degree': 5, 'kernel': 'sigmoid'}\n", 1268 | "\n", 1269 | "Detailed classification report:\n", 1270 | "\n", 1271 | "The model is trained on the full development set.\n", 1272 | "The scores are computed on the full evaluation set.\n", 1273 | "\n", 1274 | " precision recall f1-score support\n", 1275 | "\n", 1276 | " 0.0 0.99 0.68 0.81 195\n", 1277 | " 1.0 0.07 0.83 0.14 6\n", 1278 | "\n", 1279 | " micro avg 0.69 0.69 0.69 201\n", 1280 | " macro avg 0.53 0.76 0.47 201\n", 1281 | "weighted avg 0.97 0.69 0.79 201\n", 1282 | "\n", 1283 | "\n" 1284 | ] 1285 | } 1286 | ], 1287 | "source": [ 1288 | "from sklearn.metrics import classification_report\n", 1289 | "from sklearn.model_selection import GridSearchCV\n", 1290 | "for score in scores:\n", 1291 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 1292 | " print()\n", 1293 | "\n", 1294 | " clf = GridSearchCV(SVC(), hyparam_grid, cv=5,\n", 1295 | " scoring='%s_macro' % score)\n", 1296 | " clf.fit(X_train, y_train)\n", 1297 | "\n", 1298 | " print(\"Best parameters set found on development set:\")\n", 1299 | " print()\n", 1300 | " print(clf.best_params_)\n", 1301 | " print()\n", 1302 | " print(\"Grid scores on development set:\")\n", 1303 | " print()\n", 1304 | " means = clf.cv_results_['mean_test_score']\n", 1305 | " stds = clf.cv_results_['std_test_score']\n", 1306 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 1307 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 1308 | " % (mean, std * 2, params))\n", 1309 | " print()\n", 1310 | "\n", 1311 | " print(\"Detailed classification report:\")\n", 1312 | " print()\n", 1313 | " print(\"The model is trained on the full development set.\")\n", 1314 | " print(\"The scores are computed on the full evaluation set.\")\n", 1315 | " print()\n", 1316 | " y_true, y_pred = y_test, clf.predict(X_test)\n", 1317 | " print(classification_report(y_true, y_pred))\n", 1318 | " print()" 1319 | ] 1320 | }, 1321 | { 1322 | "cell_type": "markdown", 1323 | "metadata": {}, 1324 | "source": [ 1325 | "## Random forest" 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "code", 1330 | "execution_count": 68, 1331 | "metadata": {}, 1332 | "outputs": [], 1333 | "source": [ 1334 | "from sklearn.ensemble import RandomForestClassifier" 1335 | ] 1336 | }, 1337 | { 1338 | "cell_type": "code", 1339 | "execution_count": 63, 1340 | "metadata": {}, 1341 | "outputs": [], 1342 | "source": [ 1343 | "hyparam_grid = [{\n", 1344 | " 'n_estimators': 
[5, 20, 100],\n", 1345 | " 'criterion': ['gini', 'entropy'],\n", 1346 | " 'max_depth': [None, 3, 5],\n", 1347 | " 'class_weight': ['balanced'],\n", 1348 | "}]" 1349 | ] 1350 | }, 1351 | { 1352 | "cell_type": "code", 1353 | "execution_count": 64, 1354 | "metadata": { 1355 | "scrolled": false 1356 | }, 1357 | "outputs": [ 1358 | { 1359 | "name": "stdout", 1360 | "output_type": "stream", 1361 | "text": [ 1362 | "# Tuning hyper-parameters for precision\n", 1363 | "\n", 1364 | "Best parameters set found on development set:\n", 1365 | "\n", 1366 | "{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}\n", 1367 | "\n", 1368 | "Grid scores on development set:\n", 1369 | "\n", 1370 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 5}\n", 1371 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 20}\n", 1372 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}\n", 1373 | "0.524 (+/-0.058) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 5}\n", 1374 | "0.524 (+/-0.136) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 20}\n", 1375 | "0.511 (+/-0.083) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}\n", 1376 | "0.533 (+/-0.132) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}\n", 1377 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 20}\n", 1378 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}\n", 1379 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 5}\n", 1380 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 20}\n", 1381 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}\n", 1382 | "0.503 (+/-0.053) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 5}\n", 1383 | "0.533 (+/-0.132) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 20}\n", 1384 | "0.505 (+/-0.060) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 100}\n", 1385 | "0.518 (+/-0.071) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 5}\n", 1386 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 20}\n", 1387 | "0.491 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100}\n", 1388 | "\n", 1389 | "Detailed classification report:\n", 1390 | "\n", 1391 | "The model is trained on the full development set.\n", 1392 | "The scores are computed on the full evaluation set.\n", 1393 | "\n", 1394 | " precision recall f1-score support\n", 1395 | "\n", 1396 | " 0.0 0.98 0.90 0.94 195\n", 1397 | " 1.0 0.10 0.33 0.15 6\n", 1398 | "\n", 1399 | " micro avg 0.89 0.89 0.89 201\n", 1400 | " macro avg 0.54 0.62 0.54 201\n", 1401 | "weighted avg 0.95 0.89 0.92 201\n", 1402 | "\n", 1403 | "\n", 1404 | "# Tuning hyper-parameters for recall\n", 1405 | "\n", 1406 | "Best parameters set found on development set:\n", 1407 | "\n", 1408 | 
"{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 5}\n", 1409 | "\n", 1410 | "Grid scores on development set:\n", 1411 | "\n", 1412 | "0.499 (+/-0.003) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 5}\n", 1413 | "0.500 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 20}\n", 1414 | "0.500 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}\n", 1415 | "0.513 (+/-0.223) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 5}\n", 1416 | "0.503 (+/-0.133) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 20}\n", 1417 | "0.486 (+/-0.010) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}\n", 1418 | "0.508 (+/-0.145) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 5}\n", 1419 | "0.492 (+/-0.015) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 20}\n", 1420 | "0.496 (+/-0.006) for {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100}\n", 1421 | "0.530 (+/-0.133) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 5}\n", 1422 | "0.499 (+/-0.003) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 20}\n", 1423 | "0.500 (+/-0.000) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 100}\n", 1424 | "0.557 (+/-0.257) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 5}\n", 1425 | "0.539 (+/-0.139) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 20}\n", 1426 | "0.512 (+/-0.140) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 100}\n", 1427 | "0.472 (+/-0.064) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 5}\n", 1428 | "0.492 (+/-0.019) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 20}\n", 1429 | "0.497 (+/-0.006) for {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100}\n", 1430 | "\n", 1431 | "Detailed classification report:\n", 1432 | "\n", 1433 | "The model is trained on the full development set.\n", 1434 | "The scores are computed on the full evaluation set.\n", 1435 | "\n", 1436 | " precision recall f1-score support\n", 1437 | "\n", 1438 | " 0.0 0.98 0.81 0.88 195\n", 1439 | " 1.0 0.05 0.33 0.09 6\n", 1440 | "\n", 1441 | " micro avg 0.79 0.79 0.79 201\n", 1442 | " macro avg 0.51 0.57 0.48 201\n", 1443 | "weighted avg 0.95 0.79 0.86 201\n", 1444 | "\n", 1445 | "\n" 1446 | ] 1447 | } 1448 | ], 1449 | "source": [ 1450 | "from sklearn.metrics import classification_report\n", 1451 | "from sklearn.model_selection import GridSearchCV\n", 1452 | "for score in scores:\n", 1453 | " print(\"# Tuning hyper-parameters for %s\" % score)\n", 1454 | " print()\n", 1455 | "\n", 1456 | " clf = GridSearchCV(RandomForestClassifier(), hyparam_grid, cv=5,\n", 1457 | " scoring='%s_macro' % score)\n", 1458 | " clf.fit(X_train, y_train)\n", 1459 | "\n", 1460 | " print(\"Best parameters set found on development set:\")\n", 1461 | " print()\n", 1462 | " print(clf.best_params_)\n", 1463 | " print()\n", 1464 | " print(\"Grid scores on development set:\")\n", 1465 | " print()\n", 
1466 | " means = clf.cv_results_['mean_test_score']\n", 1467 | " stds = clf.cv_results_['std_test_score']\n", 1468 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n", 1469 | " print(\"%0.3f (+/-%0.03f) for %r\"\n", 1470 | " % (mean, std * 2, params))\n", 1471 | " print()\n", 1472 | "\n", 1473 | " print(\"Detailed classification report:\")\n", 1474 | " print()\n", 1475 | " print(\"The model is trained on the full development set.\")\n", 1476 | " print(\"The scores are computed on the full evaluation set.\")\n", 1477 | " print()\n", 1478 | " y_true, y_pred = y_test, clf.predict(X_test)\n", 1479 | " print(classification_report(y_true, y_pred))\n", 1480 | " print()" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "markdown", 1485 | "metadata": {}, 1486 | "source": [ 1487 | "Without putting too much effort into hyperparameter tuning, a random forest classifier reached a macro-averaged F1 score of 54%, compared to 47% for both logistic regression and the SVM." 1488 | ] 1489 | }, 1490 | { 1491 | "cell_type": "markdown", 1492 | "metadata": {}, 1493 | "source": [ 1494 | "Even if that is not what we want to optimize for here (and that depends on a lot of factors), this still demonstrates why giving other models at least a superficial examination can be worthwhile." 1495 | ] 1496 | }, 1497 | { 1498 | "cell_type": "markdown", 1499 | "metadata": {}, 1500 | "source": [ 1501 | "## Final notes\n", 1502 | "\n", 1503 | "The classification problem here is significantly imbalanced, and we did not address that beyond balancing class weights; an imbalance of roughly 98/2 probably justifies more sophisticated tools.\n", 1504 | "\n", 1505 | "One such option is to treat positive examples as anomalies, and to then draw on knowledge and methods from the field of anomaly detection; a rough sketch of this idea follows below. Other options include more advanced processing of the data.\n", 1506 | "\n", 1507 | "The advanced workshops might go into more details." 1508 | ] 1509 | }, 1510 | { 1511 | "cell_type": "markdown", 1512 | "metadata": {}, 1513 | "source": [ 1514 | "# We're done! Thank you!"
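]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A rough illustration of the anomaly-detection option from the final notes: fit an `IsolationForest` on majority-class training examples only, then flag predicted outliers as positives. This is only a sketch, assuming the `X_train`, `y_train`, `X_test` and `y_test` variables from the cells above; the `contamination=0.02` value is a guess matching the roughly 2% positive rate, not a tuned choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: treat the rare positive class as anomalies.\n",
"from sklearn.ensemble import IsolationForest\n",
"from sklearn.metrics import classification_report\n",
"\n",
"# Fit the detector on majority-class (negative) training examples only.\n",
"iso_forest = IsolationForest(n_estimators=100, contamination=0.02, random_state=0)\n",
"iso_forest.fit(X_train[y_train == 0])\n",
"\n",
"# IsolationForest predicts -1 for outliers; map outliers to the positive class.\n",
"y_pred_iso = (iso_forest.predict(X_test) == -1).astype(float)\n",
"print(classification_report(y_test, y_pred_iso))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As with the grid searches above, the contamination rate (and any decision threshold) would need proper tuning and validation before drawing conclusions from such a model."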
1515 | ] 1516 | } 1517 | ], 1518 | "metadata": { 1519 | "kernelspec": { 1520 | "display_name": "py3", 1521 | "language": "python", 1522 | "name": "py3" 1523 | }, 1524 | "language_info": { 1525 | "codemirror_mode": { 1526 | "name": "ipython", 1527 | "version": 3 1528 | }, 1529 | "file_extension": ".py", 1530 | "mimetype": "text/x-python", 1531 | "name": "python", 1532 | "nbconvert_exporter": "python", 1533 | "pygments_lexer": "ipython3", 1534 | "version": "3.6.5" 1535 | } 1536 | }, 1537 | "nbformat": 4, 1538 | "nbformat_minor": 2 1539 | } 1540 | -------------------------------------------------------------------------------- /util_0.reading_the_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reading the data the easy way" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Package installation and imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!pip install -U -q datalearn19intro" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from datalearn19intro import (get_accounts, get_events, get_subscriptions, get_users)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Reading the data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "acc = get_accounts()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
HTML table: 3 rows × 49 columns
" 177 | ], 178 | "text/plain": [ 179 | " account_id marketing_source marketing_referrer \\\n", 180 | "0 2793496 bing https://www.bing.com/search?q=basecamp login \n", 181 | "1 2793497 NaN NaN \n", 182 | "2 2793498 adwordsverticals https://www.google.com/ \n", 183 | "\n", 184 | " created_at plan_id trial_start started_plan_at signup_box_origin \\\n", 185 | "0 2019-01-01 NaN 2019-01-01 NaN NaN \n", 186 | "1 2019-01-01 NaN 2019-01-01 NaN mobile_app \n", 187 | "2 2019-01-01 NaN 2019-01-01 NaN NaN \n", 188 | "\n", 189 | " churn_state churn_date ... utm_cluster_id pricing_version has_domain mrr \\\n", 190 | "0 none NaN ... orders 3 0 NaN \n", 191 | "1 none NaN ... NaN 3 0 NaN \n", 192 | "2 none NaN ... todos 3 0 NaN \n", 193 | "\n", 194 | " lead_score industry.1 team_size user_goal user_description sub_industry \n", 195 | "0 0 NaN NaN NaN NaN NaN \n", 196 | "1 0 NaN NaN NaN NaN NaN \n", 197 | "2 0 Other 1 NaN NaN NaN \n", 198 | "\n", 199 | "[3 rows x 49 columns]" 200 | ] 201 | }, 202 | "execution_count": 5, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "acc.head(3)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "events = get_events()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
HTML table: 3 rows × 23 columns
" 346 | ], 347 | "text/plain": [ 348 | " DATE user_id account_id total_events column_events board_events \\\n", 349 | "0 2019-01-01 6181915 2793790 212 30 1 \n", 350 | "1 2019-01-01 6182266 2793900 207 0 0 \n", 351 | "2 2019-01-01 6182190 2793860 4 0 0 \n", 352 | "\n", 353 | " num_of_boards count_kind_columns raw_events group_events ... \\\n", 354 | "0 1 4 14 0 ... \n", 355 | "1 0 0 0 0 ... \n", 356 | "2 0 0 0 0 ... \n", 357 | "\n", 358 | " new_entry_events payment_events inbox_events communicating_events \\\n", 359 | "0 1 0 1 4 \n", 360 | "1 1 0 0 0 \n", 361 | "2 0 0 0 0 \n", 362 | "\n", 363 | " non_communicating_events web_events ios_events android_events \\\n", 364 | "0 58 201 0 0 \n", 365 | "1 1 0 189 0 \n", 366 | "2 0 0 0 0 \n", 367 | "\n", 368 | " desktop_app_events empty_events \n", 369 | "0 0 9 \n", 370 | "1 0 8 \n", 371 | "2 0 0 \n", 372 | "\n", 373 | "[3 rows x 23 columns]" 374 | ] 375 | }, 376 | "execution_count": 8, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "events.head(3)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 9, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "subs = get_subscriptions()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 10, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "data": { 401 | "text/html": [ 402 | "
HTML table: 3 rows × 15 columns
" 495 | ], 496 | "text/plain": [ 497 | " event_date account_id plan_id event_type invoice_charge_amount \\\n", 498 | "0 2019-01-07 2793955 199 CHARGE 64.0 \n", 499 | "1 2019-05-07 2793955 199 RECURRING 64.0 \n", 500 | "2 2019-03-07 2793955 199 RECURRING 64.0 \n", 501 | "\n", 502 | " prev_plan_id status status_reason currency invoice_charge_amount_usd \\\n", 503 | "0 NaN NaN NaN AUD 44.67 \n", 504 | "1 199.0 NaN NaN AUD 43.94 \n", 505 | "2 199.0 NaN NaN AUD 44.20 \n", 506 | "\n", 507 | " mrr_gain subscription_id next_charge_date payment_type \\\n", 508 | "0 46.0 65984302.0 2019-02-07 21:05:00 CC \n", 509 | "1 0.0 65984302.0 2019-06-07 20:05:00 CC \n", 510 | "2 0.0 65984302.0 2019-04-07 20:05:00 CC \n", 511 | "\n", 512 | " transaction_date \n", 513 | "0 2019-01-07 21:05:00 \n", 514 | "1 2019-05-07 20:34:00 \n", 515 | "2 2019-03-07 21:33:00 " 516 | ] 517 | }, 518 | "execution_count": 10, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "subs.head(3)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 11, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "users = get_users()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 12, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/html": [ 544 | "
HTML table: 3 rows × 25 columns
" 662 | ], 663 | "text/plain": [ 664 | " account_id user_id created_at is_admin pending enabled \\\n", 665 | "0 2793496 6181341 2018-12-31 1 0 1 \n", 666 | "1 2793497 6181339 2018-12-31 1 0 1 \n", 667 | "2 2793497 6181386 2019-01-01 0 0 1 \n", 668 | "\n", 669 | " became_active_at time_diff city region ... \\\n", 670 | "0 2019-01-01 11.0 Warrawee New South Wales ... \n", 671 | "1 2019-01-01 -5.0 Old Bridge New Jersey ... \n", 672 | "2 2019-01-01 -5.0 New York New York ... \n", 673 | "\n", 674 | " browser is_gmail campaign_id first_user_in_account_id LANGUAGE \\\n", 675 | "0 microsoft edge 1 4005514.0 29837820.0 NaN \n", 676 | "1 NaN 1 4005516.0 29837847.0 NaN \n", 677 | "2 NaN 1 4005619.0 29839571.0 NaN \n", 678 | "\n", 679 | " gender seniority title mobile_activation_date has_phone \n", 680 | "0 M NaN NaN NaN 1 \n", 681 | "1 F NaN NaN 2019-01-01 1 \n", 682 | "2 M NaN NaN 2019-01-01 1 \n", 683 | "\n", 684 | "[3 rows x 25 columns]" 685 | ] 686 | }, 687 | "execution_count": 12, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "users.head(3)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [] 702 | } 703 | ], 704 | "metadata": { 705 | "kernelspec": { 706 | "display_name": "py3", 707 | "language": "python", 708 | "name": "py3" 709 | }, 710 | "language_info": { 711 | "codemirror_mode": { 712 | "name": "ipython", 713 | "version": 3 714 | }, 715 | "file_extension": ".py", 716 | "mimetype": "text/x-python", 717 | "name": "python", 718 | "nbconvert_exporter": "python", 719 | "pygments_lexer": "ipython3", 720 | "version": "3.6.5" 721 | } 722 | }, 723 | "nbformat": 4, 724 | "nbformat_minor": 2 725 | } 726 | --------------------------------------------------------------------------------