├── .gitignore
├── Intro to Machine Learning.pdf
├── README.rst
├── data
├── Dynamic events table.csv
├── Dynamic subscription table.csv
├── accounts.csv
├── monday_datalearn.csv
└── users.csv
├── datalearn19intro
├── LICENSE
├── README.rst
├── datalearn19intro
│ ├── __init__.py
│ └── dataloader.py
├── mit_license_badge.svg
└── setup.py
├── part_1.introducing_jupyter.ipynb
├── part_2.numpy.ipynb
├── part_3.pandas.ipynb
├── part_4.EDA.ipynb
├── part_5.Preprocessing.ipynb
├── part_6.modeling.ipynb
└── util_0.reading_the_data.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
125 |
126 | # vim swap files
127 | *.swp
128 |
129 | .DS_Store
130 |
--------------------------------------------------------------------------------
/Intro to Machine Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataHackIL/DataLearn-ML-Intro-2019/614ed306726f2b5f073b7da2d621069ecbd26023/Intro to Machine Learning.pdf
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | DataLearn Supervised ML Intro 2019
2 | ##################################
3 |
4 | The repository of the hands-on introduction to machine learning workshop of the DataLearn 2019 track at DataHack 2019.
5 |
6 | Video link: https://youtu.be/Su8YcXgkDsk?t=1701
7 |
8 | `Meetup event link `_
9 |
10 | Notebooks by Shay Palachy. Presentation by Shay Palachy and Dana Kaner. *(Thank you Dana <3)*
11 |
12 | Resources
13 | =========
14 |
15 | * Presentation:
16 |
17 | * `Intro to Machine Learning presentation `_
18 | * Credits: Shay Palachy **and Dana Kaner**
19 |
20 | * Video recording: https://youtu.be/Su8YcXgkDsk?t=1701
21 |
22 | * Notebooks:
23 |
24 | 1. `Introducing Jupyter notebooks `_
25 | 2. `Introduction to numpy `_
26 | 3. `Introduction to pandas `_
27 | 4. `Exploratory Data Analysis `_
28 | 5. `Preprocessing `_
29 | 6. `Modeling `_
30 | 7. `Utility: Reading the data `_
31 |
32 |
33 | Outline
34 | =======
35 |
36 | * Tools of the trade
37 |
38 | * Jupyter notebooks
39 | * numpy
40 | * pandas
41 |
42 | * Data exploration
43 | * Preprocessing
44 |
45 | * Imputation
46 | * Scaling and normalization
47 | * Handling outliers
48 | * Feature extraction/generation
49 | * Feature selection
50 | * Dimensionality reduction
51 |
52 | * Modeling
53 |
54 | * Model fit & loss functions
55 | * Splitting your data
56 | * Model evaluation
57 | * Hyperparameter Optimization
58 |
--------------------------------------------------------------------------------
/data/Dynamic subscription table.csv:
--------------------------------------------------------------------------------
1 | "event_date","account_id","plan_id","event_type","invoice_charge_amount","prev_plan_id","status","status_reason","currency","invoice_charge_amount_usd","mrr_gain","subscription_id","next_charge_date","payment_type","transaction_date"
2 | "2019-01-07",2793955,199,"CHARGE",64.00,NULL,"","","AUD",44.67,46.00,65984302,"2019-02-07 21:05:00","CC","2019-01-07 21:05:00"
3 | "2019-05-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.94,0.00,65984302,"2019-06-07 20:05:00","CC","2019-05-07 20:34:00"
4 | "2019-03-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.20,0.00,65984302,"2019-04-07 20:05:00","CC","2019-03-07 21:33:00"
5 | "2019-06-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.62,0.00,65984302,"2019-07-07 20:05:00","CC","2019-06-07 20:34:00"
6 | "2019-04-10",2793955,199,"PAYMENT_METHOD_UPDATED",0.00,199,"","","",0.00,NULL,65984302,NULL,"CC","2019-04-10 02:32:00"
7 | "2019-02-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.52,-1.00,65984302,"2019-03-07 21:05:00","CC","2019-02-07 21:33:00"
8 | "2019-04-07",2793955,199,"RECURRING",64.00,199,"","","AUD",44.54,0.00,65984302,"2019-05-07 20:05:00","CC","2019-04-07 20:23:00"
9 | "2019-07-07",2793955,199,"RECURRING",64.00,199,"","","AUD",43.91,0.00,65984302,"2019-08-07 20:05:00","CC","2019-07-07 20:01:00"
10 | "2019-01-07",2794172,198,"CHARGE",29.00,NULL,"","","USD",29.00,29.00,65925498,"2019-02-07 07:09:00","CC","2019-01-07 07:09:00"
11 | "2019-01-28",2794172,198,"CANCEL_ON_RENEWAL",0.00,198,"","","USD",0.00,NULL,65925498,"2019-02-07 07:09:00","CC","2019-01-28 07:27:00"
12 | "2019-02-07",2794172,198,"CANCELLATION",0.00,198,"","","USD",0.00,-29.00,65925498,"2019-02-07 07:09:00","CC","2019-02-07 07:32:00"
13 | "2019-01-28",2794489,231,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","USD",0.00,-20.00,66852410,"2020-01-28 15:02:00","CC","2019-01-28 15:02:00"
14 | "2019-01-28",2794489,232,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-03-28 07:00:00","",NULL
15 | "2019-01-28",2794489,231,"RECURRING",0.00,231,"","","USD",0.00,NULL,66852410,"2020-01-28 15:02:00","BALANCE","2019-01-28 15:02:00"
16 | "2019-01-29",2794489,232,"REFUND",-258.72,232,"","","USD",-258.72,NULL,66852410,"2020-01-28 15:02:00","CC","2019-01-29 09:51:00"
17 | "2019-01-29",2794489,231,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-01-17 08:00:00","",NULL
18 | "2019-01-28",2794489,232,"RECURRING",258.72,232,"","","USD",258.72,NULL,66852410,"2020-01-28 14:40:00","CC","2019-01-28 14:40:00"
19 | "2019-01-17",2794489,231,"CHARGE",468.00,NULL,"","","USD",468.00,39.00,66852410,"2020-01-17 08:51:00","CC","2019-01-17 08:51:00"
20 | "2019-01-28",2794489,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","USD",0.00,20.00,66852410,"2020-01-28 14:40:00","CC","2019-01-28 14:40:00"
21 | "2019-01-14",2793704,231,"CHARGE",624.00,NULL,"","","CAD",460.64,39.00,66569204,"2020-01-14 15:16:00","CC","2019-01-14 15:16:00"
22 | "2019-01-15",2793906,239,"CHARGE",1428.00,NULL,"","","USD",1428.00,119.00,66641482,"2020-01-15 04:57:00","CC","2019-01-15 04:57:00"
23 | "2019-01-15",2793906,239,"CC_CHARGE_FAILED",0.00,239,"","This transaction has been declined. Please check card details and try again, or contact your bank for assistance","USD",0.00,NULL,NULL,NULL,"CC","2019-01-15 04:56:00"
24 | "2019-01-03",2794458,232,"RECURRING",291.01,232,"","","USD",291.01,NULL,65496398,"2020-01-03 16:20:00","CC","2019-01-03 16:20:00"
25 | "2019-01-03",2794458,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","USD",0.00,24.00,65496398,"2020-01-03 16:20:00","CC","2019-01-03 16:20:00"
26 | "2019-01-02",2794458,231,"CHARGE",421.20,NULL,"","","USD",421.20,35.00,65496398,"2020-01-02 15:46:00","CC","2019-01-02 15:46:00"
27 | "2019-07-22",2794064,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","AUD",0.00,19.00,66092630,"2020-07-22 07:33:00","CC","2019-07-22 07:33:00"
28 | "2019-01-08",2794064,231,"CHARGE",624.00,NULL,"","","AUD",436.18,37.00,66092630,"2020-01-08 22:31:00","CC","2019-01-08 22:31:00"
29 | "2019-07-22",2794064,232,"RECURRING",660.96,232,"","","AUD",455.81,NULL,66092630,"2020-07-22 07:33:00","CC","2019-07-22 07:33:00"
30 | "2019-01-19",2794278,199,"CANCEL_ON_RENEWAL",0.00,199,"","","USD",0.00,NULL,65670660,"2019-02-04 10:17:00","CC","2019-01-19 14:45:00"
31 | "2019-02-04",2794278,199,"CANCELLATION",0.00,199,"","","USD",0.00,-48.00,65670660,"2019-02-04 10:17:00","CC","2019-02-04 10:33:00"
32 | "2019-01-04",2794278,199,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,65670660,"2019-02-04 10:17:00","CC","2019-01-04 10:17:00"
33 | "2019-01-21",2793789,231,"CHARGE",468.00,NULL,"","","USD",468.00,39.00,67146528,"2020-01-21 06:06:00","CC","2019-01-21 06:06:00"
34 | "2019-01-18",2793567,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","CAD",0.00,20.00,66577666,"2020-01-18 15:09:00","CC","2019-01-18 15:09:00"
35 | "2019-03-20",2793567,264,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","CAD",0.00,-10.00,66577666,"2021-03-20 20:38:00","CC","2019-03-20 20:38:00"
36 | "2019-01-18",2793567,232,"RECURRING",336.48,232,"","","CAD",248.73,NULL,66577666,"2020-01-18 15:09:00","CC","2019-01-18 15:09:00"
37 | "2019-01-18",2793567,232,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2020-03-18 07:00:00","",NULL
38 | "2019-03-20",2793567,264,"RECURRING",797.16,264,"","","CAD",585.82,NULL,66577666,"2021-03-20 20:38:00","CC","2019-03-20 20:38:00"
39 | "2019-01-14",2793567,231,"CHARGE",624.00,NULL,"","","CAD",460.64,39.00,66577666,"2020-01-14 16:33:00","CC","2019-01-14 16:33:00"
40 | "2019-03-20",2793567,264,"FREE_DAYS_WERE_GIVEN",0.00,NULL,"","","",0.00,NULL,NULL,"2021-05-20 07:00:00","",NULL
41 | "2019-01-11",2793828,230,"CHARGE",276.00,NULL,"","","USD",276.00,23.00,66303986,"2020-01-11 03:35:00","CC","2019-01-11 03:35:00"
42 | "2019-01-15",2794328,198,"CHARGE",25.00,NULL,"","","GBP",31.48,32.00,66658322,"2019-02-15 12:38:00","CC","2019-01-15 12:38:00"
43 | "2019-04-30",2794328,202,"REFUND",-51.00,202,"","","GBP",-64.38,NULL,66658322,"2019-05-28 09:13:00","CC","2019-04-30 10:08:00"
44 | "2019-01-30",2794328,202,"CONTRACT_CHANGE",0.00,198,"SUCCESS","","GBP",0.00,35.00,66658322,"2019-02-28 10:13:00","CC","2019-01-30 10:13:00"
45 | "2019-04-30",2794328,202,"CANCEL_ON_RENEWAL",0.00,202,"","","GBP",0.00,NULL,66658322,"2019-05-28 09:13:00","CC","2019-04-30 09:49:00"
46 | "2019-04-28",2794328,202,"RECURRING",51.00,202,"","","GBP",64.38,-1.00,66658322,"2019-05-28 09:13:00","CC","2019-04-28 11:17:00"
47 | "2019-02-28",2794328,202,"RECURRING",51.00,202,"","","GBP",66.56,1.00,66658322,"2019-03-28 09:13:00","CC","2019-02-28 10:37:00"
48 | "2019-01-30",2794328,202,"RECURRING",38.25,202,"","","GBP",49.28,NULL,66658322,"2019-02-28 10:13:00","CC","2019-01-30 10:13:00"
49 | "2019-04-30",2794328,202,"CANCELLATION",0.00,202,"","","GBP",0.00,-66.00,66658322,"2019-05-28 09:13:00","CC","2019-04-30 10:08:00"
50 | "2019-03-28",2794328,202,"RECURRING",51.00,202,"","","GBP",66.12,-1.00,66658322,"2019-04-28 09:13:00","CC","2019-03-28 09:24:00"
51 | "2019-01-17",2793924,264,"CONTRACT_CHANGE",0.00,263,"SUCCESS","","USD",0.00,17.00,66839172,"2021-01-17 13:06:00","CC","2019-01-17 13:06:00"
52 | "2019-01-17",2793924,263,"CHARGE",768.00,NULL,"","","USD",768.00,32.00,66839172,"2021-01-17 03:06:00","CC","2019-01-17 03:06:00"
53 | "2019-01-17",2793924,264,"RECURRING",408.00,264,"","","USD",408.00,NULL,66839172,"2021-01-17 13:06:00","CC","2019-01-17 13:06:00"
54 | "2019-06-29",2793924,268,"CONTRACT_CHANGE",0.00,264,"SUCCESS","","USD",0.00,50.00,66839172,"2021-06-29 10:08:00","CC","2019-06-29 10:08:00"
55 | "2019-06-29",2793924,268,"RECURRING",1470.48,268,"","","USD",1470.48,NULL,66839172,"2021-06-29 10:08:00","CC","2019-06-29 10:08:00"
56 | "2019-01-12",2794423,232,"CHARGE",708.00,NULL,"","","USD",708.00,59.00,66415970,"2020-01-12 13:22:00","CC","2019-01-12 13:22:00"
57 | "2019-02-28",2794463,818,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,70971926,"2019-03-28 21:07:00","CC","2019-02-28 22:07:00"
58 | "2019-04-03",2794463,200,"RECURRING",33.60,200,"","","USD",33.60,NULL,70971926,"2019-05-03 10:11:00","CC","2019-04-03 10:11:00"
59 | "2019-03-28",2794463,818,"RECURRING",48.00,818,"","","USD",48.00,0.00,70971926,"2019-04-28 21:07:00","CC","2019-03-28 21:23:00"
60 | "2019-05-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-06-03 10:11:00","CC","2019-05-03 10:34:00"
61 | "2019-07-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-08-03 10:11:00","CC","2019-07-03 10:01:00"
62 | "2019-06-03",2794463,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,70971926,"2019-07-03 10:11:00","CC","2019-06-03 10:33:00"
63 | "2019-04-03",2794463,200,"CONTRACT_CHANGE",0.00,818,"SUCCESS","","USD",0.00,24.00,70971926,"2019-05-03 10:11:00","CC","2019-04-03 10:11:00"
64 | "2019-01-09",2794383,232,"RECURRING",240.39,232,"","","GBP",300.52,NULL,65941602,"2020-01-09 09:04:00","CC","2019-01-09 09:04:00"
65 | "2019-01-30",2794383,236,"CONTRACT_CHANGE",0.00,232,"SUCCESS","","GBP",0.00,69.00,65941602,"2020-01-30 10:29:00","CC","2019-01-30 10:29:00"
66 | "2019-01-30",2794383,236,"RECURRING",648.72,236,"","","GBP",835.92,NULL,65941602,"2020-01-30 10:29:00","CC","2019-01-30 10:29:00"
67 | "2019-01-07",2794383,231,"CHARGE",375.36,NULL,"","","GBP",468.67,40.00,65941602,"2020-01-07 14:01:00","CC","2019-01-07 14:01:00"
68 | "2019-01-09",2794383,232,"CONTRACT_CHANGE",0.00,231,"SUCCESS","","GBP",0.00,25.00,65941602,"2020-01-09 09:04:00","CC","2019-01-09 09:04:00"
69 | "2019-07-12",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-12 16:03:00"
70 | "2019-07-07",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-07 16:02:00"
71 | "2019-04-17",2794060,200,"PAYMENT_METHOD_UPDATED",0.00,200,"","","",0.00,NULL,66570782,NULL,"CC","2019-04-17 19:40:00"
72 | "2019-04-17",2794060,200,"PAYMENT_METHOD_UPDATED",0.00,200,"","","",0.00,NULL,66570782,NULL,"CC","2019-04-17 18:38:00"
73 | "2019-04-17",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-05-02 15:41:00","CC","2019-04-17 19:40:00"
74 | "2019-07-16",2794060,199,"RECURRING",48.00,199,"","","USD",48.00,NULL,66570782,"2019-08-16 20:15:00","CC","2019-07-16 20:15:00"
75 | "2019-02-02",2794060,200,"RECURRING",53.76,200,"","","USD",53.76,NULL,66570782,"2019-03-02 16:41:00","PAYPAL","2019-02-02 16:41:00"
76 | "2019-01-14",2794060,199,"CHARGE",48.00,NULL,"","","USD",48.00,48.00,66570782,"2019-02-14 15:33:00","PAYPAL","2019-01-14 15:34:00"
77 | "2019-02-02",2794060,200,"CONTRACT_CHANGE",0.00,199,"SUCCESS","","USD",0.00,24.00,66570782,"2019-03-02 16:41:00","PAYPAL","2019-02-02 16:41:00"
78 | "2019-07-03",2794060,232,"CONTRACT_CHANGE",0.00,200,"FAILED","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","PAYPAL","2019-07-03 17:08:00"
79 | "2019-06-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-07-02 15:41:00","CC","2019-06-02 15:34:00"
80 | "2019-04-17",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","This transaction has been declined. Please try a different card or contact the credit card provider for assistance:[Do not Honour] (PV-05)","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","CC","2019-04-17 18:38:00"
81 | "2019-07-16",2794060,199,"CONTRACT_CHANGE",0.00,200,"SUCCESS","","USD",0.00,-24.00,66570782,"2019-08-16 20:15:00","CC","2019-07-16 20:15:00"
82 | "2019-07-16",2794060,200,"RECURRING",24.00,200,"","","USD",24.00,NULL,66570782,"2019-08-16 20:17:00","CC","2019-07-16 20:17:00"
83 | "2019-04-09",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","10417: Transaction cannot complete.","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","PAYPAL","2019-04-09 16:24:00"
84 | "2019-04-17",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","This transaction has been declined. Please try a different card or contact the credit card provider for assistance:[Do not Honour] (PV-05)","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","CC","2019-04-17 19:23:00"
85 | "2019-07-16",2794060,200,"CONTRACT_CHANGE",0.00,199,"SUCCESS","","USD",0.00,24.00,66570782,"2019-08-16 20:17:00","CC","2019-07-16 20:17:00"
86 | "2019-05-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-06-02 15:41:00","CC","2019-05-02 15:34:00"
87 | "2019-07-02",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","Insufficient funds. Please use another card or contact your bank for assistance (PV-51)","USD",0.00,NULL,66570782,"2019-07-02 15:41:00","CC","2019-07-02 15:03:00"
88 | "2019-04-02",2794060,200,"SUBSCRIPTION_CHARGE_FAILURE",0.00,200,"","10417: Transaction cannot complete.","USD",0.00,NULL,66570782,"2019-04-02 15:41:00","PAYPAL","2019-04-02 15:24:00"
89 | "2019-03-02",2794060,200,"RECURRING",72.00,200,"","","USD",72.00,0.00,66570782,"2019-04-02 15:41:00","PAYPAL","2019-03-02 16:34:00"
90 | "2019-01-10",2793508,200,"CHARGE",72.00,NULL,"","","USD",72.00,72.00,66280160,"2019-02-10 22:16:00","PAYPAL","2019-01-10 22:16:00"
91 | "2019-02-10",2793508,200,"CANCELLATION",0.00,200,"","","USD",0.00,-72.00,66280160,"2019-02-10 22:16:00","PAYPAL","2019-02-10 22:33:00"
92 | "2019-02-06",2793508,200,"CANCEL_ON_RENEWAL",0.00,200,"","","USD",0.00,NULL,66280160,"2019-02-10 22:16:00","PAYPAL","2019-02-06 11:07:00"
93 | "2019-06-21",2793730,234,"RECURRING",420.00,234,"","","USD",420.00,NULL,66630878,"2020-06-21 10:03:00","CC","2019-06-21 10:03:00"
94 | "2019-06-21",2793730,234,"CONTRACT_CHANGE",0.00,230,"SUCCESS","","USD",0.00,24.00,66630878,"2020-06-21 10:03:00","CC","2019-06-21 10:03:00"
95 | "2019-01-15",2793730,230,"CHARGE",300.00,NULL,"","","USD",300.00,25.00,66630878,"2020-01-15 01:34:00","CC","2019-01-15 01:35:00"
96 |
--------------------------------------------------------------------------------
/datalearn19intro/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Shay Palachy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/datalearn19intro/README.rst:
--------------------------------------------------------------------------------
1 | datalearn19intro
2 | ################
3 | |PyPI-Status| |PyPI-Versions| |LICENCE|
4 |
5 | Helper code for DataLearn 2019 ML Intro Workshop.
6 |
7 | .. code-block:: python
8 |
9 | from datalearn19intro import get_accounts
10 | accounts = get_accounts()
11 |
12 | .. contents::
13 |
14 | .. section-numbering::
15 |
16 | Installation
17 | ============
18 |
19 | Install ``datalearn19intro`` with:
20 |
21 | .. code-block:: bash
22 |
23 | pip install datalearn19intro
24 |
25 |
26 | Credits
27 | =======
28 | Created by Shay Palachy (shay.palachy@gmail.com).
29 |
30 | .. alternative:
31 | .. https://badge.fury.io/py/yellowbrick.svg
32 |
33 | .. |PyPI-Status| image:: https://img.shields.io/pypi/v/datalearn19intro.svg
34 | :target: https://pypi.org/project/datalearn19intro
35 |
36 | .. |PyPI-Versions| image:: https://img.shields.io/pypi/pyversions/datalearn19intro.svg
37 | :target: https://pypi.org/project/datalearn19intro
38 |
39 | .. |LICENCE| image:: https://img.shields.io/badge/License-MIT-yellow.svg
40 | :target: https://pypi.python.org/pypi/datalearn19intro
41 |
--------------------------------------------------------------------------------
/datalearn19intro/datalearn19intro/__init__.py:
--------------------------------------------------------------------------------
1 | from .dataloader import ( # noqa: F401
2 | get_accounts,
3 | get_users,
4 | get_events,
5 | get_subscriptions,
6 | get_processed_intro_dataset,
7 | )
8 |
--------------------------------------------------------------------------------
/datalearn19intro/datalearn19intro/dataloader.py:
--------------------------------------------------------------------------------
1 | """Data loading code for DataLearn prep night workshop."""
2 |
3 | import pip
4 | import subprocess
5 |
6 | import pandas as pd
7 |
# Detect whether we are running on Google Colab by probing for the
# ``google.colab`` package; IN_COLAB is True only if the import succeeds.
try:
    import google.colab  # noqa: F401
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
14 |
15 |
def in_notebook():
    """Report whether the code is running under an IPython kernel.

    Returns:
        bool: True only when IPython can be imported, an interactive shell
        is active, and that shell's config contains an 'IPKernelApp' entry;
        False in every other case.
    """
    try:
        from IPython import get_ipython
    except ImportError:
        # IPython is not installed, so this cannot be a notebook.
        return False

    shell = get_ipython()
    # get_ipython() returns None when IPython is installed but no interactive
    # shell is running (e.g. a plain `python script.py`); accessing `.config`
    # on it would raise AttributeError instead of answering the question.
    if shell is None:
        return False
    return 'IPKernelApp' in shell.config  # pragma: no cover
25 |
26 |
def pipinstall(package):
    """Install ``package`` with pip into the running interpreter's environment.

    pip does not offer a supported in-process API (``pip.main`` and
    ``pip._internal.main`` are internal and have moved between releases), so
    pip is invoked as a subprocess of the current interpreter instead.

    Args:
        package (str): The requirement to install, as it would be given to
            ``pip install`` on the command line.

    Returns:
        None. A failed installation is not raised as an exception; the pip
        exit status is ignored, as it was when pip's entry point was called
        directly.
    """
    import sys  # local import: not part of this module's import header

    # `sys.executable -m pip` guarantees the package lands in the environment
    # of the interpreter that is actually running this code.
    subprocess.call([sys.executable, '-m', 'pip', 'install', package])
32 |
33 |
# Module-level cache for the authenticated PyDrive client. It stays None
# until gdrive_authenticate() has completed successfully once.
GDRIVE = None


def gdrive_authenticate():
    """Install PyDrive, authenticate the user and cache a Google Drive client.

    On the first call this upgrades/installs PyDrive with pip, runs
    ``google.colab.auth.authenticate_user()``, builds a ``GoogleAuth`` object
    from the application-default credentials and stores the resulting
    ``GoogleDrive`` client in the module-level ``GDRIVE``. Later calls return
    immediately because the client is already cached.

    Requires ``google.colab`` to be importable.

    Returns:
        None. The client is exposed through the ``GDRIVE`` global.
    """
    global GDRIVE
    if GDRIVE is not None:
        # Already authenticated during this session; nothing to do.
        return

    import sys  # local import: not part of this module's import header

    print('Installing PyDrive...')
    # Run pip through the current interpreter (equivalent to the notebook
    # command `!pip install -U -q PyDrive`) so the package is installed for
    # the Python that is executing this code, not for whichever `pip`
    # executable happens to be first on PATH.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-U", "-q", "PyDrive"])

    # These imports must come after the installation above.
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials

    # Authenticate and create the PyDrive client.
    # This only needs to be done once per notebook.
    print('Authenticating with Google Drive...')
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    GDRIVE = GoogleDrive(gauth)
57 |
58 |
def _get_file(fname, id):
    """Load a workshop CSV file into a pandas DataFrame.

    Outside Colab the file is read from the relative ``data/`` directory.
    On Colab it is first downloaded from Google Drive into the working
    directory under the name ``fname`` and then read from there.

    Args:
        fname (str): File name of the CSV.
        id (str): Google Drive file id used for the download on Colab (it is
            the id visible in the file's "get shareable link" URL).

    Returns:
        The object returned by ``pd.read_csv`` for the file.
    """
    if not IN_COLAB:
        # Local run: the data files are expected under ./data.
        return pd.read_csv('data/{}'.format(fname))

    gdrive_authenticate()
    print("Downloading {} from Google Drive...".format(fname))
    remote_file = GDRIVE.CreateFile({'id': id})
    remote_file.GetContentFile(fname)
    print("Done.")
    return pd.read_csv(fname)
70 |
71 |
def get_accounts():
    """Load 'accounts.csv' through ``_get_file`` and return the result."""
    fname = 'accounts.csv'
    gdrive_id = '1SFFGL_FIq3-l6CP9MTe9ueuLRMz_tvrw'
    return _get_file(fname, gdrive_id)
74 |
75 |
def get_users():
    """Load 'users.csv' through ``_get_file`` and return the result."""
    fname = 'users.csv'
    gdrive_id = '1fG6ebyTaWWOVRFHw9svNjgJLYdUcu5th'
    return _get_file(fname, gdrive_id)
78 |
79 |
def get_events():
    """Load 'Dynamic events table.csv' through ``_get_file``."""
    fname = 'Dynamic events table.csv'
    gdrive_id = '1Gv0Z_IJ1kBwuUnPDkpgFM8mK1dGeTNi4'
    return _get_file(fname, gdrive_id)
83 |
84 |
def get_subscriptions():
    """Load 'Dynamic subscription table.csv' through ``_get_file``."""
    fname = 'Dynamic subscription table.csv'
    gdrive_id = '1qC0VOpUkZo4O4lggzp45YcNxC7NXY4VV'
    return _get_file(fname, gdrive_id)
88 |
89 |
def get_processed_intro_dataset():
    """Load 'monday_datalearn.csv' through ``_get_file``."""
    fname = 'monday_datalearn.csv'
    gdrive_id = '1W2D192QF_LIixPws1mj57C6OBNSxILFI'
    return _get_file(fname, gdrive_id)
93 |
--------------------------------------------------------------------------------
/datalearn19intro/mit_license_badge.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datalearn19intro/setup.py:
--------------------------------------------------------------------------------
1 | """Setup for the datalearn19intro package."""
2 |
3 | # !/usr/bin/env python
4 | # -*- coding: utf-8 -*-
5 |
6 | import setuptools
7 |
8 |
# Runtime dependencies of the helper package.
INSTALL_REQUIRES = [
    'numpy',
    'pandas'
]

# Read the long description with an explicit encoding: the default used by
# open() depends on the platform locale, which can make the build fail or
# garble text on systems whose default is not UTF-8.
with open('README.rst', encoding='utf-8') as f:
    README = f.read()

setuptools.setup(
    author="Shay Palachy",
    author_email="shay.palachy@gmail.com",
    name='datalearn19intro',
    license="MIT",
    description='Helper code for DataLearn 2019 ML Intro Workshop.',
    version='v0.0.5',
    # cmdclass=versioneer.get_cmdclass(),
    long_description=README,
    url='https://github.com/DataHackIL/DataLearn-ML-Intro-2019',
    packages=setuptools.find_packages(),
    include_package_data=True,
    python_requires=">=3.5",
    install_requires=INSTALL_REQUIRES,
    # extras_require={
    #     'test': TEST_REQUIRES + INSTALL_REQUIRES,
    # },
    classifiers=[
        # Trove classifiers
        # (https://pypi.python.org/pypi?%3Aaction=list_classifiers)
        'Development Status :: 4 - Beta',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Intended Audience :: Developers',
    ],
)
48 |
--------------------------------------------------------------------------------
/part_1.introducing_jupyter.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.5"},"colab":{"name":"part_1.introducing_jupyter.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"markdown","metadata":{"id":"dXQRvMNhxdND","colab_type":"text"},"source":["# Part 1: Introducing the Jupyter Notebook"]},{"cell_type":"markdown","metadata":{"id":"Vdp7sdXHxdNF","colab_type":"text"},"source":["## What is the Jupyter Notebook?"]},{"cell_type":"markdown","metadata":{"id":"XxajIhHfxdNG","colab_type":"text"},"source":["The Jupyter Notebook is an interactive computing environment that enables users to author notebook documents that include:\n","\n","* Live code\n","* Interactive widgets\n","* Plots\n","* Narrative text\n","* Equations\n","* Images\n","* Video\n","\n","These documents provide a complete and self-contained record of a computation that can be converted to various formats and shared with others using email, Dropbox, version control systems (like git/GitHub) or nbviewer.jupyter.org."]},{"cell_type":"markdown","metadata":{"id":"FraUpL-JxdNH","colab_type":"text"},"source":["## Components\n","The Jupyter Notebook combines three components:\n","\n","* **The notebook web application**: An interactive web application for writing and running code interactively and authoring notebook documents.\n","* **Kernels**: Separate processes started by the notebook web application that runs users' code in a given language and returns output back to the notebook web application. 
The kernel also handles things like computations for interactive widgets, tab completion and introspection.\n","* **Notebook documents**: Self-contained documents that contain a representation of all content visible in the notebook web application, including inputs and outputs of the computations, narrative text, equations, images, and rich media representations of objects. Each notebook document has its own kernel.\n","\n","This enables the user to both **edit** and **run** code in the browser."]},{"cell_type":"markdown","metadata":{"id":"8cfyGFfPxdNI","colab_type":"text"},"source":["## Kernels\n","Through Jupyter's kernel and messaging architecture, the Notebook allows code to be run in a range of different programming languages. For each notebook document that a user opens, the web application starts a kernel that runs the code for that notebook. Each kernel is capable of running code in a single programming language and there are kernels available in the following languages:\n","\n","* Python (https://github.com/ipython/ipython)\n","* Julia (https://github.com/JuliaLang/IJulia.jl)\n","* R (https://github.com/takluyver/IRkernel)\n","* Ruby (https://github.com/minrk/iruby)\n","* Haskell (https://github.com/gibiansky/IHaskell)\n","* Scala (https://github.com/Bridgewater/scala-notebook)\n","* node.js (https://gist.github.com/Carreau/4279371)\n","* Go (https://github.com/takluyver/igo)\n","\n","The default kernel runs Python code. The notebook provides a simple way for users to pick which of these kernels is used for a given notebook."]},{"cell_type":"markdown","metadata":{"id":"D5Q2fTa5xdNJ","colab_type":"text"},"source":["## Notebook cells"]},{"cell_type":"markdown","metadata":{"id":"s-9D3V3hxdNK","colab_type":"text"},"source":["A Jupyter notebook is made up of consecutive cells. 
There are two basic types of cells:\n","* Markdown cells\n","* Code cells"]},{"cell_type":"markdown","metadata":{"id":"xF9L_V12xdNK","colab_type":"text"},"source":["You can turn a cell into a markdown cell by pressing `m` when it is selected (but the cursor is **not** inside it). To turn it to a code cell use `y`."]},{"cell_type":"markdown","metadata":{"id":"2a6sSfdkxdNL","colab_type":"text"},"source":["### More commands:\n","* 'Escape' outside of a cell using `Esc`.\n","* Edit the current cell using `Enter`.\n","* Execute a cell using `shift`+`Enter`.\n","\n","In Jupyter Notebooks (but not on *colab*):\n","* Add a cell above the current cell using `a`.\n","* Add a cell below the current cell using `b`.\n","* Copy the current cell using `c`.\n","* Delete the current cell using `dd`."]},{"cell_type":"markdown","metadata":{"id":"FIx3pfbtxdNL","colab_type":"text"},"source":["## Markdown Cells\n","\n","This cell is a markdown cell. This means you can use markdown to write *italic text* by surrounding your text with *asterisks* or _underscores_.\n","Strong emphasis, aka bold, with **asterisks** or __underscores__.\n","Combined emphasis with **asterisks and _underscores_**.\n","Strikethrough uses two tildes. ~~Scratch this.~~\n","\n","Here be headers:\n","# H1\n","## H2\n","### etc...\n","\n","You can create bulleted lists using\n","* Asterisks\n","- Or minuses\n","+ Or pluses\n","\n","### Ordered lists\n","1. First ordered list item\n","2. Another item\n"," * Unordered sub-list. \n","1. Actual numbers don't matter, just that it's a number\n"," 1. Ordered sub-list.\n"," \n"," An un-numbered indented paragraph.\n","4. And another item.\n","\n","[And of course add links](https://www.google.com)\n","\n","You can find a nice full markdown cheatsheet here: https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet"]},{"cell_type":"markdown","metadata":{"id":"XIcvwtmyxdNM","colab_type":"text"},"source":["
**NOTICE**
\n","\n","
HTML tags can also be used inside markdown cells.
\n",""]},{"cell_type":"markdown","metadata":{"id":"nGFegaYLxdNN","colab_type":"text"},"source":["## Code Cells"]},{"cell_type":"code","metadata":{"id":"c3DJV5VrxdNN","colab_type":"code","colab":{}},"source":["# this is a code cell\n","a = 5"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"11R4NJYYxdNR","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"9c4c7195-ac75-4e6e-9649-b7c622decabe","executionInfo":{"status":"ok","timestamp":1565864335301,"user_tz":-180,"elapsed":531,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a line containing only an expression will print it\n","a"],"execution_count":4,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":4}]},{"cell_type":"code","metadata":{"id":"Z0EcqEDvxdNU","colab_type":"code","colab":{}},"source":["# unless it ends with a semicolon\n","a;"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"2G5MWI9MxdNW","colab_type":"code","colab":{}},"source":["# you can also define functions\n","def foo(a, b):\n"," \"\"\"Foo documentation.\"\"\"\n"," return a*5 + 2/b"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"0bhzFsNFxdNY","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"5efcad27-e813-4c75-c1b3-3f6fbbb3782f","executionInfo":{"status":"ok","timestamp":1565864343265,"user_tz":-180,"elapsed":745,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# and then use 
them!\n","foo(5,8)"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["25.25"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"nbpbD7pxxdNb","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"a3da4249-5f4c-4990-d70a-837ad0b2f406","executionInfo":{"status":"ok","timestamp":1565864346647,"user_tz":-180,"elapsed":535,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# you can also use IPython magic functions!\n","%time numbers = [x for x in range(1000000)]"],"execution_count":8,"outputs":[{"output_type":"stream","text":["CPU times: user 47.6 ms, sys: 45.9 ms, total: 93.5 ms\n","Wall time: 102 ms\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"ppBPkqUFxdNd","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"1e3ba24e-5252-4abd-85e4-965918cd6b42","executionInfo":{"status":"ok","timestamp":1565864352232,"user_tz":-180,"elapsed":3315,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["%timeit numbers = [x for x in range(100000)]"],"execution_count":9,"outputs":[{"output_type":"stream","text":["100 loops, best of 3: 6.29 ms per loop\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"QCf-fiySxdNf","colab_type":"text"},"source":["It is also easy to display plots inside a Jupyter notebook. We'll see this later."]},{"cell_type":"code","metadata":{"id":"uCjbABubxdNg","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]}
--------------------------------------------------------------------------------
/part_2.numpy.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.6.5"},"colab":{"name":"part_2.numpy.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":["RwPgHgFP4H1l","CKd7UT3F4H20","Z4ZVTQa84H3p","6sPYGkYt4H3z","0onFRxLN4H4B","NU_ZudOj4H4e","8qWuhJ7E4H4v","WFkuwnxB4H5z","CNZcnWVi4H6R","R4QtrGfi4H6h","mgBbOZ2H4H6k","dPDTaPLn4H6m","hdcCMF5B4H6u"]}},"cells":[{"cell_type":"markdown","metadata":{"id":"Yai-UYMg4H1a","colab_type":"text"},"source":["# Numpy - multidimensional data arrays"]},{"cell_type":"markdown","metadata":{"id":"zhlhZMYL4H1f","colab_type":"text"},"source":["Based on J.R. Johansson's notebook (jrjohansson at gmail.com)"]},{"cell_type":"markdown","metadata":{"id":"RwPgHgFP4H1l","colab_type":"text"},"source":["## Introduction"]},{"cell_type":"markdown","metadata":{"id":"Ci0r993d4H1p","colab_type":"text"},"source":["The `numpy` package (module) is used in almost all numerical computation using Python. It is a package that provide high-performance vector, matrix and higher-dimensional data structures for Python. It is implemented in C and Fortran so when calculations are vectorized (formulated with vectors and matrices), performance is very good. \n","\n","To use `numpy` you need to import the module, using for example:"]},{"cell_type":"code","metadata":{"id":"OvvZa0Z44H1q","colab_type":"code","colab":{}},"source":["from numpy import *"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UAaDil2O4H1u","colab_type":"text"},"source":["In the `numpy` package the terminology used for vectors, matrices and higher-dimensional data sets is *array*. 
\n","\n"]},{"cell_type":"markdown","metadata":{"id":"aLAmXzIW4H2o","colab_type":"text"},"source":["## Creating `numpy` arrays"]},{"cell_type":"markdown","metadata":{"id":"f0RWAhOb4H2p","colab_type":"text"},"source":["There are a number of ways to initialize new numpy arrays, for example from\n","\n","* a Python list or tuples\n","* using functions that are dedicated to generating numpy arrays, such as `arange`, `linspace`, etc.\n","* reading data from files"]},{"cell_type":"markdown","metadata":{"id":"CKd7UT3F4H20","colab_type":"text"},"source":["### From lists"]},{"cell_type":"markdown","metadata":{"id":"ivCyIJYi4H23","colab_type":"text"},"source":["For example, to create new vector and matrix arrays from Python lists we can use the `numpy.array` function."]},{"cell_type":"code","metadata":{"id":"cfeoOXX94H24","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"85bca577-5bfa-472e-db58-f9ad3a139b50","executionInfo":{"status":"ok","timestamp":1565877235016,"user_tz":-180,"elapsed":1031,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a vector: the argument to the array function is a Python list\n","v = array([1,2,3,4])\n","\n","v"],"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1, 2, 3, 4])"]},"metadata":{"tags":[]},"execution_count":5}]},{"cell_type":"code","metadata":{"id":"ks3LWXpq4H28","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":50},"outputId":"f47f7217-92f4-49c1-ee89-29b1aacdc600","executionInfo":{"status":"ok","timestamp":1565877235464,"user_tz":-180,"elapsed":1369,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# a matrix: the argument to the array function is 
a nested Python list\n","M = array([[1, 2], [3, 4]])\n","\n","M"],"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1, 2],\n"," [3, 4]])"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"uub0dXTZ4H3A","colab_type":"text"},"source":["The `v` and `M` objects are both of the type `ndarray` that the `numpy` module provides."]},{"cell_type":"code","metadata":{"id":"zkxf0n4a4H3A","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"3261d26f-670c-41d5-9e26-cb6f510a8254","executionInfo":{"status":"ok","timestamp":1565877236184,"user_tz":-180,"elapsed":540,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["type(v), type(M)"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(numpy.ndarray, numpy.ndarray)"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"markdown","metadata":{"id":"2YbxEAgV4H3C","colab_type":"text"},"source":["The difference between the `v` and `M` arrays is only their shapes. 
We can get information about the shape of an array by using the `ndarray.shape` property."]},{"cell_type":"code","metadata":{"id":"Fz7mVTEL4H3E","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"82cafd58-eb8f-4184-a7bf-768ff3a23889","executionInfo":{"status":"ok","timestamp":1565877289354,"user_tz":-180,"elapsed":1019,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["v.shape"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(4,)"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"id":"QeGbgpLb4H3H","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"3c6874ef-e927-403a-b5c2-3a094ca7156b","executionInfo":{"status":"ok","timestamp":1565877295892,"user_tz":-180,"elapsed":936,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.shape"],"execution_count":9,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(2, 2)"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"markdown","metadata":{"id":"BmW4mybb4H3K","colab_type":"text"},"source":["The number of elements in the array is available through the `ndarray.size` property:"]},{"cell_type":"code","metadata":{"id":"pYoUPA_A4H3N","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"73705664-b047-46ee-dacb-dd5a53da8a2d","executionInfo":{"status":"ok","timestamp":1565877302429,"user_tz":-180,"elapsed":1236,"user":{"displayName":"Shay 
Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.size"],"execution_count":10,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":10}]},{"cell_type":"markdown","metadata":{"id":"lbmKpqa64H3S","colab_type":"text"},"source":["Equivalently, we could use the functions `numpy.shape` and `numpy.size`"]},{"cell_type":"code","metadata":{"id":"va2u6UBS4H3T","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"c7626ae7-7abd-4857-eb6a-5f092321fa7e","executionInfo":{"status":"ok","timestamp":1565877305074,"user_tz":-180,"elapsed":819,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["shape(M)"],"execution_count":11,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(2, 2)"]},"metadata":{"tags":[]},"execution_count":11}]},{"cell_type":"code","metadata":{"id":"6Nt1VX094H3a","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e1d574bc-e5fd-42f7-8a12-3d857957cc11","executionInfo":{"status":"ok","timestamp":1565877306018,"user_tz":-180,"elapsed":525,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["size(M)"],"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":12}]},{"cell_type":"markdown","metadata":{"id":"8uURf4TK4H3c","colab_type":"text"},"source":["So far the `numpy.ndarray` looks awfully much like a Python list (or nested list). Why not simply use Python lists for computations instead of creating a new array type? 
\n","\n","There are several reasons:\n","\n","* Python lists are very general. They can contain any kind of object. They are dynamically typed. They do not support mathematical functions such as matrix and dot multiplications, etc. Implementing such functions for Python lists would not be very efficient because of the dynamic typing.\n","* Numpy arrays are **statically typed** and **homogeneous**. The type of the elements is determined when the array is created.\n","* Numpy arrays are memory efficient.\n","* Because of the static typing, fast implementation of mathematical functions such as multiplication and addition of `numpy` arrays can be implemented in a compiled language (C and Fortran are used).\n","\n","Using the `dtype` (data type) property of an `ndarray`, we can see what type the data of an array has:"]},{"cell_type":"code","metadata":{"id":"H1ny0dwq4H3d","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"e5948335-7540-4b99-a3e7-354f4de59da8","executionInfo":{"status":"ok","timestamp":1565877336197,"user_tz":-180,"elapsed":556,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M.dtype"],"execution_count":13,"outputs":[{"output_type":"execute_result","data":{"text/plain":["dtype('int64')"]},"metadata":{"tags":[]},"execution_count":13}]},{"cell_type":"markdown","metadata":{"id":"mOQjfIdD4H3g","colab_type":"text"},"source":["We get an error if we try to assign a value of the wrong type to an element in a numpy array:"]},{"cell_type":"code","metadata":{"id":"HQ0ySny04H3g","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":162},"outputId":"80cf5e53-3fcc-444c-f08d-fc35e8bf7b88","executionInfo":{"status":"error","timestamp":1565877346786,"user_tz":-180,"elapsed":561,"user":{"displayName":"Shay 
Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M[0,0] = \"hello\""],"execution_count":14,"outputs":[{"output_type":"error","ename":"ValueError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mM\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mValueError\u001b[0m: invalid literal for int() with base 10: 'hello'"]}]},{"cell_type":"markdown","metadata":{"id":"NxoE_aas4H3i","colab_type":"text"},"source":["If we want, we can explicitly define the type of the array data when we create it, using the `dtype` keyword argument: "]},{"cell_type":"code","metadata":{"id":"tcoqshv34H3j","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":50},"outputId":"53d4f229-57ac-493c-ff77-acc9f1b30cbc","executionInfo":{"status":"ok","timestamp":1565877353544,"user_tz":-180,"elapsed":800,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["M = array([[1, 2], [3, 4]], dtype=complex)\n","\n","M"],"execution_count":15,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[1.+0.j, 2.+0.j],\n"," [3.+0.j, 4.+0.j]])"]},"metadata":{"tags":[]},"execution_count":15}]},{"cell_type":"markdown","metadata":{"id":"71v9E4m84H3o","colab_type":"text"},"source":["Common data types that can be used with `dtype` are: `int`, `float`, `complex`, `bool`, `object`, 
etc.\n","\n","We can also explicitly define the bit size of the data types, for example: `int64`, `int16`, `float128`, `complex128`."]},{"cell_type":"markdown","metadata":{"id":"Z4ZVTQa84H3p","colab_type":"text"},"source":["### Using array-generating functions"]},{"cell_type":"markdown","metadata":{"id":"aYFas-Rn4H3p","colab_type":"text"},"source":["For larger arrays it is impractical to initialize the data manually, using explicit Python lists. Instead we can use one of the many functions in `numpy` that generate arrays of different forms. Some of the more common are:"]},{"cell_type":"code","metadata":{"id":"irdANGOL4H3q","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"549095e2-3930-456e-fd1d-f3a7ebc9e946","executionInfo":{"status":"ok","timestamp":1565877387626,"user_tz":-180,"elapsed":951,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# create a range\n","\n","x = arange(0, 10, 1) # arguments: start, stop, step\n","\n","x"],"execution_count":16,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"]},"metadata":{"tags":[]},"execution_count":16}]},{"cell_type":"code","metadata":{"id":"Sr-MPcyK4H3s","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":101},"outputId":"bdd6d5cc-d486-4923-e3cf-7373591297da","executionInfo":{"status":"ok","timestamp":1565877387628,"user_tz":-180,"elapsed":552,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["x = arange(-1, 1, 0.1)\n","\n","x"],"execution_count":17,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([-1.00000000e+00, -9.00000000e-01, -8.00000000e-01, -7.00000000e-01,\n"," -6.00000000e-01, 
-5.00000000e-01, -4.00000000e-01, -3.00000000e-01,\n"," -2.00000000e-01, -1.00000000e-01, -2.22044605e-16, 1.00000000e-01,\n"," 2.00000000e-01, 3.00000000e-01, 4.00000000e-01, 5.00000000e-01,\n"," 6.00000000e-01, 7.00000000e-01, 8.00000000e-01, 9.00000000e-01])"]},"metadata":{"tags":[]},"execution_count":17}]},{"cell_type":"code","metadata":{"scrolled":true,"id":"xXZsubZo4H3v","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":101},"outputId":"421b1c6d-e865-437b-f68a-a6a5e15f46e2","executionInfo":{"status":"ok","timestamp":1565877391062,"user_tz":-180,"elapsed":860,"user":{"displayName":"Shay Palachy","photoUrl":"https://lh5.googleusercontent.com/-_8J8Je8o3NQ/AAAAAAAAAAI/AAAAAAAAFOE/d2-wJaGxpaA/s64/photo.jpg","userId":"01723284119030760908"}}},"source":["# using linspace, both end points ARE included\n","linspace(0, 10, 25)"],"execution_count":18,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0. , 0.41666667, 0.83333333, 1.25 , 1.66666667,\n"," 2.08333333, 2.5 , 2.91666667, 3.33333333, 3.75 ,\n"," 4.16666667, 4.58333333, 5. , 5.41666667, 5.83333333,\n"," 6.25 , 6.66666667, 7.08333333, 7.5 , 7.91666667,\n"," 8.33333333, 8.75 , 9.16666667, 9.58333333, 10. 
])"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"6sPYGkYt4H3z","colab_type":"text"},"source":["#### mgrid"]},{"cell_type":"code","metadata":{"id":"LTtylXc24H34","colab_type":"code","colab":{}},"source":["x, y = mgrid[0:5, 0:5] # similar to meshgrid in MATLAB"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"CSsUMPEr4H37","colab_type":"code","colab":{},"outputId":"469c0bf4-a5f2-4a0c-8638-7ac71c62539c"},"source":["x"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[0, 0, 0, 0, 0],\n"," [1, 1, 1, 1, 1],\n"," [2, 2, 2, 2, 2],\n"," [3, 3, 3, 3, 3],\n"," [4, 4, 4, 4, 4]])"]},"metadata":{"tags":[]},"execution_count":19}]},{"cell_type":"code","metadata":{"id":"KWaBRngd4H3-","colab_type":"code","colab":{},"outputId":"71ba3bbd-ddff-4392-a4c0-d3abc5838b09"},"source":["y"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4],\n"," [0, 1, 2, 3, 4]])"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"0onFRxLN4H4B","colab_type":"text"},"source":["#### random data"]},{"cell_type":"code","metadata":{"id":"KbOBN6R34H4D","colab_type":"code","colab":{}},"source":["from numpy import random"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ek8AG7Br4H4Q","colab_type":"code","colab":{},"outputId":"9f97fd7e-e017-4fbd-8ef8-ee318ca7916d"},"source":["# uniform random numbers in [0,1]\n","random.rand(5,5)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.92932506, 0.19684255, 0.736434 , 0.18125714, 0.70905038],\n"," [ 0.18803573, 0.9312815 , 0.1284532 , 0.38138008, 0.36646481],\n"," [ 0.53700462, 0.02361381, 0.97760688, 0.73296701, 0.23042324],\n"," [ 0.9024635 , 0.20860922, 0.67729644, 0.68386687, 0.49385729],\n"," [ 0.95876515, 0.29341553, 
0.37520629, 0.29194432, 0.64102804]])"]},"metadata":{"tags":[]},"execution_count":22}]},{"cell_type":"code","metadata":{"id":"3Qb6y0hv4H4a","colab_type":"code","colab":{},"outputId":"f8bff034-1d25-42d5-e1ac-d5b1a9c95c8c"},"source":["# standard normal distributed random numbers\n","random.randn(5,5)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.117907 , -1.57016164, 0.78256246, 1.45386709, 0.54744436],\n"," [ 2.30356897, -0.28352021, -0.9087325 , 1.2285279 , -1.00760167],\n"," [ 0.72216801, 0.77507299, -0.37793178, -0.31852241, 0.84493629],\n"," [-0.10682252, 1.15930142, -0.47291444, -0.69496967, -0.58912034],\n"," [ 0.34513487, -0.92389516, -0.216978 , 0.42153272, 0.86650101]])"]},"metadata":{"tags":[]},"execution_count":23}]},{"cell_type":"markdown","metadata":{"id":"NU_ZudOj4H4e","colab_type":"text"},"source":["#### zeros and ones"]},{"cell_type":"code","metadata":{"id":"MC5Qqrih4H4g","colab_type":"code","colab":{},"outputId":"2da7218a-809e-4151-a584-1c10a7311e8d"},"source":["zeros((3,3))"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0., 0., 0.],\n"," [ 0., 0., 0.],\n"," [ 0., 0., 0.]])"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"code","metadata":{"id":"q8-TzdJ34H4r","colab_type":"code","colab":{},"outputId":"dd3cf0f6-d086-491b-8ac0-acd1ead5ce42"},"source":["ones((3,3))"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1., 1., 1.],\n"," [ 1., 1., 1.],\n"," [ 1., 1., 1.]])"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"markdown","metadata":{"id":"8qWuhJ7E4H4v","colab_type":"text"},"source":["## More properties of numpy arrays"]},{"cell_type":"code","metadata":{"id":"rcREVjoG4H4w","colab_type":"code","colab":{},"outputId":"7df199df-f2d1-4a27-9287-05b427e50cc7"},"source":["M.itemsize # bytes per 
element"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["8"]},"metadata":{"tags":[]},"execution_count":38}]},{"cell_type":"code","metadata":{"id":"o4LXNp644H40","colab_type":"code","colab":{},"outputId":"22b37e4c-d92b-4ed9-d901-5246282f41a7"},"source":["M.nbytes # number of bytes"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["72"]},"metadata":{"tags":[]},"execution_count":39}]},{"cell_type":"code","metadata":{"id":"WEPR97Rx4H49","colab_type":"code","colab":{},"outputId":"09bd572f-e6e6-45af-d7b1-2f519f0b5776"},"source":["M.ndim # number of dimensions"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["2"]},"metadata":{"tags":[]},"execution_count":40}]},{"cell_type":"markdown","metadata":{"id":"ZE5H_AbI4H4_","colab_type":"text"},"source":["## Manipulating arrays"]},{"cell_type":"markdown","metadata":{"id":"Gn5WEg754H5A","colab_type":"text"},"source":["### Indexing"]},{"cell_type":"markdown","metadata":{"id":"xZxt-9h44H5A","colab_type":"text"},"source":["We can index elements in an array using square brackets and indices:"]},{"cell_type":"code","metadata":{"id":"Ry0do-4K4H5B","colab_type":"code","colab":{},"outputId":"809671ed-3317-433f-dbbf-a82c584d7d0e"},"source":["# v is a vector, and has only one dimension, taking one index\n","v[0]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1"]},"metadata":{"tags":[]},"execution_count":41}]},{"cell_type":"code","metadata":{"id":"SeXl-uSv4H5D","colab_type":"code","colab":{},"outputId":"86699f4f-7c80-4e0b-d91f-320f16efecea"},"source":["# M is a matrix, or a 2 dimensional array, taking two indices \n","M[1,1]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.47913739949636192"]},"metadata":{"tags":[]},"execution_count":42}]},{"cell_type":"markdown","metadata":{"id":"UIS_PIPQ4H5F","colab_type":"text"},"source":["If we omit an index of a 
multidimensional array it returns the whole row (or, in general, a N-1 dimensional array) "]},{"cell_type":"code","metadata":{"id":"wmTMc0ia4H5G","colab_type":"code","colab":{},"outputId":"09d6e405-e1d6-441e-ba9b-52218318032b"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 0.77872576, 0.40043577, 0.66254019],\n"," [ 0.60410063, 0.4791374 , 0.8237106 ],\n"," [ 0.96856318, 0.15459644, 0.96082399]])"]},"metadata":{"tags":[]},"execution_count":43}]},{"cell_type":"code","metadata":{"id":"hrO_oej84H5N","colab_type":"code","colab":{},"outputId":"eba958b0-6259-481b-9bf7-6f06c9fd30db"},"source":["M[1]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.60410063, 0.4791374 , 0.8237106 ])"]},"metadata":{"tags":[]},"execution_count":44}]},{"cell_type":"markdown","metadata":{"id":"hyj1kRXP4H5P","colab_type":"text"},"source":["The same thing can be achieved with using `:` instead of an index: "]},{"cell_type":"code","metadata":{"id":"_RhaEcYM4H5Q","colab_type":"code","colab":{},"outputId":"79daddb0-ca06-40c4-c734-602f7971e5e6"},"source":["M[1,:] # row 1"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.60410063, 0.4791374 , 0.8237106 ])"]},"metadata":{"tags":[]},"execution_count":45}]},{"cell_type":"code","metadata":{"id":"H3DX8HsU4H5T","colab_type":"code","colab":{},"outputId":"bb107fd0-f7ee-4748-fee6-7c179a2d5151"},"source":["M[:,1] # column 1"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0.40043577, 0.4791374 , 0.15459644])"]},"metadata":{"tags":[]},"execution_count":46}]},{"cell_type":"markdown","metadata":{"id":"YVOTbsDS4H5W","colab_type":"text"},"source":["We can assign new values to elements in an array using indexing:"]},{"cell_type":"code","metadata":{"id":"AiHZCF_q4H5X","colab_type":"code","colab":{}},"source":["M[0,0] = 
1"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"gdRL6hoq4H5f","colab_type":"code","colab":{},"outputId":"f3534e44-0686-460e-dfdb-82fd009314c0"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1. , 0.40043577, 0.66254019],\n"," [ 0.60410063, 0.4791374 , 0.8237106 ],\n"," [ 0.96856318, 0.15459644, 0.96082399]])"]},"metadata":{"tags":[]},"execution_count":48}]},{"cell_type":"code","metadata":{"id":"-CkU0him4H5l","colab_type":"code","colab":{}},"source":["# also works for rows and columns\n","M[1,:] = 0\n","M[:,2] = -1"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBRT1tyZ4H5x","colab_type":"code","colab":{},"outputId":"f47a6691-2251-4a73-f3ca-b2fbbf08022e"},"source":["M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1. , 0.40043577, -1. ],\n"," [ 0. , 0. , -1. ],\n"," [ 0.96856318, 0.15459644, -1. ]])"]},"metadata":{"tags":[]},"execution_count":50}]},{"cell_type":"markdown","metadata":{"id":"WFkuwnxB4H5z","colab_type":"text"},"source":["### Index slicing"]},{"cell_type":"markdown","metadata":{"id":"dFRAptaD4H50","colab_type":"text"},"source":["Index slicing is the technical name for the syntax `M[lower:upper:step]` to extract part of an array:"]},{"cell_type":"code","metadata":{"id":"9wz8jo3Z4H51","colab_type":"code","colab":{},"outputId":"02e30d19-c5a6-4059-b660-536c28a72141"},"source":["A = array([1,2,3,4,5])\n","A"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([1, 2, 3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":51}]},{"cell_type":"code","metadata":{"id":"nn08qKFJ4H54","colab_type":"code","colab":{},"outputId":"3253b970-9460-4623-b23c-ee3779336e36"},"source":["A[1:3]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([2, 
3])"]},"metadata":{"tags":[]},"execution_count":52}]},{"cell_type":"markdown","metadata":{"id":"9HANOCRZ4H5-","colab_type":"text"},"source":["Array slices are *mutable*: if they are assigned a new value the original array from which the slice was extracted is modified:"]},{"cell_type":"code","metadata":{"id":"OgYbFcS64H5_","colab_type":"code","colab":{},"outputId":"bb43f189-fd8c-432a-9810-810de1dc66df"},"source":["A[1:3] = [-2,-3]\n","\n","A"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 1, -2, -3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":53}]},{"cell_type":"markdown","metadata":{"id":"jNczBl2X4H6E","colab_type":"text"},"source":["Negative indices count from the end of the array (positive indices from the beginning):"]},{"cell_type":"code","metadata":{"id":"Mgur3nS94H6F","colab_type":"code","colab":{}},"source":["A = array([1,2,3,4,5])"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ovmQzTPT4H6I","colab_type":"code","colab":{},"outputId":"61419e48-b0d2-4ebe-8849-61b1efca5b38"},"source":["A[-1] # the last element in the array"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5"]},"metadata":{"tags":[]},"execution_count":59}]},{"cell_type":"code","metadata":{"id":"ZVHNR1Up4H6O","colab_type":"code","colab":{},"outputId":"bb0ef677-294c-4760-9bff-43567b1f09d8"},"source":["A[-3:] # the last three elements"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([3, 4, 5])"]},"metadata":{"tags":[]},"execution_count":60}]},{"cell_type":"markdown","metadata":{"id":"CNZcnWVi4H6R","colab_type":"text"},"source":["### Fancy indexing"]},{"cell_type":"markdown","metadata":{"id":"xDsu0l564H6S","colab_type":"text"},"source":["Fancy indexing is the name for when an array or list is used in place of an index: 
"]},{"cell_type":"code","metadata":{"id":"GgHGCVGp4H6S","colab_type":"code","colab":{},"outputId":"531f2545-88a5-4078-b4e3-fcfc72f1f2a9"},"source":["row_indices = [1, 2, 3]\n","A[row_indices]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[10, 11, 12, 13, 14],\n"," [20, 21, 22, 23, 24],\n"," [30, 31, 32, 33, 34]])"]},"metadata":{"tags":[]},"execution_count":64}]},{"cell_type":"code","metadata":{"id":"0Ol2NX0j4H6W","colab_type":"code","colab":{},"outputId":"f484a32e-826e-4c7b-982c-59dd7700e14f"},"source":["col_indices = [1, 2, -1] # remember, index -1 means the last element\n","A[row_indices, col_indices]"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([11, 22, 34])"]},"metadata":{"tags":[]},"execution_count":65}]},{"cell_type":"markdown","metadata":{"id":"sYOLqh5s4H6Y","colab_type":"text"},"source":["### Linear and Matrix algebra"]},{"cell_type":"markdown","metadata":{"id":"cE8dK2DK4H6Z","colab_type":"text"},"source":["Numpy's real strength is in optimized linear and matrix algebraic operations on vectors and matrices, but that's less relevant here."]},{"cell_type":"markdown","metadata":{"id":"0UsCmaIm4H6Z","colab_type":"text"},"source":["### Data processing"]},{"cell_type":"markdown","metadata":{"id":"NXPyElfW4H6a","colab_type":"text"},"source":["Often it is useful to store datasets in Numpy arrays. Numpy provides a number of functions to calculate statistics of datasets in arrays. 
\n","\n","For example, let's calculate some properties from the Stockholm temperature dataset used above."]},{"cell_type":"code","metadata":{"id":"k4KNJQvy4H6b","colab_type":"code","colab":{},"outputId":"7b6235eb-1446-44bc-da3a-678fe4780f24"},"source":["# reminder, the temperature dataset is stored in the data variable:\n","data = random.randint(10,size=(8,8))\n","shape(data)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(8, 8)"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"R4QtrGfi4H6h","colab_type":"text"},"source":["#### mean"]},{"cell_type":"code","metadata":{"id":"px1LjwPK4H6h","colab_type":"code","colab":{},"outputId":"2ab080ba-418a-49f5-c780-a5649e405c72"},"source":["mean(data[:,3])"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["5.25"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"mgBbOZ2H4H6k","colab_type":"text"},"source":["#### standard deviations and variance"]},{"cell_type":"code","metadata":{"id":"KNsHBTsm4H6l","colab_type":"code","colab":{},"outputId":"1cedb05f-e5b4-405f-f902-a39370b87bae"},"source":["std(data[:,3]), var(data[:,3])"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(1.6393596310755001, 2.6875)"]},"metadata":{"tags":[]},"execution_count":21}]},{"cell_type":"markdown","metadata":{"id":"dPDTaPLn4H6m","colab_type":"text"},"source":["#### min and 
max"]},{"cell_type":"code","metadata":{"id":"CoF7hVsZ4H6o","colab_type":"code","colab":{},"outputId":"c74bab1a-18c7-4d91-ccf4-6a8db61c653d"},"source":["data[:,3].min()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["4"]},"metadata":{"tags":[]},"execution_count":24}]},{"cell_type":"code","metadata":{"id":"_UMY69jr4H6q","colab_type":"code","colab":{},"outputId":"720b0a96-fa4d-4cd3-84ea-284c91cfb802"},"source":["data[:,3].max()"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["9"]},"metadata":{"tags":[]},"execution_count":25}]},{"cell_type":"markdown","metadata":{"id":"hdcCMF5B4H6u","colab_type":"text"},"source":["#### sum, prod, and their cumulative versions"]},{"cell_type":"code","metadata":{"id":"x5sBrhVU4H6x","colab_type":"code","colab":{},"outputId":"1eec9c5b-0005-40e6-f8bd-b2e8e8269e15"},"source":["d = arange(0, 10)\n","d"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"]},"metadata":{"tags":[]},"execution_count":26}]},{"cell_type":"code","metadata":{"id":"7SAdiHLi4H60","colab_type":"code","colab":{},"outputId":"ed91617c-4814-4a38-825f-75ebaff86666"},"source":["# sum up all elements\n","sum(d)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["45"]},"metadata":{"tags":[]},"execution_count":27}]},{"cell_type":"code","metadata":{"id":"i-3OP3WX4H63","colab_type":"code","colab":{},"outputId":"b52618b4-7f3d-4d70-aa43-19b47d1b6261"},"source":["# product of all elements\n","prod(d+1)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["3628800"]},"metadata":{"tags":[]},"execution_count":28}]},{"cell_type":"code","metadata":{"id":"gfm7Wv_E4H64","colab_type":"code","colab":{},"outputId":"794b747d-ba5f-49e2-f14c-22dbd3e6bde5"},"source":["# cumulative 
sum\n","cumsum(d)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45])"]},"metadata":{"tags":[]},"execution_count":29}]},{"cell_type":"code","metadata":{"id":"VTAeSiH54H66","colab_type":"code","colab":{},"outputId":"190d8a7d-7eba-48aa-e371-54fbf803c194"},"source":["# cumulative product\n","cumprod(d+1)"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([ 1, 2, 6, 24, 120, 720, 5040,\n"," 40320, 362880, 3628800])"]},"metadata":{"tags":[]},"execution_count":30}]},{"cell_type":"markdown","metadata":{"id":"33g0Fjtk4H7D","colab_type":"text"},"source":["## Iterating over array elements"]},{"cell_type":"markdown","metadata":{"id":"kOVcn0eu4H7D","colab_type":"text"},"source":["Generally, we want to avoid iterating over the elements of arrays whenever we can (at all costs). The reason is that in an interpreted language like Python (or MATLAB), iterations are really slow compared to vectorized operations. \n","\n","However, sometimes iterations are unavoidable. 
For such cases, the Python `for` loop is the most convenient way to iterate over an array:"]},{"cell_type":"code","metadata":{"id":"5Bi7I-Lz4H7E","colab_type":"code","colab":{},"outputId":"b65ac7c8-e662-4bfe-e9da-9f087c09f09f"},"source":["v = array([1,2,3,4])\n","\n","for element in v:\n"," print(element)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["1\n","2\n","3\n","4\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"q1Mp_Qvq4H7G","colab_type":"code","colab":{},"outputId":"d277af64-0bdd-4d09-f304-dd649607bd4f"},"source":["M = array([[1,2], [3,4]])\n","\n","for row in M:\n"," print(\"row\", row)\n"," \n"," for element in row:\n"," print(element)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["row [1 2]\n","1\n","2\n","row [3 4]\n","3\n","4\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"sOq8jYMN4H7I","colab_type":"text"},"source":["When we need to iterate over each element of an array and modify its elements, it is convenient to use the `enumerate` function to obtain both the element and its index in the `for` loop: "]},{"cell_type":"code","metadata":{"id":"93m_6Ev34H7J","colab_type":"code","colab":{},"outputId":"86c347ae-b2ac-49c6-8fb3-8b1e115e4d91"},"source":["for row_idx, row in enumerate(M):\n"," print(\"row_idx\", row_idx, \"row\", row)\n"," \n"," for col_idx, element in enumerate(row):\n"," print(\"col_idx\", col_idx, \"element\", element)\n"," \n"," # update the matrix M: square each element\n"," M[row_idx, col_idx] = element ** 2"],"execution_count":0,"outputs":[{"output_type":"stream","text":["row_idx 0 row [1 2]\n","col_idx 0 element 1\n","col_idx 1 element 2\n","row_idx 1 row [3 4]\n","col_idx 0 element 3\n","col_idx 1 element 4\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"-UcjV-954H7K","colab_type":"code","colab":{},"outputId":"ca48ab3c-cf27-47ad-d6b5-83009cd1e724"},"source":["# each element in M is now 
squared\n","M"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array([[ 1, 4],\n"," [ 9, 16]])"]},"metadata":{"tags":[]},"execution_count":35}]},{"cell_type":"code","metadata":{"id":"kRTCtoHX4H7M","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]}
--------------------------------------------------------------------------------
/part_6.modeling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Modelling"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Loading the processed dataset"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# !pip install -U -q datalearn19intro\n",
24 | "import numpy as np\n",
25 | "import pandas as pd\n",
26 | "import seaborn as sns\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "from datalearn19intro import get_processed_intro_dataset\n",
29 | "%matplotlib inline"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "pd.options.display.float_format = '{:,.2f}'.format\n",
39 | "pd.set_option('display.max_columns', 150)\n",
40 | "pd.set_option('display.max_rows', 200)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "df = get_processed_intro_dataset()"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 5,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "(1001, 22)"
61 | ]
62 | },
63 | "execution_count": 5,
64 | "metadata": {},
65 | "output_type": "execute_result"
66 | }
67 | ],
68 | "source": [
69 | "df.shape"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 6,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "df = df.set_index('account_id')"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 7,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/html": [
89 | "